livekit-plugins-google 0.10.5__py3-none-any.whl → 0.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,14 +10,15 @@ from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _is_optional_type
 
 from google.genai import types
-
-JSON_SCHEMA_TYPE_MAP: dict[type, types.Type] = {
-    str: "STRING",
-    int: "INTEGER",
-    float: "NUMBER",
-    bool: "BOOLEAN",
-    dict: "OBJECT",
-    list: "ARRAY",
+from google.genai.types import Type as GenaiType
+
+JSON_SCHEMA_TYPE_MAP: dict[type, GenaiType] = {
+    str: GenaiType.STRING,
+    int: GenaiType.INTEGER,
+    float: GenaiType.NUMBER,
+    bool: GenaiType.BOOLEAN,
+    dict: GenaiType.OBJECT,
+    list: GenaiType.ARRAY,
 }
 
 __all__ = ["_build_gemini_ctx", "_build_tools"]
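
Note: google-genai 1.x expects the `Type` enum rather than bare strings when building schemas. A minimal sketch of what the new map produces for a single hypothetical parameter (the names are illustrative, not from the plugin):

```python
from google.genai import types
from google.genai.types import Type as GenaiType

# Hypothetical "city" parameter for an AI-callable function
city = types.Schema(type=GenaiType.STRING)    # 0.10.x passed type="STRING"
parameters = types.Schema(
    type=GenaiType.OBJECT,                    # 0.10.x passed type="OBJECT"
    properties={"city": city},
)
parameters.required = ["city"]
```
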
@@ -38,7 +39,7 @@ def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
         item_type = get_args(py_type)[0]
         if item_type not in JSON_SCHEMA_TYPE_MAP:
             raise ValueError(f"Unsupported type: {item_type}")
-        prop.type = "ARRAY"
+        prop.type = GenaiType.ARRAY
         prop.items = types.Schema(type=JSON_SCHEMA_TYPE_MAP[item_type])
 
         if arg_info.choices:
@@ -62,7 +63,7 @@ def _build_parameters(arguments: Dict[str, Any]) -> types.Schema | None:
             required.append(arg_name)
 
     if properties:
-        parameters = types.Schema(type="OBJECT", properties=properties)
+        parameters = types.Schema(type=GenaiType.OBJECT, properties=properties)
         if required:
             parameters.required = required
 
@@ -119,7 +120,6 @@ def _build_gemini_ctx(
             parts.append(
                 types.Part(
                     function_call=types.FunctionCall(
-                        id=fnc.tool_call_id,
                         name=fnc.function_info.name,
                         args=fnc.arguments,
                     )
@@ -132,7 +132,6 @@ def _build_gemini_ctx(
             parts.append(
                 types.Part(
                     function_response=types.FunctionResponse(
-                        id=msg.tool_call_id,
                         name=msg.name,
                         response=msg.content,
                     )
@@ -142,7 +141,6 @@ def _build_gemini_ctx(
             parts.append(
                 types.Part(
                     function_response=types.FunctionResponse(
-                        id=msg.tool_call_id,
                         name=msg.name,
                         response={"result": msg.content},
                     )
@@ -193,8 +191,7 @@ def _build_gemini_image_part(image: llm.ChatImage, cache_key: Any) -> types.Part
                 height=image.inference_height,
                 strategy="scale_aspect_fit",
             )
-        encoded_data = utils.images.encode(image.image, opts)
-        image._cache[cache_key] = base64.b64encode(encoded_data).decode("utf-8")
+        image._cache[cache_key] = utils.images.encode(image.image, opts)
 
     return types.Part.from_bytes(
         data=image._cache[cache_key], mime_type="image/jpeg"
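
Note: the per-image cache now stores the raw encoded JPEG bytes instead of a base64 string, since `types.Part.from_bytes` takes bytes directly. A minimal sketch of the new path, reusing the `image`, `opts`, and `cache_key` names from the function above:

```python
from google.genai import types
from livekit.agents import utils

# Sketch only: encode once, cache the raw bytes, build the Part from them
encoded = utils.images.encode(image.image, opts)   # JPEG bytes, no base64 step
image._cache[cache_key] = encoded
part = types.Part.from_bytes(data=encoded, mime_type="image/jpeg")
```
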
@@ -9,14 +9,15 @@ from typing import AsyncIterable, Literal
 from livekit import rtc
 from livekit.agents import llm, utils
 from livekit.agents.llm.function_context import _create_ai_function_info
+from livekit.agents.utils import images
 
 from google import genai
-from google.genai._api_client import HttpOptions
 from google.genai.types import (
     Blob,
     Content,
     FunctionResponse,
     GenerationConfig,
+    HttpOptions,
     LiveClientContent,
     LiveClientRealtimeInput,
     LiveClientToolResponse,
@@ -107,7 +108,7 @@ class RealtimeModel:
         model: LiveAPIModels | str = "gemini-2.0-flash-exp",
         api_key: str | None = None,
         voice: Voice | str = "Puck",
-        modalities: list[Modality] = ["AUDIO"],
+        modalities: list[Modality] = [Modality.AUDIO],
         enable_user_audio_transcription: bool = True,
         enable_agent_audio_transcription: bool = True,
         vertexai: bool = False,
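
Note: the default output modality is now the `Modality` enum member instead of the string `"AUDIO"`. A hedged construction sketch showing only the parameters visible in this hunk (the import style for the plugin is assumed):

```python
from google.genai.types import Modality
from livekit.plugins import google

model = google.beta.realtime.RealtimeModel(
    model="gemini-2.0-flash-exp",
    voice="Puck",
    modalities=[Modality.AUDIO],   # was ["AUDIO"] in 0.10.x
)
```
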
@@ -258,6 +259,8 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._fnc_ctx = fnc_ctx
         self._fnc_tasks = utils.aio.TaskSet()
         self._is_interrupted = False
+        self._playout_complete = asyncio.Event()
+        self._playout_complete.set()
 
         tools = []
         if self._fnc_ctx is not None:
@@ -317,6 +320,10 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
         self._send_ch.close()
         await self._main_atask
 
+    @property
+    def playout_complete(self) -> asyncio.Event | None:
+        return self._playout_complete
+
     @property
     def fnc_ctx(self) -> llm.FunctionContext | None:
         return self._fnc_ctx
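
Note: the session now exposes `playout_complete`, an `asyncio.Event` created in the set state so nothing blocks before the first response. Assuming the agent clears it while audio is playing out and sets it again afterwards (not shown in this diff), a caller could wait on it like this:

```python
# Hypothetical consumer; `session` is a GeminiRealtimeSession
if session.playout_complete is not None:
    await session.playout_complete.wait()  # returns immediately unless playout is in progress
```
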
@@ -325,14 +332,53 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
     def fnc_ctx(self, value: llm.FunctionContext | None) -> None:
         self._fnc_ctx = value
 
-    def _push_audio(self, frame: rtc.AudioFrame) -> None:
-        if self._opts.enable_user_audio_transcription:
-            self._transcriber._push_audio(frame)
+    def _push_media_chunk(self, data: bytes, mime_type: str) -> None:
         realtime_input = LiveClientRealtimeInput(
-            media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
+            media_chunks=[Blob(data=data, mime_type=mime_type)],
         )
         self._queue_msg(realtime_input)
 
+    DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
+        format="JPEG",
+        quality=75,
+        resize_options=images.ResizeOptions(
+            width=1024, height=1024, strategy="scale_aspect_fit"
+        ),
+    )
+
+    def push_video(
+        self,
+        frame: rtc.VideoFrame,
+        encode_options: images.EncodeOptions = DEFAULT_ENCODE_OPTIONS,
+    ) -> None:
+        """Push a video frame to the Gemini Multimodal Live session.
+
+        Args:
+            frame (rtc.VideoFrame): The video frame to push.
+            encode_options (images.EncodeOptions, optional): The encode options for the video frame. Defaults to 1024x1024 JPEG.
+
+        Notes:
+        - This will be sent immediately so you should use a sampling frame rate that makes sense for your application and Gemini's constraints. 1 FPS is a good starting point.
+        """
+        encoded_data = images.encode(
+            frame,
+            encode_options,
+        )
+        mime_type = (
+            "image/jpeg"
+            if encode_options.format == "JPEG"
+            else "image/png"
+            if encode_options.format == "PNG"
+            else "image/jpeg"
+        )
+        self._push_media_chunk(encoded_data, mime_type)
+
+    def _push_audio(self, frame: rtc.AudioFrame) -> None:
+        if self._opts.enable_user_audio_transcription:
+            self._transcriber._push_audio(frame)
+
+        self._push_media_chunk(frame.data.tobytes(), "audio/pcm")
+
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
 
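
Note: `push_video` encodes and sends each frame immediately, so the caller controls the sampling rate. A usage sketch with custom encode options (all option fields below appear in the `DEFAULT_ENCODE_OPTIONS` above; `session` and `frame` are assumed to exist):

```python
from livekit.agents.utils import images

opts = images.EncodeOptions(
    format="JPEG",
    quality=60,
    resize_options=images.ResizeOptions(
        width=512, height=512, strategy="scale_aspect_fit"
    ),
)
session.push_video(frame, encode_options=opts)  # frame is an rtc.VideoFrame
```
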
@@ -479,12 +525,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
                 logger.warning(
                     "function call cancelled",
                     extra={
-                        "function_call_ids": response.tool_call_cancellation.function_call_ids,
+                        "function_call_ids": response.tool_call_cancellation.ids,
                     },
                 )
                 self.emit(
                     "function_calls_cancelled",
-                    response.tool_call_cancellation.function_call_ids,
+                    response.tool_call_cancellation.ids,
                 )
 
         async with self._client.aio.live.connect(
@@ -55,7 +55,7 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
             parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
         )
         self._config = types.LiveConnectConfig(
-            response_modalities=["TEXT"],
+            response_modalities=[types.Modality.TEXT],
             system_instruction=system_instructions,
             generation_config=types.GenerationConfig(temperature=0.0),
         )
@@ -240,7 +240,7 @@ class LLMStream(llm.LLMStream):
             # specific function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="ANY",
+                    mode=types.FunctionCallingConfigMode.ANY,
                     allowed_function_names=[self._tool_choice.name],
                 )
             )
@@ -248,7 +248,7 @@ class LLMStream(llm.LLMStream):
             # model must call any function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="ANY",
+                    mode=types.FunctionCallingConfigMode.ANY,
                     allowed_function_names=[
                         fnc.name
                         for fnc in self._fnc_ctx.ai_functions.values()
@@ -259,14 +259,14 @@ class LLMStream(llm.LLMStream):
             # model can call any function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="AUTO"
+                    mode=types.FunctionCallingConfigMode.AUTO
                 )
             )
         elif self._tool_choice == "none":
             # model cannot call any function
             tool_config = types.ToolConfig(
                 function_calling_config=types.FunctionCallingConfig(
-                    mode="NONE",
+                    mode=types.FunctionCallingConfigMode.NONE,
                 )
             )
         opts["tool_config"] = tool_config
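
Note: string modes are replaced by the `FunctionCallingConfigMode` enum throughout. A hedged sketch of the equivalent config built directly against google-genai (the function name is illustrative):

```python
from google.genai import types

tool_config = types.ToolConfig(
    function_calling_config=types.FunctionCallingConfig(
        mode=types.FunctionCallingConfigMode.ANY,   # was mode="ANY" in 0.10.x
        allowed_function_names=["get_weather"],     # hypothetical function name
    )
)
```
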
@@ -282,11 +282,12 @@ class LLMStream(llm.LLMStream):
             system_instruction=system_instruction,
             **opts,
         )
-        async for response in self._client.aio.models.generate_content_stream(
+        stream = await self._client.aio.models.generate_content_stream(
             model=self._model,
             contents=cast(types.ContentListUnion, turns),
             config=config,
-        ):
+        )
+        async for response in stream:  # type: ignore
             if response.prompt_feedback:
                 raise APIStatusError(
                     response.prompt_feedback.json(),
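
Note: in google-genai 1.x, `aio.models.generate_content_stream` is a coroutine that resolves to an async iterator, so it must be awaited before iterating. A hedged standalone sketch of the same pattern (client construction and contents are illustrative):

```python
from google import genai

async def stream_reply(turns, config):
    client = genai.Client()  # assumes the API key is picked up from the environment
    stream = await client.aio.models.generate_content_stream(
        model="gemini-2.0-flash-001",
        contents=turns,
        config=config,
    )
    async for response in stream:
        yield response
```
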
@@ -10,6 +10,8 @@ SpeechModels = Literal[
     "medical_conversation",
     "chirp",
     "chirp_2",
+    "latest_long",
+    "latest_short",
 ]
 
 SpeechLanguages = Literal[
@@ -92,8 +94,6 @@ SpeechLanguages = Literal[
 
 Gender = Literal["male", "female", "neutral"]
 
-AudioEncoding = Literal["wav", "mp3", "ogg", "mulaw", "alaw", "linear16"]
-
 ChatModels = Literal[
     "gemini-2.0-flash-001",
     "gemini-2.0-flash-lite-preview-02-05",
@@ -19,7 +19,7 @@ import dataclasses
 import time
 import weakref
 from dataclasses import dataclass
-from typing import List, Union
+from typing import Callable, List, Union
 
 from livekit import rtc
 from livekit.agents import (
@@ -61,7 +61,7 @@ class STTOptions:
     interim_results: bool
     punctuate: bool
    spoken_punctuation: bool
-    model: SpeechModels
+    model: SpeechModels | str
     sample_rate: int
     keywords: List[tuple[str, float]] | None
 
@@ -93,7 +93,7 @@ class STT(stt.STT):
         interim_results: bool = True,
         punctuate: bool = True,
         spoken_punctuation: bool = False,
-        model: SpeechModels = "chirp_2",
+        model: SpeechModels | str = "latest_long",
         location: str = "us-central1",
         sample_rate: int = 16000,
         credentials_info: dict | None = None,
@@ -106,12 +106,24 @@
         Credentials must be provided, either by using the ``credentials_info`` dict, or reading
         from the file specified in ``credentials_file`` or via Application Default Credentials as
         described in https://cloud.google.com/docs/authentication/application-default-credentials
+
+        args:
+            languages(LanguageCode): list of language codes to recognize (default: "en-US")
+            detect_language(bool): whether to detect the language of the audio (default: True)
+            interim_results(bool): whether to return interim results (default: True)
+            punctuate(bool): whether to punctuate the audio (default: True)
+            spoken_punctuation(bool): whether to use spoken punctuation (default: False)
+            model(SpeechModels): the model to use for recognition default: "latest_long"
+            location(str): the location to use for recognition default: "us-central1"
+            sample_rate(int): the sample rate of the audio default: 16000
+            credentials_info(dict): the credentials info to use for recognition (default: None)
+            credentials_file(str): the credentials file to use for recognition (default: None)
+            keywords(List[tuple[str, float]]): list of keywords to recognize (default: None)
         """
         super().__init__(
             capabilities=stt.STTCapabilities(streaming=True, interim_results=True)
         )
 
-        self._client: SpeechAsyncClient | None = None
         self._location = location
         self._credentials_info = credentials_info
         self._credentials_file = credentials_file
@@ -140,40 +152,44 @@ class STT(stt.STT):
             keywords=keywords,
         )
         self._streams = weakref.WeakSet[SpeechStream]()
+        self._pool = utils.ConnectionPool[SpeechAsyncClient](
+            max_session_duration=_max_session_duration,
+            connect_cb=self._create_client,
+        )
 
-    def _ensure_client(self) -> SpeechAsyncClient:
+    async def _create_client(self) -> SpeechAsyncClient:
         # Add support for passing a specific location that matches recognizer
         # see: https://cloud.google.com/speech-to-text/v2/docs/speech-to-text-supported-languages
         client_options = None
+        client: SpeechAsyncClient | None = None
         if self._location != "global":
             client_options = ClientOptions(
                 api_endpoint=f"{self._location}-speech.googleapis.com"
             )
         if self._credentials_info:
-            self._client = SpeechAsyncClient.from_service_account_info(
+            client = SpeechAsyncClient.from_service_account_info(
                 self._credentials_info,
                 client_options=client_options,
             )
         elif self._credentials_file:
-            self._client = SpeechAsyncClient.from_service_account_file(
+            client = SpeechAsyncClient.from_service_account_file(
                 self._credentials_file,
                 client_options=client_options,
             )
         else:
-            self._client = SpeechAsyncClient(
+            client = SpeechAsyncClient(
                 client_options=client_options,
             )
-        assert self._client is not None
-        return self._client
+        assert client is not None
+        return client
 
-    @property
-    def _recognizer(self) -> str:
+    def _get_recognizer(self, client: SpeechAsyncClient) -> str:
         # TODO(theomonnom): should we use recognizers?
         # recognizers may improve latency https://cloud.google.com/speech-to-text/v2/docs/recognizers#understand_recognizers
 
         # TODO(theomonnom): find a better way to access the project_id
         try:
-            project_id = self._ensure_client().transport._credentials.project_id  # type: ignore
+            project_id = client.transport._credentials.project_id  # type: ignore
         except AttributeError:
             from google.auth import default as ga_default
 
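
Note: the STT class now manages `SpeechAsyncClient` instances through `utils.ConnectionPool` rather than caching a single client, so sessions can be recycled after a maximum duration. A hedged sketch of the pool lifecycle as used in this diff (the `SpeechAsyncClient` import path and the duration value are assumptions):

```python
from livekit.agents import utils
from google.cloud.speech_v2 import SpeechAsyncClient  # import path assumed

async def pool_example(stt) -> None:
    # connect_cb is an async factory; the pool calls it whenever a fresh client is needed
    pool = utils.ConnectionPool[SpeechAsyncClient](
        max_session_duration=300,        # illustrative value, not the plugin's constant
        connect_cb=stt._create_client,
    )
    async with pool.connection() as client:   # borrow a client for one request or stream
        print(type(client).__name__)
    pool.invalidate()                         # drop cached clients (used after a location change)
    await pool.aclose()                       # called from STT.aclose()
```
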
@@ -224,16 +240,17 @@ class STT(stt.STT):
         )
 
         try:
-            raw = await self._ensure_client().recognize(
-                cloud_speech.RecognizeRequest(
-                    recognizer=self._recognizer,
-                    config=config,
-                    content=frame.data.tobytes(),
-                ),
-                timeout=conn_options.timeout,
-            )
+            async with self._pool.connection() as client:
+                raw = await client.recognize(
+                    cloud_speech.RecognizeRequest(
+                        recognizer=self._get_recognizer(client),
+                        config=config,
+                        content=frame.data.tobytes(),
+                    ),
+                    timeout=conn_options.timeout,
+                )
 
-            return _recognize_response_to_speech_event(raw)
+                return _recognize_response_to_speech_event(raw)
         except DeadlineExceeded:
             raise APITimeoutError()
         except GoogleAPICallError as e:
@@ -253,8 +270,8 @@ class STT(stt.STT):
         config = self._sanitize_options(language=language)
         stream = SpeechStream(
             stt=self,
-            client=self._ensure_client(),
-            recognizer=self._recognizer,
+            pool=self._pool,
+            recognizer_cb=self._get_recognizer,
             config=config,
             conn_options=conn_options,
         )
@@ -287,13 +304,10 @@ class STT(stt.STT):
             self._config.spoken_punctuation = spoken_punctuation
         if model is not None:
             self._config.model = model
-        client = None
-        recognizer = None
         if location is not None:
             self._location = location
             # if location is changed, fetch a new client and recognizer as per the new location
-            client = self._ensure_client()
-            recognizer = self._recognizer
+            self._pool.invalidate()
         if keywords is not None:
             self._config.keywords = keywords
 
@@ -306,10 +320,12 @@ class STT(stt.STT):
             spoken_punctuation=spoken_punctuation,
             model=model,
             keywords=keywords,
-            client=client,
-            recognizer=recognizer,
         )
 
+    async def aclose(self) -> None:
+        await self._pool.aclose()
+        await super().aclose()
+
 
 class SpeechStream(stt.SpeechStream):
     def __init__(
@@ -317,16 +333,16 @@ class SpeechStream(stt.SpeechStream):
         *,
         stt: STT,
         conn_options: APIConnectOptions,
-        client: SpeechAsyncClient,
-        recognizer: str,
+        pool: utils.ConnectionPool[SpeechAsyncClient],
+        recognizer_cb: Callable[[SpeechAsyncClient], str],
         config: STTOptions,
     ) -> None:
         super().__init__(
             stt=stt, conn_options=conn_options, sample_rate=config.sample_rate
         )
 
-        self._client = client
-        self._recognizer = recognizer
+        self._pool = pool
+        self._recognizer_cb = recognizer_cb
         self._config = config
         self._reconnect_event = asyncio.Event()
         self._session_connected_at: float = 0
@@ -341,8 +357,6 @@ class SpeechStream(stt.SpeechStream):
         spoken_punctuation: bool | None = None,
         model: SpeechModels | None = None,
         keywords: List[tuple[str, float]] | None = None,
-        client: SpeechAsyncClient | None = None,
-        recognizer: str | None = None,
     ):
         if languages is not None:
             if isinstance(languages, str):
@@ -360,21 +374,19 @@ class SpeechStream(stt.SpeechStream):
             self._config.model = model
         if keywords is not None:
             self._config.keywords = keywords
-        if client is not None:
-            self._client = client
-        if recognizer is not None:
-            self._recognizer = recognizer
 
         self._reconnect_event.set()
 
     async def _run(self) -> None:
         # google requires a async generator when calling streaming_recognize
         # this function basically convert the queue into a async generator
-        async def input_generator(should_stop: asyncio.Event):
+        async def input_generator(
+            client: SpeechAsyncClient, should_stop: asyncio.Event
+        ):
             try:
                 # first request should contain the config
                 yield cloud_speech.StreamingRecognizeRequest(
-                    recognizer=self._recognizer,
+                    recognizer=self._recognizer_cb(client),
                     streaming_config=self._streaming_config,
                 )
 
@@ -395,7 +407,7 @@
                     "an error occurred while streaming input to google STT"
                 )
 
-        async def process_stream(stream):
+        async def process_stream(client: SpeechAsyncClient, stream):
            has_started = False
            async for resp in stream:
                if (
@@ -437,6 +449,7 @@ class SpeechStream(stt.SpeechStream):
                    logger.debug(
                        "Google STT maximum connection time reached. Reconnecting..."
                    )
+                    self._pool.remove(client)
                    if has_started:
                        self._event_ch.send_nowait(
                            stt.SpeechEvent(
@@ -458,52 +471,57 @@
 
         while True:
             try:
-                self._streaming_config = cloud_speech.StreamingRecognitionConfig(
-                    config=cloud_speech.RecognitionConfig(
-                        explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
-                            encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
-                            sample_rate_hertz=self._config.sample_rate,
-                            audio_channel_count=1,
+                async with self._pool.connection() as client:
+                    self._streaming_config = cloud_speech.StreamingRecognitionConfig(
+                        config=cloud_speech.RecognitionConfig(
+                            explicit_decoding_config=cloud_speech.ExplicitDecodingConfig(
+                                encoding=cloud_speech.ExplicitDecodingConfig.AudioEncoding.LINEAR16,
+                                sample_rate_hertz=self._config.sample_rate,
+                                audio_channel_count=1,
+                            ),
+                            adaptation=self._config.build_adaptation(),
+                            language_codes=self._config.languages,
+                            model=self._config.model,
+                            features=cloud_speech.RecognitionFeatures(
+                                enable_automatic_punctuation=self._config.punctuate,
+                                enable_word_time_offsets=True,
+                            ),
                         ),
-                        adaptation=self._config.build_adaptation(),
-                        language_codes=self._config.languages,
-                        model=self._config.model,
-                        features=cloud_speech.RecognitionFeatures(
-                            enable_automatic_punctuation=self._config.punctuate,
-                            enable_word_time_offsets=True,
+                        streaming_features=cloud_speech.StreamingRecognitionFeatures(
+                            enable_voice_activity_events=True,
+                            interim_results=self._config.interim_results,
                         ),
-                    ),
-                    streaming_features=cloud_speech.StreamingRecognitionFeatures(
-                        enable_voice_activity_events=True,
-                        interim_results=self._config.interim_results,
-                    ),
-                )
-
-                should_stop = asyncio.Event()
-                stream = await self._client.streaming_recognize(
-                    requests=input_generator(should_stop),
-                )
-                self._session_connected_at = time.time()
+                    )
 
-                process_stream_task = asyncio.create_task(process_stream(stream))
-                wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+                    should_stop = asyncio.Event()
+                    stream = await client.streaming_recognize(
+                        requests=input_generator(client, should_stop),
+                    )
+                    self._session_connected_at = time.time()
 
-                try:
-                    done, _ = await asyncio.wait(
-                        [process_stream_task, wait_reconnect_task],
-                        return_when=asyncio.FIRST_COMPLETED,
+                    process_stream_task = asyncio.create_task(
+                        process_stream(client, stream)
                     )
-                    for task in done:
-                        if task != wait_reconnect_task:
-                            task.result()
-                    if wait_reconnect_task not in done:
-                        break
-                    self._reconnect_event.clear()
-                finally:
-                    await utils.aio.gracefully_cancel(
-                        process_stream_task, wait_reconnect_task
+                    wait_reconnect_task = asyncio.create_task(
+                        self._reconnect_event.wait()
                     )
-                    should_stop.set()
+
+                    try:
+                        done, _ = await asyncio.wait(
+                            [process_stream_task, wait_reconnect_task],
+                            return_when=asyncio.FIRST_COMPLETED,
+                        )
+                        for task in done:
+                            if task != wait_reconnect_task:
+                                task.result()
+                        if wait_reconnect_task not in done:
+                            break
+                        self._reconnect_event.clear()
+                    finally:
+                        await utils.aio.gracefully_cancel(
+                            process_stream_task, wait_reconnect_task
+                        )
+                        should_stop.set()
             except DeadlineExceeded:
                 raise APITimeoutError()
             except GoogleAPICallError as e:
@@ -15,10 +15,9 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
+from typing import Optional
 
-from livekit import rtc
 from livekit.agents import (
-    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
     APIConnectOptions,
     APIStatusError,
@@ -31,7 +30,7 @@ from google.api_core.exceptions import DeadlineExceeded, GoogleAPICallError
 from google.cloud import texttospeech
 from google.cloud.texttospeech_v1.types import SsmlVoiceGender, SynthesizeSpeechResponse
 
-from .models import AudioEncoding, Gender, SpeechLanguages
+from .models import Gender, SpeechLanguages
 
 
 @dataclass
@@ -47,7 +46,6 @@ class TTS(tts.TTS):
         language: SpeechLanguages | str = "en-US",
         gender: Gender | str = "neutral",
         voice_name: str = "",  # Not required
-        encoding: AudioEncoding | str = "linear16",
         sample_rate: int = 24000,
         pitch: int = 0,
         effects_profile_id: str = "",
@@ -66,7 +64,6 @@ class TTS(tts.TTS):
             language (SpeechLanguages | str, optional): Language code (e.g., "en-US"). Default is "en-US".
             gender (Gender | str, optional): Voice gender ("male", "female", "neutral"). Default is "neutral".
             voice_name (str, optional): Specific voice name. Default is an empty string.
-            encoding (AudioEncoding | str, optional): Audio encoding format (e.g., "linear16"). Default is "linear16".
             sample_rate (int, optional): Audio sample rate in Hz. Default is 24000.
             pitch (float, optional): Speaking pitch, ranging from -20.0 to 20.0 semitones relative to the original pitch. Default is 0.
             effects_profile_id (str): Optional identifier for selecting audio effects profiles to apply to the synthesized speech.
@@ -93,17 +90,10 @@ class TTS(tts.TTS):
             ssml_gender=_gender_from_str(gender),
         )
 
-        if encoding == "linear16" or encoding == "wav":
-            _audio_encoding = texttospeech.AudioEncoding.LINEAR16
-        elif encoding == "mp3":
-            _audio_encoding = texttospeech.AudioEncoding.MP3
-        else:
-            raise NotImplementedError(f"audio encoding {encoding} is not supported")
-
         self._opts = _TTSOptions(
             voice=voice,
             audio_config=texttospeech.AudioConfig(
-                audio_encoding=_audio_encoding,
+                audio_encoding=texttospeech.AudioEncoding.OGG_OPUS,
                 sample_rate_hertz=sample_rate,
                 pitch=pitch,
                 effects_profile_id=effects_profile_id,
@@ -160,7 +150,7 @@ class TTS(tts.TTS):
         self,
         text: str,
         *,
-        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> "ChunkedStream":
         return ChunkedStream(
             tts=self,
@@ -177,9 +167,9 @@ class ChunkedStream(tts.ChunkedStream):
         *,
         tts: TTS,
         input_text: str,
-        conn_options: APIConnectOptions,
         opts: _TTSOptions,
         client: texttospeech.TextToSpeechAsyncClient,
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._client = opts, client
@@ -195,35 +185,24 @@ class ChunkedStream(tts.ChunkedStream):
                 timeout=self._conn_options.timeout,
             )
 
-            if self._opts.audio_config.audio_encoding == "mp3":
-                decoder = utils.codecs.Mp3StreamDecoder()
-                bstream = utils.audio.AudioByteStream(
-                    sample_rate=self._opts.audio_config.sample_rate_hertz,
-                    num_channels=1,
-                )
-                for frame in decoder.decode_chunk(response.audio_content):
-                    for frame in bstream.write(frame.data.tobytes()):
-                        self._event_ch.send_nowait(
-                            tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                        )
-
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-            else:
-                data = response.audio_content[44:]  # skip WAV header
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(
-                        request_id=request_id,
-                        frame=rtc.AudioFrame(
-                            data=data,
-                            sample_rate=self._opts.audio_config.sample_rate_hertz,
-                            num_channels=1,
-                            samples_per_channel=len(data) // 2,  # 16-bit
-                        ),
-                    )
+            # Create AudioStreamDecoder for OGG format
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.audio_config.sample_rate_hertz,
+                num_channels=1,
+            )
+
+            try:
+                decoder.push(response.audio_content)
+                decoder.end_input()
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
                 )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            finally:
+                await decoder.aclose()
 
         except DeadlineExceeded:
             raise APITimeoutError()
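
Note: TTS now always requests OGG/Opus from Cloud Text-to-Speech and decodes it with `utils.codecs.AudioStreamDecoder`, replacing the old MP3/WAV branches (and the removed `encoding` argument). A hedged sketch of the decode loop on its own, with `audio_bytes` standing in for `response.audio_content`:

```python
from livekit.agents import utils

async def decode_ogg(audio_bytes: bytes) -> None:
    decoder = utils.codecs.AudioStreamDecoder(sample_rate=24000, num_channels=1)
    try:
        decoder.push(audio_bytes)    # feed the whole OGG/Opus payload
        decoder.end_input()          # signal that no more data will arrive
        async for frame in decoder:  # decoded PCM frames
            ...                      # forward each frame downstream
    finally:
        await decoder.aclose()
```
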
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.10.5"
+__version__ = "0.11.0"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-google
-Version: 0.10.5
+Version: 0.11.0
 Summary: Agent Framework plugin for services from Google Cloud
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -22,8 +22,8 @@ Description-Content-Type: text/markdown
 Requires-Dist: google-auth<3,>=2
 Requires-Dist: google-cloud-speech<3,>=2
 Requires-Dist: google-cloud-texttospeech<3,>=2
-Requires-Dist: google-genai==0.5.0
-Requires-Dist: livekit-agents>=0.12.11
+Requires-Dist: google-genai==1.3.0
+Requires-Dist: livekit-agents<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
@@ -53,3 +53,57 @@ To use the STT and TTS API, you'll need to enable the respective services for yo
 
 - Cloud Speech-to-Text API
 - Cloud Text-to-Speech API
+
+
+## Gemini Multimodal Live
+
+Gemini Multimodal Live can be used with the `MultimodalAgent` class. See examples/multimodal_agent/gemini_agent.py for an example.
+
+### Live Video Input (experimental)
+
+You can push video frames to your Gemini Multimodal Live session alongside the audio automatically handled by the `MultimodalAgent`. The basic approach is to subscribe to the video track, create a video stream, sample frames at a suitable frame rate, and push them into the RealtimeSession:
+
+```
+# Make sure you subscribe to audio and video tracks
+await ctx.connect(auto_subscribe=AutoSubscribe.SUBSCRIBE_ALL)
+
+# Create your RealtimeModel and store a reference
+model = google.beta.realtime.RealtimeModel(
+    # ...
+)
+
+# Create your MultimodalAgent as usual
+agent = MultimodalAgent(
+    model=model,
+    # ...
+)
+
+# Async method to process the video track and push frames to Gemini
+async def _process_video_track(self, track: Track):
+    video_stream = VideoStream(track)
+    last_frame_time = 0
+
+    async for event in video_stream:
+        current_time = asyncio.get_event_loop().time()
+
+        # Sample at 1 FPS
+        if current_time - last_frame_time < 1.0:
+            continue
+
+        last_frame_time = current_time
+        frame = event.frame
+
+        # Push the frame into the RealtimeSession
+        model.sessions[0].push_video(frame)
+
+    await video_stream.aclose()
+
+# Subscribe to new tracks and process them
+@ctx.room.on("track_subscribed")
+def _on_track_subscribed(track: Track, pub, participant):
+    if track.kind == TrackKind.KIND_VIDEO:
+        asyncio.create_task(self._process_video_track(track))
+```
+
+
+
@@ -0,0 +1,18 @@
+livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
+livekit/plugins/google/_utils.py,sha256=FG1_26nlWGcI6onPleQQcmGBMfb4QNYgis1B5BMJxWA,7131
+livekit/plugins/google/llm.py,sha256=LZaHsrkjfboRZLWm7L2G0mw62q2sXBNj4YeeV2Sk2uU,16717
+livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
+livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
+livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/google/stt.py,sha256=96GJmGDAIBdCpDECArwIXpj2s1xlcA_zuvTnwsvq4xA,22854
+livekit/plugins/google/tts.py,sha256=pG9_pibO3NDGEMa4huU5S9lbeyI3daQyrS17SuTKfZI,8008
+livekit/plugins/google/version.py,sha256=BvmVdoHkxksDSQP-uWrqIiyaAUImEyxSohntkIBNZRo,601
+livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
+livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
+livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
+livekit/plugins/google/beta/realtime/realtime_api.py,sha256=vZHiWNk8PorxtrHSmA7Ya6ZvCjT37YSJN-MxK8ebdrs,22795
+livekit/plugins/google/beta/realtime/transcriber.py,sha256=rjXO0cSPr3HATxrSfv1MX7IbrjmiTvnLPF280BfRBL8,9809
+livekit_plugins_google-0.11.0.dist-info/METADATA,sha256=b8Aj_eQnGhAT3DQa77KLHZBDGAWZYdrnTBWjVODAm2k,3732
+livekit_plugins_google-0.11.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_google-0.11.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_google-0.11.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
@@ -1,18 +0,0 @@
-livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
-livekit/plugins/google/_utils.py,sha256=mjsqblhGMgAZ2MNPisAVkNsqq4gfO6vvprEKzAGoVwE,7248
-livekit/plugins/google/llm.py,sha256=TVTerAabIf10AKVZr-Kn13eajhQ9RV7K4xaVD771yHU,16547
-livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
-livekit/plugins/google/models.py,sha256=Q47z_tIwLCufxhJyJHH7_1bo4xdBYZBSkkvMeycuItg,1493
-livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/google/stt.py,sha256=QcpKAcg8ltFlQnLGSdtRS2H12pFEPs1ZzLojKHB8bpY,21376
-livekit/plugins/google/tts.py,sha256=95qXCigVQYWNbcN3pIKBpIah4b31U_MWtXv5Ji0AMc4,9229
-livekit/plugins/google/version.py,sha256=na7fXYRLcWIgCRi4QSAbV4DZGA7YDgOWcE0O21jDlAo,601
-livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
-livekit/plugins/google/beta/realtime/__init__.py,sha256=sGTn6JFNyA30QUXBZ_BV3l2eHpGAzR35ByXxg77vWNU,205
-livekit/plugins/google/beta/realtime/api_proto.py,sha256=ralrRZqIbE71oyuLKRYaXHvm6tcHMwBJueKvSO8Xfus,658
-livekit/plugins/google/beta/realtime/realtime_api.py,sha256=RPGYAJXelYPo16YyR2qccjUjxUJCkJBU2N5rNTpKxyo,21281
-livekit/plugins/google/beta/realtime/transcriber.py,sha256=ZpKA3F8dqOtJPDlPiAgjw0AUDBIuhQiBVnvSYL4cdBg,9796
-livekit_plugins_google-0.10.5.dist-info/METADATA,sha256=AHhTVMBNVlOnqMnLPjncTO_iIqkDS-ExCm_5ubD9Mdg,2058
-livekit_plugins_google-0.10.5.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_google-0.10.5.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_google-0.10.5.dist-info/RECORD,,