livekit-plugins-google 0.11.0__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,4 @@
1
- from .api_proto import (
2
- ClientEvents,
3
- LiveAPIModels,
4
- Voice,
5
- )
1
+ from .api_proto import ClientEvents, LiveAPIModels, Voice
6
2
  from .realtime_api import RealtimeModel
7
3
 
8
4
  __all__ = [
@@ -1,6 +1,7 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Literal, Sequence, Union
3
+ from collections.abc import Sequence
4
+ from typing import Literal, Union
4
5
 
5
6
  from google.genai import types
6
7
 
@@ -3,21 +3,17 @@ from __future__ import annotations
3
3
  import asyncio
4
4
  import json
5
5
  import os
6
+ from collections.abc import AsyncIterable
6
7
  from dataclasses import dataclass
7
- from typing import AsyncIterable, Literal
8
-
9
- from livekit import rtc
10
- from livekit.agents import llm, utils
11
- from livekit.agents.llm.function_context import _create_ai_function_info
12
- from livekit.agents.utils import images
8
+ from typing import Literal
13
9
 
14
10
  from google import genai
11
+ from google.genai._api_client import HttpOptions
15
12
  from google.genai.types import (
16
13
  Blob,
17
14
  Content,
18
15
  FunctionResponse,
19
16
  GenerationConfig,
20
- HttpOptions,
21
17
  LiveClientContent,
22
18
  LiveClientRealtimeInput,
23
19
  LiveClientToolResponse,
@@ -29,15 +25,13 @@ from google.genai.types import (
29
25
  Tool,
30
26
  VoiceConfig,
31
27
  )
28
+ from livekit import rtc
29
+ from livekit.agents import llm, utils
30
+ from livekit.agents.llm.function_context import _create_ai_function_info
31
+ from livekit.agents.utils import images
32
32
 
33
33
  from ...log import logger
34
- from .api_proto import (
35
- ClientEvents,
36
- LiveAPIModels,
37
- Voice,
38
- _build_gemini_ctx,
39
- _build_tools,
40
- )
34
+ from .api_proto import ClientEvents, LiveAPIModels, Voice, _build_gemini_ctx, _build_tools
41
35
  from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
42
36
 
43
37
  EventTypes = Literal[
@@ -108,7 +102,7 @@ class RealtimeModel:
108
102
  model: LiveAPIModels | str = "gemini-2.0-flash-exp",
109
103
  api_key: str | None = None,
110
104
  voice: Voice | str = "Puck",
111
- modalities: list[Modality] = [Modality.AUDIO],
105
+ modalities: list[Modality] = None,
112
106
  enable_user_audio_transcription: bool = True,
113
107
  enable_agent_audio_transcription: bool = True,
114
108
  vertexai: bool = False,
@@ -155,6 +149,8 @@ class RealtimeModel:
155
149
  Raises:
156
150
  ValueError: If the API key is not provided and cannot be found in environment variables.
157
151
  """
152
+ if modalities is None:
153
+ modalities = ["AUDIO"]
158
154
  super().__init__()
159
155
  self._capabilities = Capabilities(
160
156
  supports_truncate=False,
@@ -180,9 +176,7 @@ class RealtimeModel:
180
176
  "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"
181
177
  )
182
178
 
183
- instructions_content = (
184
- Content(parts=[Part(text=instructions)]) if instructions else None
185
- )
179
+ instructions_content = Content(parts=[Part(text=instructions)]) if instructions else None
186
180
 
187
181
  self._rt_sessions: list[GeminiRealtimeSession] = []
188
182
  self._opts = ModelOptions(
@@ -259,8 +253,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
259
253
  self._fnc_ctx = fnc_ctx
260
254
  self._fnc_tasks = utils.aio.TaskSet()
261
255
  self._is_interrupted = False
262
- self._playout_complete = asyncio.Event()
263
- self._playout_complete.set()
264
256
 
265
257
  tools = []
266
258
  if self._fnc_ctx is not None:
@@ -281,9 +273,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
281
273
  system_instruction=self._opts.instructions,
282
274
  speech_config=SpeechConfig(
283
275
  voice_config=VoiceConfig(
284
- prebuilt_voice_config=PrebuiltVoiceConfig(
285
- voice_name=self._opts.voice
286
- )
276
+ prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
287
277
  )
288
278
  ),
289
279
  tools=tools,
@@ -295,18 +285,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
295
285
  project=self._opts.project,
296
286
  location=self._opts.location,
297
287
  )
298
- self._main_atask = asyncio.create_task(
299
- self._main_task(), name="gemini-realtime-session"
300
- )
288
+ self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
301
289
  if self._opts.enable_user_audio_transcription:
302
- self._transcriber = TranscriberSession(
303
- client=self._client, model=self._opts.model
304
- )
290
+ self._transcriber = TranscriberSession(client=self._client, model=self._opts.model)
305
291
  self._transcriber.on("input_speech_done", self._on_input_speech_done)
306
292
  if self._opts.enable_agent_audio_transcription:
307
- self._agent_transcriber = ModelTranscriber(
308
- client=self._client, model=self._opts.model
309
- )
293
+ self._agent_transcriber = ModelTranscriber(client=self._client, model=self._opts.model)
310
294
  self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
311
295
  # init dummy task
312
296
  self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
@@ -320,10 +304,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
320
304
  self._send_ch.close()
321
305
  await self._main_atask
322
306
 
323
- @property
324
- def playout_complete(self) -> asyncio.Event | None:
325
- return self._playout_complete
326
-
327
307
  @property
328
308
  def fnc_ctx(self) -> llm.FunctionContext | None:
329
309
  return self._fnc_ctx
@@ -341,9 +321,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
341
321
  DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
342
322
  format="JPEG",
343
323
  quality=75,
344
- resize_options=images.ResizeOptions(
345
- width=1024, height=1024, strategy="scale_aspect_fit"
346
- ),
324
+ resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
347
325
  )
348
326
 
349
327
  def push_video(
@@ -393,9 +371,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
393
371
 
394
372
  def create_response(
395
373
  self,
396
- on_duplicate: Literal[
397
- "cancel_existing", "cancel_new", "keep_both"
398
- ] = "keep_both",
374
+ on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
399
375
  ) -> None:
400
376
  turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
401
377
  ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
@@ -481,8 +457,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
481
457
  data=part.inline_data.data,
482
458
  sample_rate=24000,
483
459
  num_channels=1,
484
- samples_per_channel=len(part.inline_data.data)
485
- // 2,
460
+ samples_per_channel=len(part.inline_data.data) // 2,
486
461
  )
487
462
  if self._opts.enable_agent_audio_transcription:
488
463
  content.audio.append(frame)
@@ -525,12 +500,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
525
500
  logger.warning(
526
501
  "function call cancelled",
527
502
  extra={
528
- "function_call_ids": response.tool_call_cancellation.ids,
503
+ "function_call_ids": response.tool_call_cancellation.function_call_ids,
529
504
  },
530
505
  )
531
506
  self.emit(
532
507
  "function_calls_cancelled",
533
- response.tool_call_cancellation.ids,
508
+ response.tool_call_cancellation.function_call_ids,
534
509
  )
535
510
 
536
511
  async with self._client.aio.live.connect(
@@ -6,12 +6,12 @@ from dataclasses import dataclass
6
6
  from typing import Literal
7
7
 
8
8
  import websockets
9
- from livekit import rtc
10
- from livekit.agents import APIConnectionError, APIStatusError, utils
11
9
 
12
10
  from google import genai
13
11
  from google.genai import types
14
12
  from google.genai.errors import APIError, ClientError, ServerError
13
+ from livekit import rtc
14
+ from livekit.agents import APIConnectionError, APIStatusError, utils
15
15
 
16
16
  from ...log import logger
17
17
  from .api_proto import ClientEvents, LiveAPIModels
@@ -51,11 +51,9 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
51
51
  self._needed_sr = 16000
52
52
  self._closed = False
53
53
 
54
- system_instructions = types.Content(
55
- parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
56
- )
54
+ system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
57
55
  self._config = types.LiveConnectConfig(
58
- response_modalities=[types.Modality.TEXT],
56
+ response_modalities=["TEXT"],
59
57
  system_instruction=system_instructions,
60
58
  generation_config=types.GenerationConfig(temperature=0.0),
61
59
  )
@@ -81,17 +79,13 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
81
79
  for f in self._resampler.push(frame):
82
80
  self._queue_msg(
83
81
  types.LiveClientRealtimeInput(
84
- media_chunks=[
85
- types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")
86
- ]
82
+ media_chunks=[types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")]
87
83
  )
88
84
  )
89
85
  else:
90
86
  self._queue_msg(
91
87
  types.LiveClientRealtimeInput(
92
- media_chunks=[
93
- types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")
94
- ]
88
+ media_chunks=[types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")]
95
89
  )
96
90
  )
97
91
 
@@ -157,17 +151,11 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
157
151
  logger.exception(f"Uncaught error in transcriber _recv_task: {e}")
158
152
  self._closed = True
159
153
 
160
- async with self._client.aio.live.connect(
161
- model=self._model, config=self._config
162
- ) as session:
154
+ async with self._client.aio.live.connect(model=self._model, config=self._config) as session:
163
155
  self._session = session
164
156
  tasks = [
165
- asyncio.create_task(
166
- _send_task(), name="gemini-realtime-transcriber-send"
167
- ),
168
- asyncio.create_task(
169
- _recv_task(), name="gemini-realtime-transcriber-recv"
170
- ),
157
+ asyncio.create_task(_send_task(), name="gemini-realtime-transcriber-send"),
158
+ asyncio.create_task(_recv_task(), name="gemini-realtime-transcriber-recv"),
171
159
  ]
172
160
 
173
161
  try:
@@ -187,9 +175,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
187
175
  self._client = client
188
176
  self._model = model
189
177
  self._needed_sr = 16000
190
- self._system_instructions = types.Content(
191
- parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
192
- )
178
+ self._system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
193
179
  self._config = types.GenerateContentConfig(
194
180
  temperature=0.0,
195
181
  system_instruction=self._system_instructions,
@@ -198,9 +184,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
198
184
  self._resampler: rtc.AudioResampler | None = None
199
185
  self._buffer: rtc.AudioFrame | None = None
200
186
  self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
201
- self._main_atask = asyncio.create_task(
202
- self._main_task(), name="gemini-model-transcriber"
203
- )
187
+ self._main_atask = asyncio.create_task(self._main_task(), name="gemini-model-transcriber")
204
188
 
205
189
  async def aclose(self) -> None:
206
190
  if self._audio_ch.closed: