livekit-plugins-google 0.11.1__py3-none-any.whl → 1.0.0.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,8 +1,4 @@
1
- from .api_proto import (
2
- ClientEvents,
3
- LiveAPIModels,
4
- Voice,
5
- )
1
+ from .api_proto import ClientEvents, LiveAPIModels, Voice
6
2
  from .realtime_api import RealtimeModel
7
3
 
8
4
  __all__ = [
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Literal, Sequence, Union
3
+ from collections.abc import Sequence
4
+ from typing import Literal, Union
4
5
 
5
6
  from google.genai import types
6
7
 
7
8
  from ..._utils import _build_gemini_ctx, _build_tools
8
9
 
9
- LiveAPIModels = Literal["gemini-2.0-flash-exp"]
10
+ LiveAPIModels = Literal["gemini-2.0-flash-001",]
10
11
 
11
12
  Voice = Literal["Puck", "Charon", "Kore", "Fenrir", "Aoede"]
12
13
 
@@ -3,21 +3,17 @@ from __future__ import annotations
3
3
  import asyncio
4
4
  import json
5
5
  import os
6
+ from collections.abc import AsyncIterable
6
7
  from dataclasses import dataclass
7
- from typing import AsyncIterable, Literal
8
-
9
- from livekit import rtc
10
- from livekit.agents import llm, utils
11
- from livekit.agents.llm.function_context import _create_ai_function_info
12
- from livekit.agents.utils import images
8
+ from typing import Literal
13
9
 
14
10
  from google import genai
11
+ from google.genai._api_client import HttpOptions
15
12
  from google.genai.types import (
16
13
  Blob,
17
14
  Content,
18
15
  FunctionResponse,
19
16
  GenerationConfig,
20
- HttpOptions,
21
17
  LiveClientContent,
22
18
  LiveClientRealtimeInput,
23
19
  LiveClientToolResponse,
@@ -29,15 +25,13 @@ from google.genai.types import (
29
25
  Tool,
30
26
  VoiceConfig,
31
27
  )
28
+ from livekit import rtc
29
+ from livekit.agents import llm, utils
30
+ from livekit.agents.llm.function_context import _create_ai_function_info
31
+ from livekit.agents.utils import images
32
32
 
33
33
  from ...log import logger
34
- from .api_proto import (
35
- ClientEvents,
36
- LiveAPIModels,
37
- Voice,
38
- _build_gemini_ctx,
39
- _build_tools,
40
- )
34
+ from .api_proto import ClientEvents, LiveAPIModels, Voice, _build_gemini_ctx, _build_tools
41
35
  from .transcriber import ModelTranscriber, TranscriberSession, TranscriptionContent
42
36
 
43
37
  EventTypes = Literal[
@@ -83,7 +77,6 @@ class Capabilities:
83
77
  class ModelOptions:
84
78
  model: LiveAPIModels | str
85
79
  api_key: str | None
86
- api_version: str
87
80
  voice: Voice | str
88
81
  response_modalities: list[Modality] | None
89
82
  vertexai: bool
@@ -108,9 +101,8 @@ class RealtimeModel:
108
101
  instructions: str | None = None,
109
102
  model: LiveAPIModels | str = "gemini-2.0-flash-exp",
110
103
  api_key: str | None = None,
111
- api_version: str = "v1alpha",
112
104
  voice: Voice | str = "Puck",
113
- modalities: list[Modality] = [Modality.AUDIO],
105
+ modalities: list[Modality] = None,
114
106
  enable_user_audio_transcription: bool = True,
115
107
  enable_agent_audio_transcription: bool = True,
116
108
  vertexai: bool = False,
@@ -138,7 +130,6 @@ class RealtimeModel:
138
130
  Args:
139
131
  instructions (str, optional): Initial system instructions for the model. Defaults to "".
140
132
  api_key (str or None, optional): Google Gemini API key. If None, will attempt to read from the environment variable GOOGLE_API_KEY.
141
- api_version (str, optional): The version of the API to use. Defaults to "v1alpha".
142
133
  modalities (list[Modality], optional): Modalities to use, such as ["TEXT", "AUDIO"]. Defaults to ["AUDIO"].
143
134
  model (str or None, optional): The name of the model to use. Defaults to "gemini-2.0-flash-exp".
144
135
  voice (api_proto.Voice, optional): Voice setting for audio outputs. Defaults to "Puck".
@@ -158,6 +149,8 @@ class RealtimeModel:
158
149
  Raises:
159
150
  ValueError: If the API key is not provided and cannot be found in environment variables.
160
151
  """
152
+ if modalities is None:
153
+ modalities = ["AUDIO"]
161
154
  super().__init__()
162
155
  self._capabilities = Capabilities(
163
156
  supports_truncate=False,
@@ -183,14 +176,11 @@ class RealtimeModel:
183
176
  "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable"
184
177
  )
185
178
 
186
- instructions_content = (
187
- Content(parts=[Part(text=instructions)]) if instructions else None
188
- )
179
+ instructions_content = Content(parts=[Part(text=instructions)]) if instructions else None
189
180
 
190
181
  self._rt_sessions: list[GeminiRealtimeSession] = []
191
182
  self._opts = ModelOptions(
192
183
  model=model,
193
- api_version=api_version,
194
184
  api_key=self._api_key,
195
185
  voice=voice,
196
186
  enable_user_audio_transcription=enable_user_audio_transcription,
@@ -263,8 +253,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
263
253
  self._fnc_ctx = fnc_ctx
264
254
  self._fnc_tasks = utils.aio.TaskSet()
265
255
  self._is_interrupted = False
266
- self._playout_complete = asyncio.Event()
267
- self._playout_complete.set()
268
256
 
269
257
  tools = []
270
258
  if self._fnc_ctx is not None:
@@ -285,32 +273,24 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
285
273
  system_instruction=self._opts.instructions,
286
274
  speech_config=SpeechConfig(
287
275
  voice_config=VoiceConfig(
288
- prebuilt_voice_config=PrebuiltVoiceConfig(
289
- voice_name=self._opts.voice
290
- )
276
+ prebuilt_voice_config=PrebuiltVoiceConfig(voice_name=self._opts.voice)
291
277
  )
292
278
  ),
293
279
  tools=tools,
294
280
  )
295
281
  self._client = genai.Client(
296
- http_options=HttpOptions(api_version=self._opts.api_version),
282
+ http_options=HttpOptions(api_version="v1alpha"),
297
283
  api_key=self._opts.api_key,
298
284
  vertexai=self._opts.vertexai,
299
285
  project=self._opts.project,
300
286
  location=self._opts.location,
301
287
  )
302
- self._main_atask = asyncio.create_task(
303
- self._main_task(), name="gemini-realtime-session"
304
- )
288
+ self._main_atask = asyncio.create_task(self._main_task(), name="gemini-realtime-session")
305
289
  if self._opts.enable_user_audio_transcription:
306
- self._transcriber = TranscriberSession(
307
- client=self._client, model=self._opts.model
308
- )
290
+ self._transcriber = TranscriberSession(client=self._client, model=self._opts.model)
309
291
  self._transcriber.on("input_speech_done", self._on_input_speech_done)
310
292
  if self._opts.enable_agent_audio_transcription:
311
- self._agent_transcriber = ModelTranscriber(
312
- client=self._client, model=self._opts.model
313
- )
293
+ self._agent_transcriber = ModelTranscriber(client=self._client, model=self._opts.model)
314
294
  self._agent_transcriber.on("input_speech_done", self._on_agent_speech_done)
315
295
  # init dummy task
316
296
  self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
@@ -324,10 +304,6 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
324
304
  self._send_ch.close()
325
305
  await self._main_atask
326
306
 
327
- @property
328
- def playout_complete(self) -> asyncio.Event | None:
329
- return self._playout_complete
330
-
331
307
  @property
332
308
  def fnc_ctx(self) -> llm.FunctionContext | None:
333
309
  return self._fnc_ctx
@@ -345,9 +321,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
345
321
  DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
346
322
  format="JPEG",
347
323
  quality=75,
348
- resize_options=images.ResizeOptions(
349
- width=1024, height=1024, strategy="scale_aspect_fit"
350
- ),
324
+ resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
351
325
  )
352
326
 
353
327
  def push_video(
@@ -397,9 +371,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
397
371
 
398
372
  def create_response(
399
373
  self,
400
- on_duplicate: Literal[
401
- "cancel_existing", "cancel_new", "keep_both"
402
- ] = "keep_both",
374
+ on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
403
375
  ) -> None:
404
376
  turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
405
377
  ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
@@ -485,8 +457,7 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
485
457
  data=part.inline_data.data,
486
458
  sample_rate=24000,
487
459
  num_channels=1,
488
- samples_per_channel=len(part.inline_data.data)
489
- // 2,
460
+ samples_per_channel=len(part.inline_data.data) // 2,
490
461
  )
491
462
  if self._opts.enable_agent_audio_transcription:
492
463
  content.audio.append(frame)
@@ -529,12 +500,12 @@ class GeminiRealtimeSession(utils.EventEmitter[EventTypes]):
529
500
  logger.warning(
530
501
  "function call cancelled",
531
502
  extra={
532
- "function_call_ids": response.tool_call_cancellation.ids,
503
+ "function_call_ids": response.tool_call_cancellation.function_call_ids,
533
504
  },
534
505
  )
535
506
  self.emit(
536
507
  "function_calls_cancelled",
537
- response.tool_call_cancellation.ids,
508
+ response.tool_call_cancellation.function_call_ids,
538
509
  )
539
510
 
540
511
  async with self._client.aio.live.connect(
@@ -6,12 +6,12 @@ from dataclasses import dataclass
6
6
  from typing import Literal
7
7
 
8
8
  import websockets
9
- from livekit import rtc
10
- from livekit.agents import APIConnectionError, APIStatusError, utils
11
9
 
12
10
  from google import genai
13
11
  from google.genai import types
14
12
  from google.genai.errors import APIError, ClientError, ServerError
13
+ from livekit import rtc
14
+ from livekit.agents import APIConnectionError, APIStatusError, utils
15
15
 
16
16
  from ...log import logger
17
17
  from .api_proto import ClientEvents, LiveAPIModels
@@ -51,11 +51,9 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
51
51
  self._needed_sr = 16000
52
52
  self._closed = False
53
53
 
54
- system_instructions = types.Content(
55
- parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
56
- )
54
+ system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
57
55
  self._config = types.LiveConnectConfig(
58
- response_modalities=[types.Modality.TEXT],
56
+ response_modalities=["TEXT"],
59
57
  system_instruction=system_instructions,
60
58
  generation_config=types.GenerationConfig(temperature=0.0),
61
59
  )
@@ -81,17 +79,13 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
81
79
  for f in self._resampler.push(frame):
82
80
  self._queue_msg(
83
81
  types.LiveClientRealtimeInput(
84
- media_chunks=[
85
- types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")
86
- ]
82
+ media_chunks=[types.Blob(data=f.data.tobytes(), mime_type="audio/pcm")]
87
83
  )
88
84
  )
89
85
  else:
90
86
  self._queue_msg(
91
87
  types.LiveClientRealtimeInput(
92
- media_chunks=[
93
- types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")
94
- ]
88
+ media_chunks=[types.Blob(data=frame.data.tobytes(), mime_type="audio/pcm")]
95
89
  )
96
90
  )
97
91
 
@@ -157,17 +151,11 @@ class TranscriberSession(utils.EventEmitter[EventTypes]):
157
151
  logger.exception(f"Uncaught error in transcriber _recv_task: {e}")
158
152
  self._closed = True
159
153
 
160
- async with self._client.aio.live.connect(
161
- model=self._model, config=self._config
162
- ) as session:
154
+ async with self._client.aio.live.connect(model=self._model, config=self._config) as session:
163
155
  self._session = session
164
156
  tasks = [
165
- asyncio.create_task(
166
- _send_task(), name="gemini-realtime-transcriber-send"
167
- ),
168
- asyncio.create_task(
169
- _recv_task(), name="gemini-realtime-transcriber-recv"
170
- ),
157
+ asyncio.create_task(_send_task(), name="gemini-realtime-transcriber-send"),
158
+ asyncio.create_task(_recv_task(), name="gemini-realtime-transcriber-recv"),
171
159
  ]
172
160
 
173
161
  try:
@@ -187,9 +175,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
187
175
  self._client = client
188
176
  self._model = model
189
177
  self._needed_sr = 16000
190
- self._system_instructions = types.Content(
191
- parts=[types.Part(text=SYSTEM_INSTRUCTIONS)]
192
- )
178
+ self._system_instructions = types.Content(parts=[types.Part(text=SYSTEM_INSTRUCTIONS)])
193
179
  self._config = types.GenerateContentConfig(
194
180
  temperature=0.0,
195
181
  system_instruction=self._system_instructions,
@@ -198,9 +184,7 @@ class ModelTranscriber(utils.EventEmitter[EventTypes]):
198
184
  self._resampler: rtc.AudioResampler | None = None
199
185
  self._buffer: rtc.AudioFrame | None = None
200
186
  self._audio_ch = utils.aio.Chan[rtc.AudioFrame]()
201
- self._main_atask = asyncio.create_task(
202
- self._main_task(), name="gemini-model-transcriber"
203
- )
187
+ self._main_atask = asyncio.create_task(self._main_task(), name="gemini-model-transcriber")
204
188
 
205
189
  async def aclose(self) -> None:
206
190
  if self._audio_ch.closed: