livekit-plugins-google 1.0.11__py3-none-any.whl → 1.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,8 +7,8 @@ import weakref
7
7
  from dataclasses import dataclass
8
8
 
9
9
  from google import genai
10
- from google.genai._api_client import HttpOptions
11
10
  from google.genai.types import (
11
+ AudioTranscriptionConfig,
12
12
  Blob,
13
13
  Content,
14
14
  FunctionDeclaration,
@@ -17,6 +17,7 @@ from google.genai.types import (
17
17
  LiveClientRealtimeInput,
18
18
  LiveConnectConfig,
19
19
  LiveServerContent,
20
+ LiveServerGoAway,
20
21
  LiveServerToolCall,
21
22
  LiveServerToolCallCancellation,
22
23
  Modality,
@@ -24,12 +25,13 @@ from google.genai.types import (
24
25
  PrebuiltVoiceConfig,
25
26
  SpeechConfig,
26
27
  Tool,
28
+ UsageMetadata,
27
29
  VoiceConfig,
28
30
  )
29
31
  from livekit import rtc
30
32
  from livekit.agents import llm, utils
31
33
  from livekit.agents.types import NOT_GIVEN, NotGivenOr
32
- from livekit.agents.utils import is_given
34
+ from livekit.agents.utils import images, is_given
33
35
 
34
36
  from ...log import logger
35
37
  from ...utils import _build_gemini_fnc, get_tool_results_for_realtime, to_chat_ctx
@@ -39,6 +41,12 @@ INPUT_AUDIO_SAMPLE_RATE = 16000
39
41
  OUTPUT_AUDIO_SAMPLE_RATE = 24000
40
42
  NUM_CHANNELS = 1
41
43
 
44
+ DEFAULT_ENCODE_OPTIONS = images.EncodeOptions(
45
+ format="JPEG",
46
+ quality=75,
47
+ resize_options=images.ResizeOptions(width=1024, height=1024, strategy="scale_aspect_fit"),
48
+ )
49
+
42
50
 
43
51
  @dataclass
44
52
  class InputTranscription:
@@ -63,6 +71,8 @@ class _RealtimeOptions:
63
71
  presence_penalty: NotGivenOr[float]
64
72
  frequency_penalty: NotGivenOr[float]
65
73
  instructions: NotGivenOr[str]
74
+ input_audio_transcription: AudioTranscriptionConfig | None
75
+ output_audio_transcription: AudioTranscriptionConfig | None
66
76
 
67
77
 
68
78
  @dataclass
@@ -99,6 +109,8 @@ class RealtimeModel(llm.RealtimeModel):
99
109
  top_k: NotGivenOr[int] = NOT_GIVEN,
100
110
  presence_penalty: NotGivenOr[float] = NOT_GIVEN,
101
111
  frequency_penalty: NotGivenOr[float] = NOT_GIVEN,
112
+ input_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
113
+ output_audio_transcription: NotGivenOr[AudioTranscriptionConfig | None] = NOT_GIVEN,
102
114
  ) -> None:
103
115
  """
104
116
  Initializes a RealtimeModel instance for interacting with Google's Realtime API.
@@ -125,6 +137,8 @@ class RealtimeModel(llm.RealtimeModel):
125
137
  top_k (int, optional): The top-k value for response generation
126
138
  presence_penalty (float, optional): The presence penalty for response generation
127
139
  frequency_penalty (float, optional): The frequency penalty for response generation
140
+ input_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for input audio transcription. Defaults to None.
141
+ output_audio_transcription (AudioTranscriptionConfig | None, optional): The configuration for output audio transcription. Defaults to AudioTranscriptionConfig().
128
142
 
129
143
  Raises:
130
144
  ValueError: If the API key is required but not found.
@@ -155,6 +169,11 @@ class RealtimeModel(llm.RealtimeModel):
155
169
  "API key is required for Google API either via api_key or GOOGLE_API_KEY environment variable" # noqa: E501
156
170
  )
157
171
 
172
+ if not is_given(input_audio_transcription):
173
+ input_audio_transcription = None
174
+ if not is_given(output_audio_transcription):
175
+ output_audio_transcription = AudioTranscriptionConfig()
176
+
158
177
  self._opts = _RealtimeOptions(
159
178
  model=model,
160
179
  api_key=gemini_api_key,
@@ -171,6 +190,8 @@ class RealtimeModel(llm.RealtimeModel):
171
190
  presence_penalty=presence_penalty,
172
191
  frequency_penalty=frequency_penalty,
173
192
  instructions=instructions,
193
+ input_audio_transcription=input_audio_transcription,
194
+ output_audio_transcription=output_audio_transcription,
174
195
  )
175
196
 
176
197
  self._sessions = weakref.WeakSet[RealtimeSession]()
@@ -204,7 +225,6 @@ class RealtimeSession(llm.RealtimeSession):
204
225
  self._msg_ch = utils.aio.Chan[ClientEvents]()
205
226
  self._gemini_tools: list[Tool] = []
206
227
  self._client = genai.Client(
207
- http_options=HttpOptions(api_version="v1alpha"),
208
228
  api_key=self._opts.api_key,
209
229
  vertexai=self._opts.vertexai,
210
230
  project=self._opts.project,
@@ -299,8 +319,15 @@ class RealtimeSession(llm.RealtimeSession):
299
319
  return self._tools
300
320
 
301
321
  def push_audio(self, frame: rtc.AudioFrame) -> None:
322
+ self.push_media(frame.data.tobytes(), "audio/pcm")
323
+
324
+ def push_video(self, frame: rtc.VideoFrame) -> None:
325
+ encoded_data = images.encode(frame, DEFAULT_ENCODE_OPTIONS)
326
+ self.push_media(encoded_data, "image/jpeg")
327
+
328
+ def push_media(self, bytes: bytes, mime_type: str) -> None:
302
329
  realtime_input = LiveClientRealtimeInput(
303
- media_chunks=[Blob(data=frame.data.tobytes(), mime_type="audio/pcm")],
330
+ media_chunks=[Blob(data=bytes, mime_type=mime_type)]
304
331
  )
305
332
  self._msg_ch.send_nowait(realtime_input)
306
333
 
@@ -381,6 +408,8 @@ class RealtimeSession(llm.RealtimeSession):
381
408
  )
382
409
  ),
383
410
  tools=self._gemini_tools,
411
+ input_audio_transcription=self._opts.input_audio_transcription,
412
+ output_audio_transcription=self._opts.output_audio_transcription,
384
413
  )
385
414
 
386
415
  async with self._client.aio.live.connect(
@@ -404,12 +433,18 @@ class RealtimeSession(llm.RealtimeSession):
404
433
  async for response in session.receive():
405
434
  if self._active_response_id is None:
406
435
  self._start_new_generation()
436
+ if response.setup_complete:
437
+ logger.info("connection established with gemini live api server")
407
438
  if response.server_content:
408
439
  self._handle_server_content(response.server_content)
409
440
  if response.tool_call:
410
441
  self._handle_tool_calls(response.tool_call)
411
442
  if response.tool_call_cancellation:
412
443
  self._handle_tool_call_cancellation(response.tool_call_cancellation)
444
+ if response.usage_metadata:
445
+ self._handle_usage_metadata(response.usage_metadata)
446
+ if response.go_away:
447
+ self._handle_go_away(response.go_away)
413
448
 
414
449
  send_task = asyncio.create_task(_send_task(), name="gemini-realtime-send")
415
450
  recv_task = asyncio.create_task(_recv_task(), name="gemini-realtime-recv")
@@ -497,6 +532,17 @@ class RealtimeSession(llm.RealtimeSession):
497
532
  samples_per_channel=len(frame_data) // 2,
498
533
  )
499
534
  item_generation.audio_ch.send_nowait(frame)
535
+ input_transcription = server_content.input_transcription
536
+ if input_transcription and input_transcription.text:
537
+ self.emit(
538
+ "input_audio_transcription_completed",
539
+ llm.InputTranscriptionCompleted(
540
+ item_id=self._active_response_id, transcript=input_transcription.text
541
+ ),
542
+ )
543
+ output_transcription = server_content.output_transcription
544
+ if output_transcription and output_transcription.text:
545
+ item_generation.text_ch.send_nowait(output_transcription.text)
500
546
 
501
547
  if server_content.interrupted or server_content.turn_complete:
502
548
  self._finalize_response()
@@ -522,7 +568,7 @@ class RealtimeSession(llm.RealtimeSession):
522
568
  for fnc_call in tool_call.function_calls:
523
569
  self._current_generation.function_ch.send_nowait(
524
570
  llm.FunctionCall(
525
- call_id=fnc_call.id,
571
+ call_id=fnc_call.id or "",
526
572
  name=fnc_call.name,
527
573
  arguments=json.dumps(fnc_call.args),
528
574
  )
@@ -540,6 +586,16 @@ class RealtimeSession(llm.RealtimeSession):
540
586
  )
541
587
  self.emit("function_calls_cancelled", tool_call_cancellation.ids)
542
588
 
589
+ def _handle_usage_metadata(self, usage_metadata: UsageMetadata):
590
+ # todo: handle metrics
591
+ logger.info("Usage metadata", extra={"usage_metadata": usage_metadata})
592
+
593
+ def _handle_go_away(self, go_away: LiveServerGoAway):
594
+ # should we reconnect?
595
+ logger.warning(
596
+ f"gemini live api server will soon disconnect. time left: {go_away.time_left}"
597
+ )
598
+
543
599
  def commit_audio(self) -> None:
544
600
  raise NotImplementedError("commit_audio_buffer is not supported yet")
545
601
 
@@ -26,6 +26,7 @@ from google.genai import types
26
26
  from google.genai.errors import APIError, ClientError, ServerError
27
27
  from livekit.agents import APIConnectionError, APIStatusError, llm, utils
28
28
  from livekit.agents.llm import FunctionTool, ToolChoice, utils as llm_utils
29
+ from livekit.agents.llm.tool_context import get_function_info
29
30
  from livekit.agents.types import (
30
31
  DEFAULT_API_CONNECT_OPTIONS,
31
32
  NOT_GIVEN,
@@ -173,7 +174,9 @@ class LLM(llm.LLM):
173
174
  gemini_tool_choice = types.ToolConfig(
174
175
  function_calling_config=types.FunctionCallingConfig(
175
176
  mode="ANY",
176
- allowed_function_names=[fnc.name for fnc in tools],
177
+ allowed_function_names=[get_function_info(fnc).name for fnc in tools]
178
+ if tools
179
+ else None,
177
180
  )
178
181
  )
179
182
  extra["tool_config"] = gemini_tool_choice
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.0.11"
15
+ __version__ = "1.0.13"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-google
3
- Version: 1.0.11
3
+ Version: 1.0.13
4
4
  Summary: Agent Framework plugin for services from Google Cloud
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -21,8 +21,8 @@ Requires-Python: >=3.9.0
21
21
  Requires-Dist: google-auth<3,>=2
22
22
  Requires-Dist: google-cloud-speech<3,>=2
23
23
  Requires-Dist: google-cloud-texttospeech<3,>=2
24
- Requires-Dist: google-genai==1.5.0
25
- Requires-Dist: livekit-agents>=1.0.11
24
+ Requires-Dist: google-genai>=1.10.0
25
+ Requires-Dist: livekit-agents>=1.0.13
26
26
  Description-Content-Type: text/markdown
27
27
 
28
28
  # LiveKit Plugins Google
@@ -1,16 +1,16 @@
1
1
  livekit/plugins/google/__init__.py,sha256=e_kSlFNmKhyyeliz7f4WOKc_Y0-y39QjO5nCWuguhss,1171
2
- livekit/plugins/google/llm.py,sha256=81LCCJPmpMOkApX0S0a-zu5xIvcm2Pk8lTTz-PoK5m0,14740
2
+ livekit/plugins/google/llm.py,sha256=yAm-to2ItTJ7dAHc-2mlPeI0Npz9ZxRdyuRLV8PINqg,14888
3
3
  livekit/plugins/google/log.py,sha256=GI3YWN5YzrafnUccljzPRS_ZALkMNk1i21IRnTl2vNA,69
4
4
  livekit/plugins/google/models.py,sha256=SGjAumdDK97NNLwMFcqZdKR68f1NoGB2Rk1UP2-imG0,1457
5
5
  livekit/plugins/google/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
6
  livekit/plugins/google/stt.py,sha256=AG_lh2fuuduJi0jFbA_QKFXLJ6NUdF1W_FfkLUJML_Q,22413
7
7
  livekit/plugins/google/tts.py,sha256=P8Zu2s0TfmyzlrNxzDIqyn3sGiNSW0n3nB_JlO_ojiM,7985
8
8
  livekit/plugins/google/utils.py,sha256=pbLSOAdQxInWhgI2Yhsrr9KvgvpFXYDdU2yx2p03pFg,9437
9
- livekit/plugins/google/version.py,sha256=TjFqFuavgK4Qts5EiYNxeFse7oxLYPJZGuOP2eE4XVU,601
9
+ livekit/plugins/google/version.py,sha256=i9Tq4ZlIN5uba7xHRxp31dxAE9NuzqobM8zWhdM4QgA,601
10
10
  livekit/plugins/google/beta/__init__.py,sha256=AxRYc7NGG62Tv1MmcZVCDHNvlhbC86hM-_yP01Qb28k,47
11
11
  livekit/plugins/google/beta/realtime/__init__.py,sha256=_fW2NMN22F-hnQ4xAJ_g5lPbR7CvM_xXzSWlUQY-E-U,188
12
12
  livekit/plugins/google/beta/realtime/api_proto.py,sha256=cwpFOYjN_3v5PMY0TnzoHhJoASfZ7Qt9IO281ZhJ7Ww,565
13
- livekit/plugins/google/beta/realtime/realtime_api.py,sha256=ubF2Ha9zCD28gQrrjTcX3MWgMBs7bC3rI0DUdaHAa_Q,22021
14
- livekit_plugins_google-1.0.11.dist-info/METADATA,sha256=ryOwrpvbgiZuDJb0WVP3i8oOoyE3Cu3XSzZ5uThgXYs,3491
15
- livekit_plugins_google-1.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
- livekit_plugins_google-1.0.11.dist-info/RECORD,,
13
+ livekit/plugins/google/beta/realtime/realtime_api.py,sha256=JBEEOeTl6gv6Fe6GtYJjj9C-dqvfhWpOzNAa0tnTKgM,25002
14
+ livekit_plugins_google-1.0.13.dist-info/METADATA,sha256=u8ocRjsu24AzO_FRgqYZzDqc3gKnQGp1hprKBc3RFm4,3492
15
+ livekit_plugins_google-1.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ livekit_plugins_google-1.0.13.dist-info/RECORD,,