dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/google/gemini_live/llm.py
@@ -0,0 +1,1582 @@
1
+ #
2
+ # Copyright (c) 2024–2025, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ """Google Gemini Live API service implementation.
8
+
9
+ This module provides real-time conversational AI capabilities using Google's
10
+ Gemini Live API, supporting both text and audio modalities with
11
+ voice transcription, streaming responses, and tool usage.
12
+ """
13
+
14
+ import base64
15
+ import io
16
+ import json
17
+ import random
18
+ import time
19
+ import uuid
20
+ from dataclasses import dataclass
21
+ from enum import Enum
22
+ from typing import Any, Dict, List, Optional, Union
23
+
24
+ from loguru import logger
25
+ from PIL import Image
26
+ from pydantic import BaseModel, Field
27
+
28
+ from pipecat.adapters.schemas.tools_schema import ToolsSchema
29
+ from pipecat.adapters.services.gemini_adapter import GeminiLLMAdapter
30
+ from pipecat.frames.frames import (
31
+ BotStartedSpeakingFrame,
32
+ BotStoppedSpeakingFrame,
33
+ CancelFrame,
34
+ EndFrame,
35
+ ErrorFrame,
36
+ Frame,
37
+ InputAudioRawFrame,
38
+ InputImageRawFrame,
39
+ InputTextRawFrame,
40
+ InterruptionFrame,
41
+ LLMContextFrame,
42
+ LLMFullResponseEndFrame,
43
+ LLMFullResponseStartFrame,
44
+ LLMMessagesAppendFrame,
45
+ LLMSetToolsFrame,
46
+ LLMTextFrame,
47
+ LLMUpdateSettingsFrame,
48
+ StartFrame,
49
+ TranscriptionFrame,
50
+ TTSAudioRawFrame,
51
+ TTSStartedFrame,
52
+ TTSStoppedFrame,
53
+ TTSTextFrame,
54
+ UserImageRawFrame,
55
+ UserStartedSpeakingFrame,
56
+ UserStoppedSpeakingFrame,
57
+ )
58
+ from pipecat.metrics.metrics import LLMTokenUsage
59
+ from pipecat.processors.aggregators.llm_response import (
60
+ LLMAssistantAggregatorParams,
61
+ LLMUserAggregatorParams,
62
+ )
63
+ from pipecat.processors.aggregators.openai_llm_context import (
64
+ OpenAILLMContext,
65
+ OpenAILLMContextFrame,
66
+ )
67
+ from pipecat.processors.frame_processor import FrameDirection
68
+ from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame, LLMSearchResult
69
+ from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
70
+ from pipecat.services.openai.llm import (
71
+ OpenAIAssistantContextAggregator,
72
+ OpenAIUserContextAggregator,
73
+ )
74
+ from pipecat.transcriptions.language import Language
75
+ from pipecat.utils.string import match_endofsentence
76
+ from pipecat.utils.time import time_now_iso8601
77
+ from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt
78
+
79
+ from .file_api import GeminiFileAPI
80
+
81
+ try:
82
+ from google.genai import Client
83
+ from google.genai.live import AsyncSession
84
+ from google.genai.types import (
85
+ AudioTranscriptionConfig,
86
+ AutomaticActivityDetection,
87
+ Blob,
88
+ Content,
89
+ ContextWindowCompressionConfig,
90
+ EndSensitivity,
91
+ FileData,
92
+ FunctionResponse,
93
+ GenerationConfig,
94
+ GroundingMetadata,
95
+ HttpOptions,
96
+ LiveConnectConfig,
97
+ LiveServerMessage,
98
+ MediaResolution,
99
+ Modality,
100
+ Part,
101
+ ProactivityConfig,
102
+ RealtimeInputConfig,
103
+ SessionResumptionConfig,
104
+ SlidingWindow,
105
+ SpeechConfig,
106
+ StartSensitivity,
107
+ ThinkingConfig,
108
+ VoiceConfig,
109
+ )
110
+ except ModuleNotFoundError as e:
111
+ logger.error(f"Exception: {e}")
112
+ logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
113
+ raise Exception(f"Missing module: {e}")
114
+
115
+
116
+ # Connection management constants
117
+ MAX_CONSECUTIVE_FAILURES = 3
118
+ CONNECTION_ESTABLISHED_THRESHOLD = 10.0 # seconds
119
+
120
+
121
+ def language_to_gemini_language(language: Language) -> Optional[str]:
122
+ """Maps a Language enum value to a Gemini Live supported language code.
123
+
124
+ Source:
125
+ https://ai.google.dev/api/generate-content#MediaResolution
126
+
127
+ Args:
128
+ language: The language enum value to convert.
129
+
130
+ Returns:
131
+ The Gemini language code string, or None if the language is not supported.
132
+ """
133
+ language_map = {
134
+ # Arabic
135
+ Language.AR: "ar-XA",
136
+ # Bengali
137
+ Language.BN_IN: "bn-IN",
138
+ # Chinese (Mandarin)
139
+ Language.CMN: "cmn-CN",
140
+ Language.CMN_CN: "cmn-CN",
141
+ Language.ZH: "cmn-CN", # Map general Chinese to Mandarin for Gemini
142
+ Language.ZH_CN: "cmn-CN", # Map Simplified Chinese to Mandarin for Gemini
143
+ # German
144
+ Language.DE: "de-DE",
145
+ Language.DE_DE: "de-DE",
146
+ # English
147
+ Language.EN: "en-US", # Default to US English (though not explicitly listed in supported codes)
148
+ Language.EN_US: "en-US",
149
+ Language.EN_AU: "en-AU",
150
+ Language.EN_GB: "en-GB",
151
+ Language.EN_IN: "en-IN",
152
+ # Spanish
153
+ Language.ES: "es-ES", # Default to Spain Spanish
154
+ Language.ES_ES: "es-ES",
155
+ Language.ES_US: "es-US",
156
+ # French
157
+ Language.FR: "fr-FR", # Default to France French
158
+ Language.FR_FR: "fr-FR",
159
+ Language.FR_CA: "fr-CA",
160
+ # Gujarati
161
+ Language.GU: "gu-IN",
162
+ Language.GU_IN: "gu-IN",
163
+ # Hindi
164
+ Language.HI: "hi-IN",
165
+ Language.HI_IN: "hi-IN",
166
+ # Indonesian
167
+ Language.ID: "id-ID",
168
+ Language.ID_ID: "id-ID",
169
+ # Italian
170
+ Language.IT: "it-IT",
171
+ Language.IT_IT: "it-IT",
172
+ # Japanese
173
+ Language.JA: "ja-JP",
174
+ Language.JA_JP: "ja-JP",
175
+ # Kannada
176
+ Language.KN: "kn-IN",
177
+ Language.KN_IN: "kn-IN",
178
+ # Korean
179
+ Language.KO: "ko-KR",
180
+ Language.KO_KR: "ko-KR",
181
+ # Malayalam
182
+ Language.ML: "ml-IN",
183
+ Language.ML_IN: "ml-IN",
184
+ # Marathi
185
+ Language.MR: "mr-IN",
186
+ Language.MR_IN: "mr-IN",
187
+ # Dutch
188
+ Language.NL: "nl-NL",
189
+ Language.NL_NL: "nl-NL",
190
+ # Polish
191
+ Language.PL: "pl-PL",
192
+ Language.PL_PL: "pl-PL",
193
+ # Portuguese (Brazil)
194
+ Language.PT_BR: "pt-BR",
195
+ # Russian
196
+ Language.RU: "ru-RU",
197
+ Language.RU_RU: "ru-RU",
198
+ # Tamil
199
+ Language.TA: "ta-IN",
200
+ Language.TA_IN: "ta-IN",
201
+ # Telugu
202
+ Language.TE: "te-IN",
203
+ Language.TE_IN: "te-IN",
204
+ # Thai
205
+ Language.TH: "th-TH",
206
+ Language.TH_TH: "th-TH",
207
+ # Turkish
208
+ Language.TR: "tr-TR",
209
+ Language.TR_TR: "tr-TR",
210
+ # Vietnamese
211
+ Language.VI: "vi-VN",
212
+ Language.VI_VN: "vi-VN",
213
+ }
214
+ return language_map.get(language)
215
+
216
+
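A minimal usage sketch for the mapping above (the fallback to "en-US" is an illustration mirroring the service default further down, not something the function itself does):

    from pipecat.transcriptions.language import Language

    # Unsupported languages return None, so callers typically fall back to a default.
    code = language_to_gemini_language(Language.HI_IN) or "en-US"  # -> "hi-IN"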
217
+ class GeminiLiveContext(OpenAILLMContext):
218
+ """Extended OpenAI context for Gemini Live API.
219
+
220
+ Provides Gemini-specific context management including system instruction
221
+ extraction and message format conversion for the Live API.
222
+ """
223
+
224
+ @staticmethod
225
+ def upgrade(obj: OpenAILLMContext) -> "GeminiLiveContext":
226
+ """Upgrade an OpenAI context to Gemini context.
227
+
228
+ Args:
229
+ obj: The OpenAI context to upgrade.
230
+
231
+ Returns:
232
+ The upgraded Gemini context instance.
233
+ """
234
+ if isinstance(obj, OpenAILLMContext) and not isinstance(obj, GeminiLiveContext):
235
+ logger.debug(f"Upgrading to Gemini Live Context: {obj}")
236
+ obj.__class__ = GeminiLiveContext
237
+ obj._restructure_from_openai_messages()
238
+ return obj
239
+
240
+ def _restructure_from_openai_messages(self):
241
+ pass
242
+
243
+ def extract_system_instructions(self):
244
+ """Extract system instructions from context messages.
245
+
246
+ Returns:
247
+ Combined system instruction text from all system messages.
248
+ """
249
+ system_instruction = ""
250
+ for item in self.messages:
251
+ if item.get("role") == "system":
252
+ content = item.get("content", "")
253
+ if content:
254
+ if system_instruction and not system_instruction.endswith("\n"):
255
+ system_instruction += "\n"
256
+ system_instruction += str(content)
257
+ return system_instruction
258
+
259
+ def add_file_reference(self, file_uri: str, mime_type: str, text: Optional[str] = None):
260
+ """Add a file reference to the context.
261
+
262
+ This adds a user message with a file reference that will be sent during context initialization.
263
+
264
+ Args:
265
+ file_uri: URI of the uploaded file
266
+ mime_type: MIME type of the file
267
+ text: Optional text prompt to accompany the file
268
+ """
269
+ # Create parts list with file reference
270
+ parts = []
271
+ if text:
272
+ parts.append({"type": "text", "text": text})
273
+
274
+ # Add file reference part
275
+ parts.append(
276
+ {"type": "file_data", "file_data": {"mime_type": mime_type, "file_uri": file_uri}}
277
+ )
278
+
279
+ # Add to messages
280
+ message = {"role": "user", "content": parts}
281
+ self.messages.append(message)
282
+ logger.info(f"Added file reference to context: {file_uri}")
283
+
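A rough sketch of seeding the context with an uploaded file before the session starts; the file URI and MIME type below are placeholders, not values produced by this diff:

    context = GeminiLiveContext(
        messages=[{"role": "system", "content": "You describe documents."}]
    )
    context.add_file_reference(
        file_uri="https://generativelanguage.googleapis.com/v1beta/files/your-file-id",  # placeholder
        mime_type="application/pdf",
        text="Summarize this document for the caller.",
    )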
284
+ def get_messages_for_initializing_history(self) -> List[Content]:
285
+ """Get messages formatted for Gemini history initialization.
286
+
287
+ Returns:
288
+ List of messages in Gemini format for conversation history.
289
+ """
290
+ messages: List[Content] = []
291
+ for item in self.messages:
292
+ role = item.get("role")
293
+
294
+ if role == "system":
295
+ continue
296
+
297
+ elif role == "assistant":
298
+ role = "model"
299
+
300
+ content = item.get("content")
301
+ parts: List[Part] = []
302
+ if isinstance(content, str):
303
+ parts = [Part(text=content)]
304
+ elif isinstance(content, list):
305
+ for part in content:
306
+ if part.get("type") == "text":
307
+ parts.append(Part(text=part.get("text")))
308
+ elif part.get("type") == "file_data":
309
+ file_data = part.get("file_data", {})
310
+ parts.append(
311
+ Part(
312
+ file_data=FileData(
313
+ mime_type=file_data.get("mime_type"),
314
+ file_uri=file_data.get("file_uri"),
315
+ )
316
+ )
317
+ )
318
+ else:
319
+ logger.warning(f"Unsupported content type: {str(part)[:80]}")
320
+ else:
321
+ logger.warning(f"Unsupported content type: {str(content)[:80]}")
322
+ messages.append(Content(role=role, parts=parts))
323
+ return messages
324
+
325
+
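For reference, a small sketch of the conversion above: system entries are skipped (they are folded into the system instruction at connect time), the "assistant" role becomes "model", and string content becomes a single text Part:

    ctx = GeminiLiveContext.upgrade(
        OpenAILLMContext(messages=[
            {"role": "system", "content": "Be brief."},
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi there!"},
        ])
    )
    history = ctx.get_messages_for_initializing_history()
    # history[0] == Content(role="user", parts=[Part(text="Hello")])
    # history[1] == Content(role="model", parts=[Part(text="Hi there!")])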
326
+ class GeminiLiveUserContextAggregator(OpenAIUserContextAggregator):
327
+ """User context aggregator for Gemini Live.
328
+
329
+ Extends OpenAI user aggregator to handle Gemini-specific message passing
330
+ while maintaining compatibility with the standard aggregation pipeline.
331
+ """
332
+
333
+ async def process_frame(self, frame, direction):
334
+ """Process incoming frames for user context aggregation.
335
+
336
+ Args:
337
+ frame: The frame to process.
338
+ direction: The frame processing direction.
339
+ """
340
+ await super().process_frame(frame, direction)
341
+ # kind of a hack just to pass the LLMMessagesAppendFrame through, but it's fine for now
342
+ if isinstance(frame, LLMMessagesAppendFrame):
343
+ await self.push_frame(frame, direction)
344
+
345
+
346
+ class GeminiLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
347
+ """Assistant context aggregator for Gemini Live.
348
+
349
+ Handles assistant response aggregation while filtering out LLMTextFrames
350
+ to prevent duplicate context entries, as Gemini Live pushes both
351
+ LLMTextFrames and TTSTextFrames.
352
+ """
353
+
354
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
355
+ """Process incoming frames for assistant context aggregation.
356
+
357
+ Args:
358
+ frame: The frame to process.
359
+ direction: The frame processing direction.
360
+ """
361
+ # The LLMAssistantContextAggregator uses TextFrames to aggregate the LLM output,
+ # but Gemini Live pushes both LLMTextFrames and TTSTextFrames. We need to
+ # override process_frame for LLMTextFrame so that only the TTSTextFrames are
+ # processed. This ensures that the context gets only one set of messages.
365
+ if not isinstance(frame, LLMTextFrame):
366
+ await super().process_frame(frame, direction)
367
+
368
+ async def handle_user_image_frame(self, frame: UserImageRawFrame):
369
+ """Handle user image frames.
370
+
371
+ Args:
372
+ frame: The user image frame to handle.
373
+ """
374
+ # We don't want to store any images in the context. Revisit this later
375
+ # when the API evolves.
376
+ pass
377
+
378
+
379
+ @dataclass
380
+ class GeminiLiveContextAggregatorPair:
381
+ """Pair of user and assistant context aggregators for Gemini Live.
382
+
383
+ Parameters:
384
+ _user: The user context aggregator instance.
385
+ _assistant: The assistant context aggregator instance.
386
+ """
387
+
388
+ _user: GeminiLiveUserContextAggregator
389
+ _assistant: GeminiLiveAssistantContextAggregator
390
+
391
+ def user(self) -> GeminiLiveUserContextAggregator:
392
+ """Get the user context aggregator.
393
+
394
+ Returns:
395
+ The user context aggregator instance.
396
+ """
397
+ return self._user
398
+
399
+ def assistant(self) -> GeminiLiveAssistantContextAggregator:
400
+ """Get the assistant context aggregator.
401
+
402
+ Returns:
403
+ The assistant context aggregator instance.
404
+ """
405
+ return self._assistant
406
+
407
+
408
+ class GeminiModalities(Enum):
409
+ """Supported modalities for Gemini Live.
410
+
411
+ Parameters:
412
+ TEXT: Text responses.
413
+ AUDIO: Audio responses.
414
+ """
415
+
416
+ TEXT = "TEXT"
417
+ AUDIO = "AUDIO"
418
+
419
+
420
+ class GeminiMediaResolution(str, Enum):
421
+ """Media resolution options for Gemini Live.
422
+
423
+ Parameters:
424
+ UNSPECIFIED: Use default resolution setting.
425
+ LOW: Low resolution with 64 tokens.
426
+ MEDIUM: Medium resolution with 256 tokens.
427
+ HIGH: High resolution with zoomed reframing and 256 tokens.
428
+ """
429
+
430
+ UNSPECIFIED = "MEDIA_RESOLUTION_UNSPECIFIED" # Use default
431
+ LOW = "MEDIA_RESOLUTION_LOW" # 64 tokens
432
+ MEDIUM = "MEDIA_RESOLUTION_MEDIUM" # 256 tokens
433
+ HIGH = "MEDIA_RESOLUTION_HIGH" # Zoomed reframing with 256 tokens
434
+
435
+
436
+ class GeminiVADParams(BaseModel):
437
+ """Voice Activity Detection parameters for Gemini Live.
438
+
439
+ Parameters:
440
+ disabled: Whether to disable VAD. Defaults to None.
441
+ start_sensitivity: Sensitivity for speech start detection. Defaults to None.
442
+ end_sensitivity: Sensitivity for speech end detection. Defaults to None.
443
+ prefix_padding_ms: Prefix padding in milliseconds. Defaults to None.
444
+ silence_duration_ms: Silence duration threshold in milliseconds. Defaults to None.
445
+ """
446
+
447
+ disabled: Optional[bool] = Field(default=None)
448
+ start_sensitivity: Optional[StartSensitivity] = Field(default=None)
449
+ end_sensitivity: Optional[EndSensitivity] = Field(default=None)
450
+ prefix_padding_ms: Optional[int] = Field(default=None)
451
+ silence_duration_ms: Optional[int] = Field(default=None)
452
+
453
+
454
+ class ContextWindowCompressionParams(BaseModel):
455
+ """Parameters for context window compression in Gemini Live.
456
+
457
+ Parameters:
458
+ enabled: Whether compression is enabled. Defaults to False.
459
+ trigger_tokens: Token count to trigger compression. None uses 80% of context window.
460
+ """
461
+
462
+ enabled: bool = Field(default=False)
463
+ trigger_tokens: Optional[int] = Field(
464
+ default=None
465
+ ) # None = use default (80% of context window)
466
+
467
+
468
+ class InputParams(BaseModel):
469
+ """Input parameters for Gemini Live generation.
470
+
471
+ Parameters:
472
+ frequency_penalty: Frequency penalty for generation (0.0-2.0). Defaults to None.
473
+ max_tokens: Maximum tokens to generate. Must be >= 1. Defaults to 4096.
474
+ presence_penalty: Presence penalty for generation (0.0-2.0). Defaults to None.
475
+ temperature: Sampling temperature (0.0-2.0). Defaults to None.
476
+ top_k: Top-k sampling parameter. Must be >= 0. Defaults to None.
477
+ top_p: Top-p sampling parameter (0.0-1.0). Defaults to None.
478
+ modalities: Response modalities. Defaults to AUDIO.
479
+ language: Language for generation. Defaults to EN_US.
480
+ media_resolution: Media resolution setting. Defaults to UNSPECIFIED.
481
+ vad: Voice activity detection parameters. Defaults to None.
482
+ context_window_compression: Context compression settings. Defaults to None.
483
+ thinking: Thinking settings. Defaults to None.
484
+ Note that these settings may require specifying a model that
485
+ supports them, e.g. "gemini-2.5-flash-native-audio-preview-09-2025".
486
+ enable_affective_dialog: Enable affective dialog, which allows Gemini
487
+ to adapt to expression and tone. Defaults to None.
488
+ Note that these settings may require specifying a model that
489
+ supports them, e.g. "gemini-2.5-flash-native-audio-preview-09-2025".
490
+ Also note that this setting may require specifying an API version that
491
+ supports it, e.g. HttpOptions(api_version="v1alpha").
492
+ proactivity: Proactivity settings, which allows Gemini to proactively
493
+ decide how to behave, such as whether to avoid responding to
494
+ content that is not relevant. Defaults to None.
495
+ Note that these settings may require specifying a model that
496
+ supports them, e.g. "gemini-2.5-flash-native-audio-preview-09-2025".
497
+ Also note that this setting may require specifying an API version that
498
+ supports it, e.g. HttpOptions(api_version="v1alpha").
499
+ extra: Additional parameters. Defaults to empty dict.
500
+ """
501
+
502
+ frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
503
+ max_tokens: Optional[int] = Field(default=4096, ge=1)
504
+ presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
505
+ temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
506
+ top_k: Optional[int] = Field(default=None, ge=0)
507
+ top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
508
+ modalities: Optional[GeminiModalities] = Field(default=GeminiModalities.AUDIO)
509
+ language: Optional[Language] = Field(default=Language.EN_US)
510
+ media_resolution: Optional[GeminiMediaResolution] = Field(
511
+ default=GeminiMediaResolution.UNSPECIFIED
512
+ )
513
+ vad: Optional[GeminiVADParams] = Field(default=None)
514
+ context_window_compression: Optional[ContextWindowCompressionParams] = Field(default=None)
515
+ thinking: Optional[ThinkingConfig] = Field(default=None)
516
+ enable_affective_dialog: Optional[bool] = Field(default=None)
517
+ proactivity: Optional[ProactivityConfig] = Field(default=None)
518
+ extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
519
+
520
+
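A hedged configuration sketch combining the classes above; the StartSensitivity/EndSensitivity members come from google.genai.types and their exact names may vary by SDK version:

    params = InputParams(
        temperature=0.7,
        modalities=GeminiModalities.AUDIO,
        language=Language.EN_IN,
        vad=GeminiVADParams(
            start_sensitivity=StartSensitivity.START_SENSITIVITY_HIGH,
            end_sensitivity=EndSensitivity.END_SENSITIVITY_LOW,
            silence_duration_ms=500,
        ),
        context_window_compression=ContextWindowCompressionParams(enabled=True),
    )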
521
+ class GeminiLiveLLMService(LLMService):
522
+ """Provides access to Google's Gemini Live API.
523
+
524
+ This service enables real-time conversations with Gemini, supporting both
525
+ text and audio modalities. It handles voice transcription, streaming audio
526
+ responses, and tool usage.
527
+ """
528
+
529
+ # Overriding the default adapter to use the Gemini one.
530
+ adapter_class = GeminiLLMAdapter
531
+
532
+ def __init__(
533
+ self,
534
+ *,
535
+ api_key: str,
536
+ base_url: Optional[str] = None,
537
+ model="models/gemini-2.0-flash-live-001",
538
+ voice_id: str = "Charon",
539
+ start_audio_paused: bool = False,
540
+ start_video_paused: bool = False,
541
+ system_instruction: Optional[str] = None,
542
+ tools: Optional[Union[List[dict], ToolsSchema]] = None,
543
+ params: Optional[InputParams] = None,
544
+ inference_on_context_initialization: bool = True,
545
+ file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
546
+ http_options: Optional[HttpOptions] = None,
547
+ **kwargs,
548
+ ):
549
+ """Initialize the Gemini Live LLM service.
550
+
551
+ Args:
552
+ api_key: Google AI API key for authentication.
553
+ base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
554
+
555
+ .. deprecated:: 0.0.90
556
+ This parameter is deprecated and no longer has any effect.
557
+ Please use `http_options` to customize requests made by the
558
+ API client.
559
+
560
+ model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
561
+ voice_id: TTS voice identifier. Defaults to "Charon".
562
+ start_audio_paused: Whether to start with audio input paused. Defaults to False.
563
+ start_video_paused: Whether to start with video input paused. Defaults to False.
564
+ system_instruction: System prompt for the model. Defaults to None.
565
+ tools: Tools/functions available to the model. Defaults to None.
566
+ params: Configuration parameters for the model. Defaults to InputParams().
567
+ inference_on_context_initialization: Whether to generate a response when context
568
+ is first set. Defaults to True.
569
+ file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
570
+ http_options: HTTP options for the client.
571
+ **kwargs: Additional arguments passed to parent LLMService.
572
+ """
573
+ # Check for deprecated parameter usage
574
+ if base_url is not None:
575
+ import warnings
576
+
577
+ with warnings.catch_warnings():
578
+ warnings.simplefilter("always")
579
+ warnings.warn(
580
+ "Parameter 'base_url' is deprecated and no longer has any effect. Please use 'http_options' to customize requests made by the API client.",
581
+ DeprecationWarning,
582
+ stacklevel=2,
583
+ )
584
+
585
+ super().__init__(base_url=base_url, **kwargs)
586
+
587
+ params = params or InputParams()
588
+
589
+ self._last_sent_time = 0
590
+ self._base_url = base_url
591
+ self.set_model_name(model)
592
+ self._voice_id = voice_id
593
+ self._language_code = params.language
594
+
595
+ self._system_instruction = system_instruction
596
+ self._tools = tools
597
+ self._inference_on_context_initialization = inference_on_context_initialization
598
+ self._needs_turn_complete_message = False
599
+
600
+ self._audio_input_paused = start_audio_paused
601
+ self._video_input_paused = start_video_paused
602
+ self._context = None
603
+ self._api_key = api_key
604
+ self._http_options = http_options
605
+ self._session: AsyncSession = None
606
+ self._connection_task = None
607
+
608
+ self._disconnecting = False
609
+ self._run_llm_when_session_ready = False
610
+
611
+ self._user_is_speaking = False
612
+ self._bot_is_speaking = False
613
+ self._user_audio_buffer = bytearray()
614
+ self._user_transcription_buffer = ""
615
+ self._last_transcription_sent = ""
616
+ self._bot_audio_buffer = bytearray()
617
+ self._bot_text_buffer = ""
618
+ self._llm_output_buffer = ""
619
+
620
+ self._sample_rate = 24000
621
+
622
+ self._language = params.language
623
+ self._language_code = (
624
+ language_to_gemini_language(params.language) if params.language else "en-US"
625
+ )
626
+ self._vad_params = params.vad
627
+
628
+ # Reconnection tracking
629
+ self._consecutive_failures = 0
630
+ self._connection_start_time = None
631
+
632
+ self._settings = {
633
+ "frequency_penalty": params.frequency_penalty,
634
+ "max_tokens": params.max_tokens,
635
+ "presence_penalty": params.presence_penalty,
636
+ "temperature": params.temperature,
637
+ "top_k": params.top_k,
638
+ "top_p": params.top_p,
639
+ "modalities": params.modalities,
640
+ "language": self._language_code,
641
+ "media_resolution": params.media_resolution,
642
+ "vad": params.vad,
643
+ "context_window_compression": params.context_window_compression.model_dump()
644
+ if params.context_window_compression
645
+ else {},
646
+ "thinking": params.thinking or {},
647
+ "enable_affective_dialog": params.enable_affective_dialog or False,
648
+ "proactivity": params.proactivity or {},
649
+ "extra": params.extra if isinstance(params.extra, dict) else {},
650
+ }
651
+
652
+ self._file_api_base_url = file_api_base_url
653
+ self._file_api: Optional[GeminiFileAPI] = None
654
+
655
+ # Grounding metadata tracking
656
+ self._search_result_buffer = ""
657
+ self._accumulated_grounding_metadata = None
658
+
659
+ # Session resumption
660
+ self._session_resumption_handle: Optional[str] = None
661
+
662
+ # Bookkeeping for ending gracefully (i.e. after the bot is finished)
663
+ self._end_frame_pending_bot_turn_finished: Optional[EndFrame] = None
664
+
665
+ # Initialize the API client. Subclasses can override this if needed.
666
+ self.create_client()
667
+
668
+ def create_client(self):
669
+ """Create the Gemini API client instance. Subclasses can override this."""
670
+ self._client = Client(api_key=self._api_key, http_options=self._http_options)
671
+
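A minimal instantiation sketch (the environment variable name is an assumption; model and voice fall back to the defaults in the constructor signature above):

    import os

    llm = GeminiLiveLLMService(
        api_key=os.getenv("GOOGLE_API_KEY", ""),
        voice_id="Charon",
        system_instruction="You are a concise voice assistant.",
        params=InputParams(language=Language.EN_US),
    )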
672
+ @property
673
+ def file_api(self) -> GeminiFileAPI:
674
+ """Get the Gemini File API client instance. Subclasses can override this.
675
+
676
+ Returns:
677
+ The Gemini File API client.
678
+ """
679
+ if not self._file_api:
680
+ self._file_api = GeminiFileAPI(api_key=self._api_key, base_url=self._file_api_base_url)
681
+ return self._file_api
682
+
683
+ def can_generate_metrics(self) -> bool:
684
+ """Check if the service can generate usage metrics.
685
+
686
+ Returns:
687
+ True as Gemini Live supports token usage metrics.
688
+ """
689
+ return True
690
+
691
+ def needs_mcp_alternate_schema(self) -> bool:
692
+ """Check if this LLM service requires alternate MCP schema.
693
+
694
+ Google/Gemini has stricter JSON schema validation and requires
695
+ certain properties to be removed or modified for compatibility.
696
+
697
+ Returns:
698
+ True for Google/Gemini services.
699
+ """
700
+ return True
701
+
702
+ def set_audio_input_paused(self, paused: bool):
703
+ """Set the audio input pause state.
704
+
705
+ Args:
706
+ paused: Whether to pause audio input.
707
+ """
708
+ self._audio_input_paused = paused
709
+
710
+ def set_video_input_paused(self, paused: bool):
711
+ """Set the video input pause state.
712
+
713
+ Args:
714
+ paused: Whether to pause video input.
715
+ """
716
+ self._video_input_paused = paused
717
+
718
+ def set_model_modalities(self, modalities: GeminiModalities):
719
+ """Set the model response modalities.
720
+
721
+ Args:
722
+ modalities: The modalities to use for responses.
723
+ """
724
+ self._settings["modalities"] = modalities
725
+
726
+ def set_language(self, language: Language):
727
+ """Set the language for generation.
728
+
729
+ Args:
730
+ language: The language to use for generation.
731
+ """
732
+ self._language = language
733
+ self._language_code = language_to_gemini_language(language) or "en-US"
734
+ self._settings["language"] = self._language_code
735
+ logger.info(f"Set Gemini language to: {self._language_code}")
736
+
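For example, adjusting settings before a session is established (a sketch; whether changes apply to an already-open session depends on reconnect behavior not shown in this excerpt):

    llm.set_model_modalities(GeminiModalities.TEXT)
    llm.set_language(Language.FR_FR)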
737
+ async def set_context(self, context: OpenAILLMContext):
738
+ """Set the context explicitly from outside the pipeline.
739
+
740
+ This is useful when initializing a conversation because in server-side VAD mode we might not have a
741
+ way to trigger the pipeline. This sends the history to the server. The `inference_on_context_initialization`
742
+ flag controls whether to set the turnComplete flag when we do this. Without that flag, the model will
743
+ not respond. This is often what we want when setting the context at the beginning of a conversation.
744
+
745
+ Args:
746
+ context: The OpenAI LLM context to set.
747
+ """
748
+ if self._context:
749
+ logger.error("Context already set. Can only set up Gemini Live context once.")
750
+ return
751
+ self._context = GeminiLiveContext.upgrade(context)
752
+ await self._create_initial_response()
753
+
754
+ #
755
+ # standard AIService frame handling
756
+ #
757
+
758
+ async def start(self, frame: StartFrame):
759
+ """Start the service and establish connection.
760
+
761
+ Args:
762
+ frame: The start frame.
763
+ """
764
+ await super().start(frame)
765
+ await self._connect()
766
+
767
+ async def stop(self, frame: EndFrame):
768
+ """Stop the service and close connections.
769
+
770
+ Args:
771
+ frame: The end frame.
772
+ """
773
+ await super().stop(frame)
774
+ await self._disconnect()
775
+
776
+ async def cancel(self, frame: CancelFrame):
777
+ """Cancel the service and close connections.
778
+
779
+ Args:
780
+ frame: The cancel frame.
781
+ """
782
+ await super().cancel(frame)
783
+ await self._disconnect()
784
+
785
+ #
786
+ # speech and interruption handling
787
+ #
788
+
789
+ async def _handle_interruption(self):
790
+ await self._set_bot_is_speaking(False)
791
+ await self.push_frame(TTSStoppedFrame())
792
+ await self.push_frame(LLMFullResponseEndFrame())
793
+
794
+ async def _handle_user_started_speaking(self, frame):
795
+ self._user_is_speaking = True
796
+ pass
797
+
798
+ async def _handle_user_stopped_speaking(self, frame):
799
+ self._user_is_speaking = False
800
+ self._user_audio_buffer = bytearray()
801
+ await self.start_ttfb_metrics()
802
+ if self._needs_turn_complete_message:
803
+ self._needs_turn_complete_message = False
804
+ # NOTE: without this, the model ignores the context it's been
805
+ # seeded with before the user started speaking
806
+ await self._session.send_client_content(turn_complete=True)
807
+
808
+ #
809
+ # frame processing
810
+ #
811
+ # StartFrame, StopFrame, CancelFrame implemented in base class
812
+ #
813
+
814
+ async def process_frame(self, frame: Frame, direction: FrameDirection):
815
+ """Process incoming frames for the Gemini Live service.
816
+
817
+ Args:
818
+ frame: The frame to process.
819
+ direction: The frame processing direction.
820
+ """
821
+ # Defer EndFrame handling until after the bot turn is finished
822
+ if isinstance(frame, EndFrame):
823
+ if self._bot_is_speaking:
824
+ logger.debug("Deferring handling EndFrame until bot turn is finished")
825
+ self._end_frame_pending_bot_turn_finished = frame
826
+ return
827
+
828
+ await super().process_frame(frame, direction)
829
+
830
+ if isinstance(frame, TranscriptionFrame):
831
+ await self.push_frame(frame, direction)
832
+ elif isinstance(frame, OpenAILLMContextFrame):
833
+ context: GeminiLiveContext = GeminiLiveContext.upgrade(frame.context)
834
+ # For now, we'll only trigger inference here when either:
835
+ # 1. We have not seen a context frame before
836
+ # 2. The last message is a tool call result
837
+ if not self._context:
838
+ self._context = context
839
+ if frame.context.tools:
840
+ self._tools = frame.context.tools
841
+ await self._create_initial_response()
842
+ elif context.messages and context.messages[-1].get("role") == "tool":
843
+ # Support just one tool call per context frame for now
844
+ tool_result_message = context.messages[-1]
845
+ await self._tool_result(tool_result_message)
846
+ elif isinstance(frame, LLMContextFrame):
847
+ raise NotImplementedError("Universal LLMContext is not yet supported for Gemini Live.")
848
+ elif isinstance(frame, InputTextRawFrame):
849
+ await self._send_user_text(frame.text)
850
+ await self.push_frame(frame, direction)
851
+ elif isinstance(frame, InputAudioRawFrame):
852
+ await self._send_user_audio(frame)
853
+ await self.push_frame(frame, direction)
854
+ elif isinstance(frame, InputImageRawFrame):
855
+ await self._send_user_video(frame)
856
+ await self.push_frame(frame, direction)
857
+ elif isinstance(frame, InterruptionFrame):
858
+ await self._handle_interruption()
859
+ await self.push_frame(frame, direction)
860
+ elif isinstance(frame, UserStartedSpeakingFrame):
861
+ await self._handle_user_started_speaking(frame)
862
+ await self.push_frame(frame, direction)
863
+ elif isinstance(frame, UserStoppedSpeakingFrame):
864
+ await self._handle_user_stopped_speaking(frame)
865
+ await self.push_frame(frame, direction)
866
+ elif isinstance(frame, BotStartedSpeakingFrame):
867
+ # Ignore this frame. Use the serverContent API message instead
868
+ await self.push_frame(frame, direction)
869
+ elif isinstance(frame, BotStoppedSpeakingFrame):
870
+ # Ignore this frame. Use the serverContent.turnComplete API message instead
871
+ await self.push_frame(frame, direction)
872
+ elif isinstance(frame, LLMMessagesAppendFrame):
873
+ # NOTE: handling LLMMessagesAppendFrame here in the LLMService is
874
+ # unusual - typically this would be handled in the user context
875
+ # aggregator. Leaving this handling here so that user code that
876
+ # uses this frame *without* a user context aggregator still works
877
+ # (we have an example that does just that, actually).
878
+ await self._create_single_response(frame.messages)
879
+ elif isinstance(frame, LLMUpdateSettingsFrame):
880
+ await self._update_settings(frame.settings)
881
+ elif isinstance(frame, LLMSetToolsFrame):
882
+ await self._update_settings()
883
+ else:
884
+ await self.push_frame(frame, direction)
885
+
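As noted in the LLMMessagesAppendFrame branch above, the frame can be queued straight to the pipeline even without a user context aggregator; a sketch, assuming a running PipelineTask named `task`:

    from pipecat.frames.frames import LLMMessagesAppendFrame

    await task.queue_frames([
        LLMMessagesAppendFrame(
            messages=[{"role": "user", "content": "Please summarize the conversation so far."}]
        )
    ])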
886
+ async def _set_bot_is_speaking(self, speaking: bool):
887
+ if self._bot_is_speaking == speaking:
888
+ return
889
+
890
+ self._bot_is_speaking = speaking
891
+
892
+ if not self._bot_is_speaking and self._end_frame_pending_bot_turn_finished:
893
+ await self.queue_frame(self._end_frame_pending_bot_turn_finished)
894
+ self._end_frame_pending_bot_turn_finished = None
895
+
896
+ async def _connect(self, session_resumption_handle: Optional[str] = None):
897
+ """Establish client connection to Gemini Live API."""
898
+ if self._session:
899
+ # Here we assume that if we have a session, we are connected. We
900
+ # handle disconnections in the send/recv code paths.
901
+ return
902
+
903
+ if session_resumption_handle:
904
+ logger.info(
905
+ f"Connecting to Gemini service with session_resumption_handle: {session_resumption_handle}"
906
+ )
907
+ else:
908
+ logger.info("Connecting to Gemini service")
909
+ try:
910
+ # Assemble basic configuration
911
+ config = LiveConnectConfig(
912
+ generation_config=GenerationConfig(
913
+ frequency_penalty=self._settings["frequency_penalty"],
914
+ max_output_tokens=self._settings["max_tokens"],
915
+ presence_penalty=self._settings["presence_penalty"],
916
+ temperature=self._settings["temperature"],
917
+ top_k=self._settings["top_k"],
918
+ top_p=self._settings["top_p"],
919
+ response_modalities=[Modality(self._settings["modalities"].value)],
920
+ speech_config=SpeechConfig(
921
+ voice_config=VoiceConfig(
922
+ prebuilt_voice_config={"voice_name": self._voice_id}
923
+ ),
924
+ language_code=self._settings["language"],
925
+ ),
926
+ media_resolution=MediaResolution(self._settings["media_resolution"].value),
927
+ ),
928
+ input_audio_transcription=AudioTranscriptionConfig(),
929
+ output_audio_transcription=AudioTranscriptionConfig(),
930
+ session_resumption=SessionResumptionConfig(handle=session_resumption_handle),
931
+ )
932
+
933
+ # Add context window compression to configuration, if enabled
934
+ if self._settings.get("context_window_compression", {}).get("enabled", False):
935
+ compression_config = ContextWindowCompressionConfig()
936
+
937
+ # Add sliding window (always true if compression is enabled)
938
+ compression_config.sliding_window = SlidingWindow()
939
+
940
+ # Add trigger_tokens if specified
941
+ trigger_tokens = self._settings.get("context_window_compression", {}).get(
942
+ "trigger_tokens"
943
+ )
944
+ if trigger_tokens is not None:
945
+ compression_config.trigger_tokens = trigger_tokens
946
+
947
+ config.context_window_compression = compression_config
948
+
949
+ # Add thinking configuration to configuration, if provided
950
+ if self._settings.get("thinking"):
951
+ config.thinking_config = self._settings["thinking"]
952
+
953
+ # Add affective dialog setting, if provided
954
+ if self._settings.get("enable_affective_dialog", False):
955
+ config.enable_affective_dialog = self._settings["enable_affective_dialog"]
956
+
957
+ # Add proactivity configuration to configuration, if provided
958
+ if self._settings.get("proactivity"):
959
+ config.proactivity = self._settings["proactivity"]
960
+
961
+ # Add VAD configuration to configuration, if provided
962
+ if self._settings.get("vad"):
963
+ vad_config = AutomaticActivityDetection()
964
+ vad_params = self._settings["vad"]
965
+ has_vad_settings = False
966
+
967
+ # Only add parameters that are explicitly set
968
+ if vad_params.disabled is not None:
969
+ vad_config.disabled = vad_params.disabled
970
+ has_vad_settings = True
971
+
972
+ if vad_params.start_sensitivity:
973
+ vad_config.start_of_speech_sensitivity = vad_params.start_sensitivity
974
+ has_vad_settings = True
975
+
976
+ if vad_params.end_sensitivity:
977
+ vad_config.end_of_speech_sensitivity = vad_params.end_sensitivity
978
+ has_vad_settings = True
979
+
980
+ if vad_params.prefix_padding_ms is not None:
981
+ vad_config.prefix_padding_ms = vad_params.prefix_padding_ms
982
+ has_vad_settings = True
983
+
984
+ if vad_params.silence_duration_ms is not None:
985
+ vad_config.silence_duration_ms = vad_params.silence_duration_ms
986
+ has_vad_settings = True
987
+
988
+ # Only add automatic_activity_detection if we have VAD settings
989
+ if has_vad_settings:
990
+ config.realtime_input_config = RealtimeInputConfig(
991
+ automatic_activity_detection=vad_config
992
+ )
993
+
994
+ # Add system instruction to configuration, if provided
995
+ system_instruction = self._system_instruction or ""
996
+ if self._context and hasattr(self._context, "extract_system_instructions"):
997
+ system_instruction += "\n" + self._context.extract_system_instructions()
998
+ if system_instruction:
999
+ logger.debug(f"Setting system instruction: {system_instruction}")
1000
+ config.system_instruction = system_instruction
1001
+
1002
+ # Add tools to configuration, if provided
1003
+ if self._tools:
1004
+ logger.debug(f"Setting tools: {self._tools}")
1005
+ config.tools = self.get_llm_adapter().from_standard_tools(self._tools)
1006
+
1007
+ # Start the connection
1008
+ self._connection_task = self.create_task(self._connection_task_handler(config=config))
1009
+
1010
+ except Exception as e:
1011
+ await self.push_error(ErrorFrame(error=f"{self} Initialization error: {e}", fatal=True))
1012
+
1013
+ async def _connection_task_handler(self, config: LiveConnectConfig):
1014
+ async with self._client.aio.live.connect(model=self._model_name, config=config) as session:
1015
+ logger.info("Connected to Gemini service")
1016
+
1017
+ # Mark connection start time
1018
+ self._connection_start_time = time.time()
1019
+
1020
+ await self._handle_session_ready(session)
1021
+
1022
+ while True:
1023
+ try:
1024
+ turn = self._session.receive()
1025
+ async for message in turn:
1026
+ # Reset failure counter if connection has been stable
1027
+ self._check_and_reset_failure_counter()
1028
+
1029
+ if message.server_content and message.server_content.model_turn:
1030
+ await self._handle_msg_model_turn(message)
1031
+ elif (
1032
+ message.server_content
1033
+ and message.server_content.turn_complete
1034
+ and message.usage_metadata
1035
+ ):
1036
+ await self._handle_msg_turn_complete(message)
1037
+ await self._handle_msg_usage_metadata(message)
1038
+ elif message.server_content and message.server_content.input_transcription:
1039
+ await self._handle_msg_input_transcription(message)
1040
+ elif message.server_content and message.server_content.output_transcription:
1041
+ await self._handle_msg_output_transcription(message)
1042
+ elif message.server_content and message.server_content.grounding_metadata:
1043
+ await self._handle_msg_grounding_metadata(message)
1044
+ elif message.tool_call:
1045
+ await self._handle_msg_tool_call(message)
1046
+ elif message.session_resumption_update:
1047
+ self._handle_msg_resumption_update(message)
1048
+ except Exception as e:
1049
+ if not self._disconnecting:
1050
+ should_reconnect = await self._handle_connection_error(e)
1051
+ if should_reconnect:
1052
+ await self._reconnect()
1053
+ return # Exit this connection handler, _reconnect will start a new one
1054
+ break
1055
+
1056
+ def _check_and_reset_failure_counter(self):
1057
+ """Check if connection has been stable long enough to reset the failure counter.
1058
+
1059
+ If the connection has been active for longer than the established threshold
1060
+ and there are accumulated failures, reset the counter to 0.
1061
+ """
1062
+ if (
1063
+ self._connection_start_time
1064
+ and self._consecutive_failures > 0
1065
+ and time.time() - self._connection_start_time >= CONNECTION_ESTABLISHED_THRESHOLD
1066
+ ):
1067
+ logger.info(
1068
+ f"Connection stable for {CONNECTION_ESTABLISHED_THRESHOLD}s, "
1069
+ f"resetting failure counter from {self._consecutive_failures} to 0"
1070
+ )
1071
+ self._consecutive_failures = 0
1072
+
1073
+ async def _handle_connection_error(self, error: Exception) -> bool:
1074
+ """Handle a connection error and determine if reconnection should be attempted.
1075
+
1076
+ Args:
1077
+ error: The exception that caused the connection error.
1078
+
1079
+ Returns:
1080
+ True if reconnection should be attempted, False if a fatal error should be pushed.
1081
+ """
1082
+ self._consecutive_failures += 1
1083
+ logger.warning(
1084
+ f"Connection error (failure {self._consecutive_failures}/{MAX_CONSECUTIVE_FAILURES}): {error}"
1085
+ )
1086
+
1087
+ if self._consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
1088
+ logger.error(
1089
+ f"Max consecutive failures ({MAX_CONSECUTIVE_FAILURES}) reached, "
1090
+ "treating as fatal error"
1091
+ )
1092
+ await self.push_error(
1093
+ ErrorFrame(error=f"{self} Error in receive loop: {error}", fatal=True)
1094
+ )
1095
+ return False
1096
+ else:
1097
+ logger.info(
1098
+ f"Attempting reconnection ({self._consecutive_failures}/{MAX_CONSECUTIVE_FAILURES})"
1099
+ )
1100
+ return True
1101
+
1102
+ async def _reconnect(self):
1103
+ """Reconnect to Gemini Live API."""
1104
+ await self._disconnect()
1105
+ await self._connect(session_resumption_handle=self._session_resumption_handle)
1106
+
1107
+ async def _disconnect(self):
1108
+ """Disconnect from Gemini Live API and clean up resources."""
1109
+ logger.info("Disconnecting from Gemini service")
1110
+ try:
1111
+ self._disconnecting = True
1112
+ await self.stop_all_metrics()
1113
+ if self._connection_task:
1114
+ await self.cancel_task(self._connection_task, timeout=1.0)
1115
+ self._connection_task = None
1116
+ if self._session:
1117
+ await self._session.close()
1118
+ self._session = None
1119
+ self._disconnecting = False
1120
+ except Exception as e:
1121
+ logger.error(f"{self} error disconnecting: {e}")
1122
+
1123
+ async def _send_user_audio(self, frame):
1124
+ """Send user audio frame to Gemini Live API."""
1125
+ if self._audio_input_paused or self._disconnecting or not self._session:
1126
+ return
1127
+
1128
+ # Send all audio to Gemini
1129
+ try:
1130
+ await self._session.send_realtime_input(
1131
+ audio=Blob(data=frame.audio, mime_type=f"audio/pcm;rate={frame.sample_rate}")
1132
+ )
1133
+ except Exception as e:
1134
+ await self._handle_send_error(e)
1135
+
1136
+ # Manage a buffer of audio to use for transcription
1137
+ audio = frame.audio
1138
+ if self._user_is_speaking:
1139
+ self._user_audio_buffer.extend(audio)
1140
+ else:
1141
+ # Keep 1/2 second of audio in the buffer even when not speaking.
1142
+ self._user_audio_buffer.extend(audio)
1143
+ length = int((frame.sample_rate * frame.num_channels * 2) * 0.5)
1144
+ self._user_audio_buffer = self._user_audio_buffer[-length:]
1145
+
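The retained buffer length above works out to sample_rate × num_channels × 2 bytes (16-bit samples) × 0.5 s; for example, with 16 kHz mono input:

    sample_rate, num_channels = 16000, 1
    length = int((sample_rate * num_channels * 2) * 0.5)  # 16000 bytes, i.e. half a second of audio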
1146
+ async def _send_user_text(self, text: str):
1147
+ """Send user text via Gemini Live API's realtime input stream.
1148
+
1149
+ This method sends text through the realtimeInput stream (via TextInputMessage)
1150
+ rather than the clientContent stream. This ensures text input is synchronized
1151
+ with audio and video inputs, preventing temporal misalignment that can occur
1152
+ when different modalities are processed through separate API pathways.
1153
+
1154
+ For realtimeInput, turn completion is automatically inferred by the API based
1155
+ on user activity, so no explicit turnComplete signal is needed.
1156
+
1157
+ Args:
1158
+ text: The text to send as user input.
1159
+ """
1160
+ if self._disconnecting or not self._session:
1161
+ return
1162
+
1163
+ try:
1164
+ await self._session.send_realtime_input(text=text)
1165
+ except Exception as e:
1166
+ await self._handle_send_error(e)
1167
+
1168
+ async def _send_user_video(self, frame):
1169
+ """Send user video frame to Gemini Live API."""
1170
+ if self._video_input_paused or self._disconnecting or not self._session:
1171
+ return
1172
+
1173
+ now = time.time()
1174
+ if now - self._last_sent_time < 1:
1175
+ return # Ignore if less than 1 second has passed
1176
+
1177
+ self._last_sent_time = now # Update last sent time
1178
+ logger.debug(f"Sending video frame to Gemini: {frame}")
1179
+
1180
+ buffer = io.BytesIO()
1181
+ Image.frombytes(frame.format, frame.size, frame.image).save(buffer, format="JPEG")
1182
+ data = base64.b64encode(buffer.getvalue()).decode("utf-8")
1183
+
1184
+ try:
1185
+ await self._session.send_realtime_input(video=Blob(data=data, mime_type="image/jpeg"))
1186
+ except Exception as e:
1187
+ await self._handle_send_error(e)
1188
+
1189
+ async def _create_initial_response(self):
1190
+ """Create initial response based on context history."""
1191
+ if self._disconnecting:
1192
+ return
1193
+
1194
+ if not self._session:
1195
+ self._run_llm_when_session_ready = True
1196
+ return
1197
+
1198
+ messages = self._context.get_messages_for_initializing_history()
1199
+ if not messages:
1200
+ return
1201
+
1202
+ logger.debug(f"Creating initial response: {messages}")
1203
+
1204
+ await self.start_ttfb_metrics()
1205
+
1206
+ try:
1207
+ await self._session.send_client_content(
1208
+ turns=messages, turn_complete=self._inference_on_context_initialization
1209
+ )
1210
+ except Exception as e:
1211
+ await self._handle_send_error(e)
1212
+
1213
+ # If we're not generating a response right away upon initializing
+ # conversation history, set a flag saying that we need a turn complete
+ # message when the user stops speaking.
1216
+ if not self._inference_on_context_initialization:
1217
+ self._needs_turn_complete_message = True
1218
+
1219
+ async def _create_single_response(self, messages_list):
1220
+ """Create a single response from a list of messages."""
1221
+ if self._disconnecting or not self._session:
1222
+ return
1223
+
1224
+ # Create a throwaway context just for the purpose of getting messages
1225
+ # in the right format
1226
+ context = GeminiLiveContext.upgrade(OpenAILLMContext(messages=messages_list))
1227
+ messages = context.get_messages_for_initializing_history()
1228
+
1229
+ if not messages:
1230
+ return
1231
+
1232
+ logger.debug(f"Creating response: {messages}")
1233
+
1234
+ await self.start_ttfb_metrics()
1235
+
1236
+ try:
1237
+ await self._session.send_client_content(turns=messages, turn_complete=True)
1238
+ except Exception as e:
1239
+ await self._handle_send_error(e)
1240
+
1241
+ @traced_gemini_live(operation="llm_tool_result")
1242
+ async def _tool_result(self, tool_result_message):
1243
+ """Send tool result back to the API."""
1244
+ if self._disconnecting or not self._session:
1245
+ return
1246
+
1247
+ # For now we're shoving the name into the tool_call_id field, so this
1248
+ # will work until we revisit that.
1249
+ id = tool_result_message.get("tool_call_id")
1250
+ name = tool_result_message.get("tool_call_name")
1251
+ result = json.loads(tool_result_message.get("content") or "")
1252
+ response = FunctionResponse(name=name, id=id, response=result)
1253
+
1254
+ try:
1255
+ await self._session.send_tool_response(function_responses=response)
1256
+ except Exception as e:
1257
+ await self._handle_send_error(e)
1258
+
1259
+ @traced_gemini_live(operation="llm_setup")
1260
+ async def _handle_session_ready(self, session: AsyncSession):
1261
+ """Handle the session being ready."""
1262
+ self._session = session
1263
+ # If we were just waiting for the session to be ready to run the LLM,
1264
+ # do that now.
1265
+ if self._run_llm_when_session_ready:
1266
+ self._run_llm_when_session_ready = False
1267
+ await self._create_initial_response()
1268
+
1269
+ async def _handle_msg_model_turn(self, msg: LiveServerMessage):
1270
+ """Handle the model turn message."""
1271
+ part = msg.server_content.model_turn.parts[0]
1272
+ if not part:
1273
+ return
1274
+
1275
+ await self.stop_ttfb_metrics()
1276
+
1277
+ # part.text is added when `modalities` is set to TEXT; otherwise, it's None
1278
+ text = part.text
1279
+ if text:
1280
+ if not self._bot_text_buffer:
1281
+ await self.push_frame(LLMFullResponseStartFrame())
1282
+
1283
+ self._bot_text_buffer += text
1284
+ self._search_result_buffer += text # Also accumulate for grounding
1285
+ await self.push_frame(LLMTextFrame(text=text))
1286
+
1287
+ # Check for grounding metadata in server content
1288
+ if msg.server_content and msg.server_content.grounding_metadata:
1289
+ self._accumulated_grounding_metadata = msg.server_content.grounding_metadata
1290
+
1291
+ inline_data = part.inline_data
1292
+ if not inline_data:
1293
+ return
1294
+
1295
+ # Check if mime type matches expected format
1296
+ expected_mime_type = f"audio/pcm;rate={self._sample_rate}"
1297
+ if inline_data.mime_type == expected_mime_type:
1298
+ # Perfect match, continue processing
1299
+ pass
1300
+ elif inline_data.mime_type == "audio/pcm":
1301
+ # Sample rate not provided in mime type, assume default
1302
+ if not hasattr(self, "_sample_rate_warning_logged"):
1303
+ logger.warning(
1304
+ f"Sample rate not provided in mime type '{inline_data.mime_type}', assuming rate of {self._sample_rate}"
1305
+ )
1306
+ self._sample_rate_warning_logged = True
1307
+ else:
1308
+ # Unrecognized format
1309
+ logger.warning(f"Unrecognized server_content format {inline_data.mime_type}")
1310
+ return
1311
+
1312
+ audio = inline_data.data
1313
+ if not audio:
1314
+ return
1315
+
1316
+ if not self._bot_is_speaking:
1317
+ await self._set_bot_is_speaking(True)
1318
+ await self.push_frame(TTSStartedFrame())
1319
+ await self.push_frame(LLMFullResponseStartFrame())
1320
+
1321
+ self._bot_audio_buffer.extend(audio)
1322
+ frame = TTSAudioRawFrame(
1323
+ audio=audio,
1324
+ sample_rate=self._sample_rate,
1325
+ num_channels=1,
1326
+ )
1327
+ await self.push_frame(frame)
1328
+
1329
+ @traced_gemini_live(operation="llm_tool_call")
1330
+ async def _handle_msg_tool_call(self, message: LiveServerMessage):
1331
+ """Handle tool call messages."""
1332
+ function_calls = message.tool_call.function_calls
1333
+ if not function_calls:
1334
+ return
1335
+ if not self._context:
1336
+ logger.error("Function calls are not supported without a context object.")
1337
+
1338
+ function_calls_llm = [
1339
+ FunctionCallFromLLM(
1340
+ context=self._context,
1341
+ tool_call_id=(
1342
+ # NOTE: when using Vertex AI we don't get server-provided
1343
+ # tool call IDs here
1344
+ f.id or str(uuid.uuid4())
1345
+ ),
1346
+ function_name=f.name,
1347
+ arguments=f.args,
1348
+ )
1349
+ for f in function_calls
1350
+ ]
1351
+
1352
+ await self.run_function_calls(function_calls_llm)
1353
+
1354
+ @traced_gemini_live(operation="llm_response")
1355
+ async def _handle_msg_turn_complete(self, message: LiveServerMessage):
1356
+ """Handle the turn complete message."""
1357
+ await self._set_bot_is_speaking(False)
1358
+ text = self._bot_text_buffer
1359
+
1360
+ # Trace the complete LLM response (this will be handled by the decorator)
1361
+ # The decorator will extract the output text and usage metadata from the message
1362
+
1363
+ self._bot_text_buffer = ""
1364
+ self._llm_output_buffer = ""
1365
+
1366
+ # Process grounding metadata if we have accumulated any
1367
+ if self._accumulated_grounding_metadata:
1368
+ await self._process_grounding_metadata(
1369
+ self._accumulated_grounding_metadata, self._search_result_buffer
1370
+ )
1371
+
1372
+ # Reset grounding tracking for next response
1373
+ self._search_result_buffer = ""
1374
+ self._accumulated_grounding_metadata = None
1375
+
1376
+ # Only push the TTSStoppedFrame if the bot is outputting audio;
1377
+ # when text is found, modalities is set to TEXT and no audio
1378
+ # is produced.
1379
+ if not text:
1380
+ await self.push_frame(TTSStoppedFrame())
1381
+
1382
+ await self.push_frame(LLMFullResponseEndFrame())
1383
+
1384
+ @traced_stt
1385
+ async def _handle_user_transcription(
1386
+ self, transcript: str, is_final: bool, language: Optional[Language] = None
1387
+ ):
1388
+ """Handle a transcription result with tracing."""
1389
+ pass
1390
+
1391
+ async def _handle_msg_input_transcription(self, message: LiveServerMessage):
1392
+ """Handle the input transcription message.
1393
+
1394
+ Gemini Live sends user transcriptions in either single words or multi-word
1395
+ phrases. As a result, we have to aggregate the input transcription. This handler
1396
+ aggregates it into sentences, splitting on end-of-sentence markers.
1397
+ """
1398
+ if not message.server_content.input_transcription:
1399
+ return
1400
+
1401
+ text = message.server_content.input_transcription.text
1402
+
1403
+ if not text:
1404
+ return
1405
+
1406
+ # Strip leading space from sentence starts if buffer is empty
1407
+ if text.startswith(" ") and not self._user_transcription_buffer:
1408
+ text = text.lstrip()
1409
+
1410
+ # Accumulate text in the buffer
1411
+ self._user_transcription_buffer += text
1412
+
1413
+ # Check for complete sentences
1414
+ while True:
1415
+ eos_end_marker = match_endofsentence(self._user_transcription_buffer)
1416
+ if not eos_end_marker:
1417
+ break
1418
+
1419
+ # Extract the complete sentence
1420
+ complete_sentence = self._user_transcription_buffer[:eos_end_marker]
1421
+ # Keep the remainder for the next chunk
1422
+ self._user_transcription_buffer = self._user_transcription_buffer[eos_end_marker:]
1423
+
1424
+ # Send a TranscriptionFrame with the complete sentence
1425
+ logger.debug(f"[Transcription:user] [{complete_sentence}]")
1426
+ await self._handle_user_transcription(
1427
+ complete_sentence, True, self._settings["language"]
1428
+ )
1429
+ await self.push_frame(
1430
+ TranscriptionFrame(
1431
+ text=complete_sentence,
1432
+ user_id="",
1433
+ timestamp=time_now_iso8601(),
1434
+ result=message,
1435
+ ),
1436
+ FrameDirection.UPSTREAM,
1437
+ )
1438
+
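The aggregation the docstring describes can be sketched outside the service like this; the regex end-of-sentence check is a stand-in for pipecat's match_endofsentence, and the sample chunks are made up.

```python
# Toy version of the buffering logic: accumulate chunks, emit full sentences.
import re

_buffer = ""


def feed(chunk: str) -> list:
    global _buffer
    _buffer += chunk
    sentences = []
    while True:
        match = re.search(r"[.!?](\s|$)", _buffer)
        if not match:
            break
        sentences.append(_buffer[: match.end()].strip())
        _buffer = _buffer[match.end():]
    return sentences


print(feed("Hello"))        # []
print(feed(" there. How"))  # ['Hello there.']
```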
1439
+ async def _handle_msg_output_transcription(self, message: LiveServerMessage):
1440
+ """Handle the output transcription message."""
1441
+ if not message.server_content.output_transcription:
1442
+ return
1443
+
1444
+ # This is the output transcription text when modalities is set to AUDIO.
1445
+ # In this case, we push LLMTextFrame and TTSTextFrame to be handled by the
1446
+ # downstream assistant context aggregator.
1447
+ text = message.server_content.output_transcription.text
1448
+
1449
+ if not text:
1450
+ return
1451
+
1452
+ # Accumulate text for grounding as well
1453
+ self._search_result_buffer += text
1454
+
1455
+ # Check for grounding metadata in server content
1456
+ if message.server_content and message.server_content.grounding_metadata:
1457
+ self._accumulated_grounding_metadata = message.server_content.grounding_metadata
1458
+ # Collect text for tracing
1459
+ self._llm_output_buffer += text
1460
+
1461
+ await self.push_frame(LLMTextFrame(text=text))
1462
+ await self.push_frame(TTSTextFrame(text=text))
1463
+
1464
+ async def _handle_msg_grounding_metadata(self, message: LiveServerMessage):
1465
+ """Handle dedicated grounding metadata messages."""
1466
+ if message.server_content and message.server_content.grounding_metadata:
1467
+ grounding_metadata = message.server_content.grounding_metadata
1468
+ # Process the grounding metadata immediately
1469
+ await self._process_grounding_metadata(grounding_metadata, self._search_result_buffer)
1470
+
1471
+ async def _process_grounding_metadata(
1472
+ self, grounding_metadata: GroundingMetadata, search_result: str = ""
1473
+ ):
1474
+ """Process grounding metadata and emit LLMSearchResponseFrame."""
1475
+ if not grounding_metadata:
1476
+ return
1477
+
1478
+ # Extract rendered content for search suggestions
1479
+ rendered_content = None
1480
+ if (
1481
+ grounding_metadata.search_entry_point
1482
+ and grounding_metadata.search_entry_point.rendered_content
1483
+ ):
1484
+ rendered_content = grounding_metadata.search_entry_point.rendered_content
1485
+
1486
+ # Convert grounding chunks and supports to LLMSearchOrigin format
1487
+ origins = []
1488
+
1489
+ if grounding_metadata.grounding_chunks and grounding_metadata.grounding_supports:
1490
+ # Create a mapping of chunk indices to origins
1491
+ chunk_to_origin: Dict[int, LLMSearchOrigin] = {}
1492
+
1493
+ for index, chunk in enumerate(grounding_metadata.grounding_chunks):
1494
+ if chunk.web:
1495
+ origin = LLMSearchOrigin(
1496
+ site_uri=chunk.web.uri, site_title=chunk.web.title, results=[]
1497
+ )
1498
+ chunk_to_origin[index] = origin
1499
+ origins.append(origin)
1500
+
1501
+ # Add grounding support results to the appropriate origins
1502
+ for support in grounding_metadata.grounding_supports:
1503
+ if support.segment and support.grounding_chunk_indices:
1504
+ text = support.segment.text or ""
1505
+ confidence_scores = support.confidence_scores or []
1506
+
1507
+ # Add this result to all origins referenced by this support
1508
+ for chunk_index in support.grounding_chunk_indices:
1509
+ if chunk_index in chunk_to_origin:
1510
+ result = LLMSearchResult(text=text, confidence=confidence_scores)
1511
+ chunk_to_origin[chunk_index].results.append(result)
1512
+
1513
+ # Create and push the search response frame
1514
+ search_frame = LLMSearchResponseFrame(
1515
+ search_result=search_result, origins=origins, rendered_content=rendered_content
1516
+ )
1517
+
1518
+ await self.push_frame(search_frame)
1519
+
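The chunk-and-support bookkeeping above can be pictured with plain dictionaries; the shapes below are assumptions standing in for the google-genai grounding types and the pipecat frame dataclasses.

```python
# Toy mapping from grounding chunks/supports to per-site origins with results.
chunks = [{"web": {"uri": "https://example.com", "title": "Example"}}]
supports = [
    {"segment": {"text": "Cited sentence."}, "chunk_indices": [0], "confidence": [0.92]}
]

origins = [
    {"site_uri": c["web"]["uri"], "site_title": c["web"]["title"], "results": []}
    for c in chunks
    if c.get("web")
]
for support in supports:
    for index in support["chunk_indices"]:
        origins[index]["results"].append(
            {"text": support["segment"]["text"], "confidence": support["confidence"]}
        )

print(origins)  # one origin for example.com carrying the cited sentence
```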
1520
+ async def _handle_msg_usage_metadata(self, message: LiveServerMessage):
1521
+ """Handle the usage metadata message."""
1522
+ if not message.usage_metadata:
1523
+ return
1524
+
1525
+ usage = message.usage_metadata
1526
+
1527
+ # Ensure we have valid integers for all token counts
1528
+ prompt_tokens = usage.prompt_token_count or 0
1529
+ completion_tokens = usage.response_token_count or 0
1530
+ total_tokens = usage.total_token_count or (prompt_tokens + completion_tokens)
1531
+
1532
+ tokens = LLMTokenUsage(
1533
+ prompt_tokens=prompt_tokens,
1534
+ completion_tokens=completion_tokens,
1535
+ total_tokens=total_tokens,
1536
+ )
1537
+
1538
+ await self.start_llm_usage_metrics(tokens)
1539
+
1540
+ def _handle_msg_resumption_update(self, message: LiveServerMessage):
1541
+ update = message.session_resumption_update
1542
+ if update.resumable and update.new_handle:
1543
+ self._session_resumption_handle = update.new_handle
1544
+
1545
+ async def _handle_send_error(self, error: Exception):
1546
+ # In server-to-server contexts, a WebSocket error should be quite rare.
1547
+ # Given how hard it is to recover from a send-side error with proper
1548
+ # state management, and that exponential backoff for retries can have
1549
+ # cost/stability implications for a service cluster, let's just treat a
1550
+ # send-side error as fatal.
1551
+ if not self._disconnecting:
1552
+ await self.push_error(ErrorFrame(error=f"{self} Send error: {error}", fatal=True))
1553
+
1554
+ def create_context_aggregator(
1555
+ self,
1556
+ context: OpenAILLMContext,
1557
+ *,
1558
+ user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
1559
+ assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
1560
+ ) -> GeminiLiveContextAggregatorPair:
1561
+ """Create an instance of GeminiLiveContextAggregatorPair from an OpenAILLMContext.
1562
+
1563
+ Constructor keyword arguments for both the user and assistant aggregators can be provided.
1564
+
1565
+ Args:
1566
+ context: The LLM context to use.
1567
+ user_params: User aggregator parameters. Defaults to LLMUserAggregatorParams().
1568
+ assistant_params: Assistant aggregator parameters. Defaults to LLMAssistantAggregatorParams().
1569
+
1570
+ Returns:
1571
+ GeminiLiveContextAggregatorPair: A pair of context
1572
+ aggregators, one for the user and one for the assistant,
1573
+ encapsulated in a GeminiLiveContextAggregatorPair.
1574
+ """
1575
+ context.set_llm_adapter(self.get_llm_adapter())
1576
+
1577
+ GeminiLiveContext.upgrade(context)
1578
+ user = GeminiLiveUserContextAggregator(context, params=user_params)
1579
+
1580
+ assistant_params.expect_stripped_words = False
1581
+ assistant = GeminiLiveAssistantContextAggregator(context, params=assistant_params)
1582
+ return GeminiLiveContextAggregatorPair(_user=user, _assistant=assistant)