dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/gemini_multimodal_live/gemini.py
@@ -4,1416 +4,54 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""Google Gemini Multimodal Live API service implementation.
+"""Google Gemini Live API service implementation.
 
 This module provides real-time conversational AI capabilities using Google's
-Gemini Multimodal Live API, supporting both text and audio modalities with
+Gemini Live API, supporting both text and audio modalities with
 voice transcription, streaming responses, and tool usage.
-"""
 
-import base64
-import json
-import time
-from dataclasses import dataclass
-from enum import Enum
-from typing import Any, Dict, List, Optional, Union
+.. deprecated:: 0.0.90
+    This module is deprecated. Please use the equivalent types from
+    pipecat.services.google.gemini_live.llm instead. Note that the new type names
+    do not include 'Multimodal'.
+"""
 
-from loguru import logger
-from pydantic import BaseModel, Field
+import warnings
 
-from pipecat.adapters.schemas.tools_schema import ToolsSchema
-from pipecat.adapters.services.gemini_adapter import GeminiLLMAdapter
-from pipecat.frames.frames import (
-    BotStartedSpeakingFrame,
-    BotStoppedSpeakingFrame,
-    CancelFrame,
-    EndFrame,
-    ErrorFrame,
-    Frame,
-    InputAudioRawFrame,
-    InputImageRawFrame,
-    InputTextRawFrame,
-    LLMContextFrame,
-    LLMFullResponseEndFrame,
-    LLMFullResponseStartFrame,
-    LLMMessagesAppendFrame,
-    LLMSetToolsFrame,
-    LLMTextFrame,
-    LLMUpdateSettingsFrame,
-    StartFrame,
-    StartInterruptionFrame,
-    TranscriptionFrame,
-    TTSAudioRawFrame,
-    TTSStartedFrame,
-    TTSStoppedFrame,
-    TTSTextFrame,
-    UserImageRawFrame,
-    UserStartedSpeakingFrame,
-    UserStoppedSpeakingFrame,
-)
-from pipecat.metrics.metrics import LLMTokenUsage
-from pipecat.processors.aggregators.llm_response import (
-    LLMAssistantAggregatorParams,
-    LLMUserAggregatorParams,
-)
-from pipecat.processors.aggregators.openai_llm_context import (
-    OpenAILLMContext,
-    OpenAILLMContextFrame,
+from pipecat.services.google.gemini_live.llm import (
+    ContextWindowCompressionParams as _ContextWindowCompressionParams,
 )
-from pipecat.processors.frame_processor import FrameDirection
-from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame, LLMSearchResult
-from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
-from pipecat.services.openai.llm import (
-    OpenAIAssistantContextAggregator,
-    OpenAIUserContextAggregator,
+from pipecat.services.google.gemini_live.llm import (
+    GeminiLiveAssistantContextAggregator,
+    GeminiLiveContext,
+    GeminiLiveContextAggregatorPair,
+    GeminiLiveLLMService,
+    GeminiLiveUserContextAggregator,
+    GeminiModalities,
 )
-from pipecat.transcriptions.language import Language
-from pipecat.utils.string import match_endofsentence
-from pipecat.utils.time import time_now_iso8601
-from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt
-
-from . import events
-from .file_api import GeminiFileAPI
-
78
- try:
79
- from websockets.asyncio.client import connect as websocket_connect
80
- except ModuleNotFoundError as e:
81
- logger.error(f"Exception: {e}")
82
- logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
83
- raise Exception(f"Missing module: {e}")
84
-
85
-
86
- def language_to_gemini_language(language: Language) -> Optional[str]:
87
- """Maps a Language enum value to a Gemini Live supported language code.
88
-
89
- Source:
90
- https://ai.google.dev/api/generate-content#MediaResolution
91
-
92
- Args:
93
- language: The language enum value to convert.
94
-
95
- Returns:
96
- The Gemini language code string, or None if the language is not supported.
97
- """
98
- language_map = {
99
- # Arabic
100
- Language.AR: "ar-XA",
101
- # Bengali
102
- Language.BN_IN: "bn-IN",
103
- # Chinese (Mandarin)
104
- Language.CMN: "cmn-CN",
105
- Language.CMN_CN: "cmn-CN",
106
- Language.ZH: "cmn-CN", # Map general Chinese to Mandarin for Gemini
107
- Language.ZH_CN: "cmn-CN", # Map Simplified Chinese to Mandarin for Gemini
108
- # German
109
- Language.DE: "de-DE",
110
- Language.DE_DE: "de-DE",
111
- # English
112
- Language.EN: "en-US", # Default to US English (though not explicitly listed in supported codes)
113
- Language.EN_US: "en-US",
114
- Language.EN_AU: "en-AU",
115
- Language.EN_GB: "en-GB",
116
- Language.EN_IN: "en-IN",
117
- # Spanish
118
- Language.ES: "es-ES", # Default to Spain Spanish
119
- Language.ES_ES: "es-ES",
120
- Language.ES_US: "es-US",
121
- # French
122
- Language.FR: "fr-FR", # Default to France French
123
- Language.FR_FR: "fr-FR",
124
- Language.FR_CA: "fr-CA",
125
- # Gujarati
126
- Language.GU: "gu-IN",
127
- Language.GU_IN: "gu-IN",
128
- # Hindi
129
- Language.HI: "hi-IN",
130
- Language.HI_IN: "hi-IN",
131
- # Indonesian
132
- Language.ID: "id-ID",
133
- Language.ID_ID: "id-ID",
134
- # Italian
135
- Language.IT: "it-IT",
136
- Language.IT_IT: "it-IT",
137
- # Japanese
138
- Language.JA: "ja-JP",
139
- Language.JA_JP: "ja-JP",
140
- # Kannada
141
- Language.KN: "kn-IN",
142
- Language.KN_IN: "kn-IN",
143
- # Korean
144
- Language.KO: "ko-KR",
145
- Language.KO_KR: "ko-KR",
146
- # Malayalam
147
- Language.ML: "ml-IN",
148
- Language.ML_IN: "ml-IN",
149
- # Marathi
150
- Language.MR: "mr-IN",
151
- Language.MR_IN: "mr-IN",
152
- # Dutch
153
- Language.NL: "nl-NL",
154
- Language.NL_NL: "nl-NL",
155
- # Polish
156
- Language.PL: "pl-PL",
157
- Language.PL_PL: "pl-PL",
158
- # Portuguese (Brazil)
159
- Language.PT_BR: "pt-BR",
160
- # Russian
161
- Language.RU: "ru-RU",
162
- Language.RU_RU: "ru-RU",
163
- # Tamil
164
- Language.TA: "ta-IN",
165
- Language.TA_IN: "ta-IN",
166
- # Telugu
167
- Language.TE: "te-IN",
168
- Language.TE_IN: "te-IN",
169
- # Thai
170
- Language.TH: "th-TH",
171
- Language.TH_TH: "th-TH",
172
- # Turkish
173
- Language.TR: "tr-TR",
174
- Language.TR_TR: "tr-TR",
175
- # Vietnamese
176
- Language.VI: "vi-VN",
177
- Language.VI_VN: "vi-VN",
178
- }
179
- return language_map.get(language)
180
-
181
-
182
- class GeminiMultimodalLiveContext(OpenAILLMContext):
183
- """Extended OpenAI context for Gemini Multimodal Live API.
184
-
185
- Provides Gemini-specific context management including system instruction
186
- extraction and message format conversion for the Live API.
187
- """
188
-
189
- @staticmethod
190
- def upgrade(obj: OpenAILLMContext) -> "GeminiMultimodalLiveContext":
191
- """Upgrade an OpenAI context to Gemini context.
192
-
193
- Args:
194
- obj: The OpenAI context to upgrade.
195
-
196
- Returns:
197
- The upgraded Gemini context instance.
198
- """
199
- if isinstance(obj, OpenAILLMContext) and not isinstance(obj, GeminiMultimodalLiveContext):
200
- logger.debug(f"Upgrading to Gemini Multimodal Live Context: {obj}")
201
- obj.__class__ = GeminiMultimodalLiveContext
202
- obj._restructure_from_openai_messages()
203
- return obj
204
-
205
- def _restructure_from_openai_messages(self):
206
- pass
207
-
208
- def extract_system_instructions(self):
209
- """Extract system instructions from context messages.
210
-
211
- Returns:
212
- Combined system instruction text from all system messages.
213
- """
214
- system_instruction = ""
215
- for item in self.messages:
216
- if item.get("role") == "system":
217
- content = item.get("content", "")
218
- if content:
219
- if system_instruction and not system_instruction.endswith("\n"):
220
- system_instruction += "\n"
221
- system_instruction += str(content)
222
- return system_instruction
223
-
224
- def add_file_reference(self, file_uri: str, mime_type: str, text: Optional[str] = None):
225
- """Add a file reference to the context.
226
-
227
- This adds a user message with a file reference that will be sent during context initialization.
228
-
229
- Args:
230
- file_uri: URI of the uploaded file
231
- mime_type: MIME type of the file
232
- text: Optional text prompt to accompany the file
233
- """
234
- # Create parts list with file reference
235
- parts = []
236
- if text:
237
- parts.append({"type": "text", "text": text})
238
-
239
- # Add file reference part
240
- parts.append(
241
- {"type": "file_data", "file_data": {"mime_type": mime_type, "file_uri": file_uri}}
242
- )
243
-
244
- # Add to messages
245
- message = {"role": "user", "content": parts}
246
- self.messages.append(message)
247
- logger.info(f"Added file reference to context: {file_uri}")
248
-
249
- def get_messages_for_initializing_history(self):
250
- """Get messages formatted for Gemini history initialization.
251
-
252
- Returns:
253
- List of messages in Gemini format for conversation history.
254
- """
255
- messages = []
256
- for item in self.messages:
257
- role = item.get("role")
258
-
259
- if role == "system":
260
- continue
261
-
262
- elif role == "assistant":
263
- role = "model"
264
-
265
- content = item.get("content")
266
- parts = []
267
- if isinstance(content, str):
268
- parts = [{"text": content}]
269
- elif isinstance(content, list):
270
- for part in content:
271
- if part.get("type") == "text":
272
- parts.append({"text": part.get("text")})
273
- elif part.get("type") == "file_data":
274
- file_data = part.get("file_data", {})
275
-
276
- parts.append(
277
- {
278
- "fileData": {
279
- "mimeType": file_data.get("mime_type"),
280
- "fileUri": file_data.get("file_uri"),
281
- }
282
- }
283
- )
284
- else:
285
- logger.warning(f"Unsupported content type: {str(part)[:80]}")
286
- else:
287
- logger.warning(f"Unsupported content type: {str(content)[:80]}")
288
- messages.append({"role": role, "parts": parts})
289
- return messages
290
-
291
-
292
- class GeminiMultimodalLiveUserContextAggregator(OpenAIUserContextAggregator):
293
- """User context aggregator for Gemini Multimodal Live.
294
-
295
- Extends OpenAI user aggregator to handle Gemini-specific message passing
296
- while maintaining compatibility with the standard aggregation pipeline.
297
- """
298
-
299
- async def process_frame(self, frame, direction):
300
- """Process incoming frames for user context aggregation.
301
-
302
- Args:
303
- frame: The frame to process.
304
- direction: The frame processing direction.
305
- """
306
- await super().process_frame(frame, direction)
307
- # kind of a hack just to pass the LLMMessagesAppendFrame through, but it's fine for now
308
- if isinstance(frame, LLMMessagesAppendFrame):
309
- await self.push_frame(frame, direction)
310
-
311
-
312
- class GeminiMultimodalLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
313
- """Assistant context aggregator for Gemini Multimodal Live.
314
-
315
- Handles assistant response aggregation while filtering out LLMTextFrames
316
- to prevent duplicate context entries, as Gemini Live pushes both
317
- LLMTextFrames and TTSTextFrames.
318
- """
319
-
320
- async def process_frame(self, frame: Frame, direction: FrameDirection):
321
- """Process incoming frames for assistant context aggregation.
322
-
323
- Args:
324
- frame: The frame to process.
325
- direction: The frame processing direction.
326
- """
327
- # The LLMAssistantContextAggregator uses TextFrames to aggregate the LLM output,
328
- # but the GeminiMultimodalLiveAssistantContextAggregator pushes LLMTextFrames and TTSTextFrames. We
329
- # need to override this proces_frame for LLMTextFrame, so that only the TTSTextFrames
330
- # are process. This ensures that the context gets only one set of messages.
331
- if not isinstance(frame, LLMTextFrame):
332
- await super().process_frame(frame, direction)
333
-
334
- async def handle_user_image_frame(self, frame: UserImageRawFrame):
335
- """Handle user image frames.
336
-
337
- Args:
338
- frame: The user image frame to handle.
339
- """
340
- # We don't want to store any images in the context. Revisit this later
341
- # when the API evolves.
342
- pass
343
-
344
-
345
- @dataclass
346
- class GeminiMultimodalLiveContextAggregatorPair:
347
- """Pair of user and assistant context aggregators for Gemini Multimodal Live.
348
-
349
- Parameters:
350
- _user: The user context aggregator instance.
351
- _assistant: The assistant context aggregator instance.
352
- """
353
-
354
- _user: GeminiMultimodalLiveUserContextAggregator
355
- _assistant: GeminiMultimodalLiveAssistantContextAggregator
356
-
357
- def user(self) -> GeminiMultimodalLiveUserContextAggregator:
358
- """Get the user context aggregator.
359
-
360
- Returns:
361
- The user context aggregator instance.
362
- """
363
- return self._user
364
-
365
- def assistant(self) -> GeminiMultimodalLiveAssistantContextAggregator:
366
- """Get the assistant context aggregator.
367
-
368
- Returns:
369
- The assistant context aggregator instance.
370
- """
371
- return self._assistant
372
-
373
-
374
- class GeminiMultimodalModalities(Enum):
375
- """Supported modalities for Gemini Multimodal Live.
376
-
377
- Parameters:
378
- TEXT: Text responses.
379
- AUDIO: Audio responses.
380
- """
381
-
382
- TEXT = "TEXT"
383
- AUDIO = "AUDIO"
384
-
385
-
386
- class GeminiMediaResolution(str, Enum):
387
- """Media resolution options for Gemini Multimodal Live.
388
-
389
- Parameters:
390
- UNSPECIFIED: Use default resolution setting.
391
- LOW: Low resolution with 64 tokens.
392
- MEDIUM: Medium resolution with 256 tokens.
393
- HIGH: High resolution with zoomed reframing and 256 tokens.
394
- """
395
-
396
- UNSPECIFIED = "MEDIA_RESOLUTION_UNSPECIFIED" # Use default
397
- LOW = "MEDIA_RESOLUTION_LOW" # 64 tokens
398
- MEDIUM = "MEDIA_RESOLUTION_MEDIUM" # 256 tokens
399
- HIGH = "MEDIA_RESOLUTION_HIGH" # Zoomed reframing with 256 tokens
400
-
401
-
402
- class GeminiVADParams(BaseModel):
403
- """Voice Activity Detection parameters for Gemini Live.
404
-
405
- Parameters:
406
- disabled: Whether to disable VAD. Defaults to None.
407
- start_sensitivity: Sensitivity for speech start detection. Defaults to None.
408
- end_sensitivity: Sensitivity for speech end detection. Defaults to None.
409
- prefix_padding_ms: Prefix padding in milliseconds. Defaults to None.
410
- silence_duration_ms: Silence duration threshold in milliseconds. Defaults to None.
411
- """
412
-
413
- disabled: Optional[bool] = Field(default=None)
414
- start_sensitivity: Optional[events.StartSensitivity] = Field(default=None)
415
- end_sensitivity: Optional[events.EndSensitivity] = Field(default=None)
416
- prefix_padding_ms: Optional[int] = Field(default=None)
417
- silence_duration_ms: Optional[int] = Field(default=None)
418
-
419
-
420
- class ContextWindowCompressionParams(BaseModel):
421
- """Parameters for context window compression in Gemini Live.
422
-
423
- Parameters:
424
- enabled: Whether compression is enabled. Defaults to False.
425
- trigger_tokens: Token count to trigger compression. None uses 80% of context window.
426
- """
427
-
428
- enabled: bool = Field(default=False)
429
- trigger_tokens: Optional[int] = Field(
430
- default=None
431
- ) # None = use default (80% of context window)
432
-
433
-
434
- class InputParams(BaseModel):
435
- """Input parameters for Gemini Multimodal Live generation.
436
-
437
- Parameters:
438
- frequency_penalty: Frequency penalty for generation (0.0-2.0). Defaults to None.
439
- max_tokens: Maximum tokens to generate. Must be >= 1. Defaults to 4096.
440
- presence_penalty: Presence penalty for generation (0.0-2.0). Defaults to None.
441
- temperature: Sampling temperature (0.0-2.0). Defaults to None.
442
- top_k: Top-k sampling parameter. Must be >= 0. Defaults to None.
443
- top_p: Top-p sampling parameter (0.0-1.0). Defaults to None.
444
- modalities: Response modalities. Defaults to AUDIO.
445
- language: Language for generation. Defaults to EN_US.
446
- media_resolution: Media resolution setting. Defaults to UNSPECIFIED.
447
- vad: Voice activity detection parameters. Defaults to None.
448
- context_window_compression: Context compression settings. Defaults to None.
449
- extra: Additional parameters. Defaults to empty dict.
450
- """
451
-
452
- frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
453
- max_tokens: Optional[int] = Field(default=4096, ge=1)
454
- presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
455
- temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
456
- top_k: Optional[int] = Field(default=None, ge=0)
457
- top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
458
- modalities: Optional[GeminiMultimodalModalities] = Field(
459
- default=GeminiMultimodalModalities.AUDIO
+from pipecat.services.google.gemini_live.llm import GeminiMediaResolution as _GeminiMediaResolution
+from pipecat.services.google.gemini_live.llm import GeminiVADParams as _GeminiVADParams
+from pipecat.services.google.gemini_live.llm import InputParams as _InputParams
+
+with warnings.catch_warnings():
+    warnings.simplefilter("always")
+    warnings.warn(
+        "Types in pipecat.services.gemini_multimodal_live.gemini are deprecated. "
+        "Please use the equivalent types from "
+        "pipecat.services.google.gemini_live.llm instead. Note that the new type "
+        "names do not include 'Multimodal' "
+        "(e.g. `GeminiMultimodalLiveLLMService` is now `GeminiLiveLLMService`).",
+        DeprecationWarning,
+        stacklevel=2,
     )
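
The added lines above reduce this module to a compatibility shim: it imports the renamed types from pipecat.services.google.gemini_live.llm and emits a DeprecationWarning on import. A minimal migration sketch follows (hypothetical usage, not taken from this diff: it assumes GeminiLiveLLMService keeps the constructor arguments of the removed GeminiMultimodalLiveLLMService and that GeminiModalities still exposes TEXT/AUDIO members).

    # Deprecated path (still importable through this shim, but warns):
    # from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService

    # New path; type names drop "Multimodal":
    from pipecat.services.google.gemini_live.llm import (
        GeminiLiveLLMService,
        GeminiModalities,
        InputParams,
    )

    # Assumed to mirror the removed constructor shown further below;
    # "GOOGLE_API_KEY" is a placeholder value.
    llm = GeminiLiveLLMService(
        api_key="GOOGLE_API_KEY",
        voice_id="Charon",
        params=InputParams(modalities=GeminiModalities.AUDIO),
    )
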
461
- language: Optional[Language] = Field(default=Language.EN_US)
462
- media_resolution: Optional[GeminiMediaResolution] = Field(
463
- default=GeminiMediaResolution.UNSPECIFIED
464
- )
465
- vad: Optional[GeminiVADParams] = Field(default=None)
466
- context_window_compression: Optional[ContextWindowCompressionParams] = Field(default=None)
467
- extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
468
-
469
-
470
- class GeminiMultimodalLiveLLMService(LLMService):
471
- """Provides access to Google's Gemini Multimodal Live API.
472
-
473
- This service enables real-time conversations with Gemini, supporting both
474
- text and audio modalities. It handles voice transcription, streaming audio
475
- responses, and tool usage.
476
- """
477
-
478
- # Overriding the default adapter to use the Gemini one.
479
- adapter_class = GeminiLLMAdapter
480
-
481
- def __init__(
482
- self,
483
- *,
484
- api_key: str,
485
- base_url: str = "generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent",
486
- model="models/gemini-2.0-flash-live-001",
487
- voice_id: str = "Charon",
488
- start_audio_paused: bool = False,
489
- start_video_paused: bool = False,
490
- system_instruction: Optional[str] = None,
491
- tools: Optional[Union[List[dict], ToolsSchema]] = None,
492
- params: Optional[InputParams] = None,
493
- inference_on_context_initialization: bool = True,
494
- file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
495
- **kwargs,
496
- ):
497
- """Initialize the Gemini Multimodal Live LLM service.
498
-
499
- Args:
500
- api_key: Google AI API key for authentication.
501
- base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
502
- model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
503
- voice_id: TTS voice identifier. Defaults to "Charon".
504
- start_audio_paused: Whether to start with audio input paused. Defaults to False.
505
- start_video_paused: Whether to start with video input paused. Defaults to False.
506
- system_instruction: System prompt for the model. Defaults to None.
507
- tools: Tools/functions available to the model. Defaults to None.
508
- params: Configuration parameters for the model. Defaults to InputParams().
509
- inference_on_context_initialization: Whether to generate a response when context
510
- is first set. Defaults to True.
511
- file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
512
- **kwargs: Additional arguments passed to parent LLMService.
513
- """
514
- super().__init__(base_url=base_url, **kwargs)
515
-
516
- params = params or InputParams()
517
-
518
- self._last_sent_time = 0
519
- self._api_key = api_key
520
- self._base_url = base_url
521
- self.set_model_name(model)
522
- self._voice_id = voice_id
523
- self._language_code = params.language
524
-
525
- self._system_instruction = system_instruction
526
- self._tools = tools
527
- self._inference_on_context_initialization = inference_on_context_initialization
528
- self._needs_turn_complete_message = False
529
-
530
- self._audio_input_paused = start_audio_paused
531
- self._video_input_paused = start_video_paused
532
- self._context = None
533
- self._websocket = None
534
- self._receive_task = None
535
-
536
- self._disconnecting = False
537
- self._api_session_ready = False
538
- self._run_llm_when_api_session_ready = False
539
-
540
- self._user_is_speaking = False
541
- self._bot_is_speaking = False
542
- self._user_audio_buffer = bytearray()
543
- self._user_transcription_buffer = ""
544
- self._last_transcription_sent = ""
545
- self._bot_audio_buffer = bytearray()
546
- self._bot_text_buffer = ""
547
- self._llm_output_buffer = ""
548
-
549
- self._sample_rate = 24000
550
-
551
- self._language = params.language
552
- self._language_code = (
553
- language_to_gemini_language(params.language) if params.language else "en-US"
554
- )
555
- self._vad_params = params.vad
556
-
557
- self._settings = {
558
- "frequency_penalty": params.frequency_penalty,
559
- "max_tokens": params.max_tokens,
560
- "presence_penalty": params.presence_penalty,
561
- "temperature": params.temperature,
562
- "top_k": params.top_k,
563
- "top_p": params.top_p,
564
- "modalities": params.modalities,
565
- "language": self._language_code,
566
- "media_resolution": params.media_resolution,
567
- "vad": params.vad,
568
- "context_window_compression": params.context_window_compression.model_dump()
569
- if params.context_window_compression
570
- else {},
571
- "extra": params.extra if isinstance(params.extra, dict) else {},
572
- }
573
-
574
- # Initialize the File API client
575
- self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
576
-
577
- # Grounding metadata tracking
578
- self._search_result_buffer = ""
579
- self._accumulated_grounding_metadata = None
580
-
581
- def can_generate_metrics(self) -> bool:
582
- """Check if the service can generate usage metrics.
583
-
584
- Returns:
585
- True as Gemini Live supports token usage metrics.
586
- """
587
- return True
588
-
589
- def needs_mcp_alternate_schema(self) -> bool:
590
- """Check if this LLM service requires alternate MCP schema.
591
-
592
- Google/Gemini has stricter JSON schema validation and requires
593
- certain properties to be removed or modified for compatibility.
594
-
595
- Returns:
596
- True for Google/Gemini services.
597
- """
598
- return True
599
-
600
- def set_audio_input_paused(self, paused: bool):
601
- """Set the audio input pause state.
602
-
603
- Args:
604
- paused: Whether to pause audio input.
605
- """
606
- self._audio_input_paused = paused
607
-
608
- def set_video_input_paused(self, paused: bool):
609
- """Set the video input pause state.
610
-
611
- Args:
612
- paused: Whether to pause video input.
613
- """
614
- self._video_input_paused = paused
615
-
616
- def set_model_modalities(self, modalities: GeminiMultimodalModalities):
617
- """Set the model response modalities.
618
-
619
- Args:
620
- modalities: The modalities to use for responses.
621
- """
622
- self._settings["modalities"] = modalities
623
-
624
- def set_language(self, language: Language):
625
- """Set the language for generation.
626
-
627
- Args:
628
- language: The language to use for generation.
629
- """
630
- self._language = language
631
- self._language_code = language_to_gemini_language(language) or "en-US"
632
- self._settings["language"] = self._language_code
633
- logger.info(f"Set Gemini language to: {self._language_code}")
634
-
635
- async def set_context(self, context: OpenAILLMContext):
636
- """Set the context explicitly from outside the pipeline.
637
-
638
- This is useful when initializing a conversation because in server-side VAD mode we might not have a
639
- way to trigger the pipeline. This sends the history to the server. The `inference_on_context_initialization`
640
- flag controls whether to set the turnComplete flag when we do this. Without that flag, the model will
641
- not respond. This is often what we want when setting the context at the beginning of a conversation.
642
-
643
- Args:
644
- context: The OpenAI LLM context to set.
645
- """
646
- if self._context:
647
- logger.error(
648
- "Context already set. Can only set up Gemini Multimodal Live context once."
649
- )
650
- return
651
- self._context = GeminiMultimodalLiveContext.upgrade(context)
652
- await self._create_initial_response()
653
-
654
- #
655
- # standard AIService frame handling
656
- #
657
-
658
- async def start(self, frame: StartFrame):
659
- """Start the service and establish websocket connection.
660
-
661
- Args:
662
- frame: The start frame.
663
- """
664
- await super().start(frame)
665
- await self._connect()
666
-
667
- async def stop(self, frame: EndFrame):
668
- """Stop the service and close connections.
669
-
670
- Args:
671
- frame: The end frame.
672
- """
673
- await super().stop(frame)
674
- await self._disconnect()
675
-
676
- async def cancel(self, frame: CancelFrame):
677
- """Cancel the service and close connections.
678
-
679
- Args:
680
- frame: The cancel frame.
681
- """
682
- await super().cancel(frame)
683
- await self._disconnect()
684
-
685
- #
686
- # speech and interruption handling
687
- #
688
-
689
- async def _handle_interruption(self):
690
- self._bot_is_speaking = False
691
- await self.push_frame(TTSStoppedFrame())
692
- await self.push_frame(LLMFullResponseEndFrame())
693
-
694
- async def _handle_user_started_speaking(self, frame):
695
- self._user_is_speaking = True
696
- pass
697
-
698
- async def _handle_user_stopped_speaking(self, frame):
699
- self._user_is_speaking = False
700
- self._user_audio_buffer = bytearray()
701
- await self.start_ttfb_metrics()
702
- if self._needs_turn_complete_message:
703
- self._needs_turn_complete_message = False
704
- evt = events.ClientContentMessage.model_validate(
705
- {"clientContent": {"turnComplete": True}}
706
- )
707
- await self.send_client_event(evt)
708
-
709
- #
710
- # frame processing
711
- #
712
- # StartFrame, StopFrame, CancelFrame implemented in base class
713
- #
714
-
715
- async def process_frame(self, frame: Frame, direction: FrameDirection):
716
- """Process incoming frames for the Gemini Live service.
717
-
718
- Args:
719
- frame: The frame to process.
720
- direction: The frame processing direction.
721
- """
722
- await super().process_frame(frame, direction)
723
-
724
- if isinstance(frame, TranscriptionFrame):
725
- await self.push_frame(frame, direction)
726
- elif isinstance(frame, OpenAILLMContextFrame):
727
- context: GeminiMultimodalLiveContext = GeminiMultimodalLiveContext.upgrade(
728
- frame.context
729
- )
730
- # For now, we'll only trigger inference here when either:
731
- # 1. We have not seen a context frame before
732
- # 2. The last message is a tool call result
733
- if not self._context:
734
- self._context = context
735
- if frame.context.tools:
736
- self._tools = frame.context.tools
737
- await self._create_initial_response()
738
- elif context.messages and context.messages[-1].get("role") == "tool":
739
- # Support just one tool call per context frame for now
740
- tool_result_message = context.messages[-1]
741
- await self._tool_result(tool_result_message)
742
- elif isinstance(frame, LLMContextFrame):
743
- raise NotImplementedError(
744
- "Universal LLMContext is not yet supported for Gemini Multimodal Live."
745
- )
746
- elif isinstance(frame, InputTextRawFrame):
747
- await self._send_user_text(frame.text)
748
- await self.push_frame(frame, direction)
749
- elif isinstance(frame, InputAudioRawFrame):
750
- await self._send_user_audio(frame)
751
- await self.push_frame(frame, direction)
752
- elif isinstance(frame, InputImageRawFrame):
753
- await self._send_user_video(frame)
754
- await self.push_frame(frame, direction)
755
- elif isinstance(frame, StartInterruptionFrame):
756
- await self._handle_interruption()
757
- await self.push_frame(frame, direction)
758
- elif isinstance(frame, UserStartedSpeakingFrame):
759
- await self._handle_user_started_speaking(frame)
760
- await self.push_frame(frame, direction)
761
- elif isinstance(frame, UserStoppedSpeakingFrame):
762
- await self._handle_user_stopped_speaking(frame)
763
- await self.push_frame(frame, direction)
764
- elif isinstance(frame, BotStartedSpeakingFrame):
765
- # Ignore this frame. Use the serverContent API message instead
766
- await self.push_frame(frame, direction)
767
- elif isinstance(frame, BotStoppedSpeakingFrame):
768
- # ignore this frame. Use the serverContent.turnComplete API message
769
- await self.push_frame(frame, direction)
770
- elif isinstance(frame, LLMMessagesAppendFrame):
771
- await self._create_single_response(frame.messages)
772
- elif isinstance(frame, LLMUpdateSettingsFrame):
773
- await self._update_settings(frame.settings)
774
- elif isinstance(frame, LLMSetToolsFrame):
775
- await self._update_settings()
776
- else:
777
- await self.push_frame(frame, direction)
778
-
779
- #
780
- # websocket communication
781
- #
782
-
783
- async def send_client_event(self, event):
784
- """Send a client event to the Gemini Live API.
785
-
786
- Args:
787
- event: The event to send.
788
- """
789
- await self._ws_send(event.model_dump(exclude_none=True))
790
-
791
- async def _connect(self):
792
- """Establish WebSocket connection to Gemini Live API."""
793
- if self._websocket:
794
- # Here we assume that if we have a websocket, we are connected. We
795
- # handle disconnections in the send/recv code paths.
796
- return
797
-
798
- logger.info("Connecting to Gemini service")
799
- try:
800
- logger.info(f"Connecting to wss://{self._base_url}")
801
- uri = f"wss://{self._base_url}?key={self._api_key}"
802
- self._websocket = await websocket_connect(uri=uri)
803
- self._receive_task = self.create_task(self._receive_task_handler())
804
-
805
- # Create the basic configuration
806
- config_data = {
807
- "setup": {
808
- "model": self._model_name,
809
- "generation_config": {
810
- "frequency_penalty": self._settings["frequency_penalty"],
811
- "max_output_tokens": self._settings["max_tokens"],
812
- "presence_penalty": self._settings["presence_penalty"],
813
- "temperature": self._settings["temperature"],
814
- "top_k": self._settings["top_k"],
815
- "top_p": self._settings["top_p"],
816
- "response_modalities": self._settings["modalities"].value,
817
- "speech_config": {
818
- "voice_config": {
819
- "prebuilt_voice_config": {"voice_name": self._voice_id}
820
- },
821
- "language_code": self._settings["language"],
822
- },
823
- "media_resolution": self._settings["media_resolution"].value,
824
- },
825
- "input_audio_transcription": {},
826
- "output_audio_transcription": {},
827
- }
828
- }
829
-
830
- # Add context window compression if enabled
831
- if self._settings.get("context_window_compression", {}).get("enabled", False):
832
- compression_config = {}
833
- # Add sliding window (always true if compression is enabled)
834
- compression_config["sliding_window"] = {}
835
-
836
- # Add trigger_tokens if specified
837
- trigger_tokens = self._settings.get("context_window_compression", {}).get(
838
- "trigger_tokens"
839
- )
840
- if trigger_tokens is not None:
841
- compression_config["trigger_tokens"] = trigger_tokens
842
-
843
- config_data["setup"]["context_window_compression"] = compression_config
844
-
845
- # Add VAD configuration if provided
846
- if self._settings.get("vad"):
847
- vad_config = {}
848
- vad_params = self._settings["vad"]
849
-
850
- # Only add parameters that are explicitly set
851
- if vad_params.disabled is not None:
852
- vad_config["disabled"] = vad_params.disabled
853
-
854
- if vad_params.start_sensitivity:
855
- vad_config["start_of_speech_sensitivity"] = vad_params.start_sensitivity.value
856
-
857
- if vad_params.end_sensitivity:
858
- vad_config["end_of_speech_sensitivity"] = vad_params.end_sensitivity.value
859
-
860
- if vad_params.prefix_padding_ms is not None:
861
- vad_config["prefix_padding_ms"] = vad_params.prefix_padding_ms
862
-
863
- if vad_params.silence_duration_ms is not None:
864
- vad_config["silence_duration_ms"] = vad_params.silence_duration_ms
865
-
866
- # Only add automatic_activity_detection if we have VAD settings
867
- if vad_config:
868
- realtime_config = {"automatic_activity_detection": vad_config}
869
-
870
- config_data["setup"]["realtime_input_config"] = realtime_config
871
-
872
- config = events.Config.model_validate(config_data)
873
-
874
- # Add system instruction if available
875
- system_instruction = self._system_instruction or ""
876
- if self._context and hasattr(self._context, "extract_system_instructions"):
877
- system_instruction += "\n" + self._context.extract_system_instructions()
878
- if system_instruction:
879
- logger.debug(f"Setting system instruction: {system_instruction}")
880
- config.setup.system_instruction = events.SystemInstruction(
881
- parts=[events.ContentPart(text=system_instruction)]
882
- )
883
-
884
- # Add tools if available
885
- if self._tools:
886
- logger.debug(f"Gemini is configuring to use tools{self._tools}")
887
- config.setup.tools = self.get_llm_adapter().from_standard_tools(self._tools)
888
-
889
- # Send the configuration
890
- await self.send_client_event(config)
891
-
892
- except Exception as e:
893
- logger.error(f"{self} initialization error: {e}")
894
- self._websocket = None
895
-
896
- async def _disconnect(self):
897
- """Disconnect from Gemini Live API and clean up resources."""
898
- logger.info("Disconnecting from Gemini service")
899
- try:
900
- self._disconnecting = True
901
- self._api_session_ready = False
902
- await self.stop_all_metrics()
903
- if self._websocket:
904
- await self._websocket.close()
905
- self._websocket = None
906
- if self._receive_task:
907
- await self.cancel_task(self._receive_task, timeout=1.0)
908
- self._receive_task = None
909
- self._disconnecting = False
910
- except Exception as e:
911
- logger.error(f"{self} error disconnecting: {e}")
912
-
913
- async def _ws_send(self, message):
914
- """Send a message to the WebSocket connection."""
915
- # logger.debug(f"Sending message to websocket: {message}")
916
- try:
917
- if self._websocket:
918
- await self._websocket.send(json.dumps(message))
919
- except Exception as e:
920
- if self._disconnecting:
921
- return
922
- logger.error(f"Error sending message to websocket: {e}")
923
- # In server-to-server contexts, a WebSocket error should be quite rare. Given how hard
924
- # it is to recover from a send-side error with proper state management, and that exponential
925
- # backoff for retries can have cost/stability implications for a service cluster, let's just
926
- # treat a send-side error as fatal.
927
- await self.push_error(ErrorFrame(error=f"Error sending client event: {e}", fatal=True))
928
-
929
- #
930
- # inbound server event handling
931
- # todo: docs link here
932
- #
933
-
934
- async def _receive_task_handler(self):
935
- """Handle incoming messages from the WebSocket connection."""
936
- async for message in self._websocket:
937
- evt = events.parse_server_event(message)
938
- # logger.debug(f"Received event: {message[:500]}")
939
- # logger.debug(f"Received event: {evt}")
940
-
941
- if evt.setupComplete:
942
- await self._handle_evt_setup_complete(evt)
943
- elif evt.serverContent and evt.serverContent.modelTurn:
944
- await self._handle_evt_model_turn(evt)
945
- elif evt.serverContent and evt.serverContent.turnComplete and evt.usageMetadata:
946
- await self._handle_evt_turn_complete(evt)
947
- await self._handle_evt_usage_metadata(evt)
948
- elif evt.serverContent and evt.serverContent.inputTranscription:
949
- await self._handle_evt_input_transcription(evt)
950
- elif evt.serverContent and evt.serverContent.outputTranscription:
951
- await self._handle_evt_output_transcription(evt)
952
- elif evt.serverContent and evt.serverContent.groundingMetadata:
953
- await self._handle_evt_grounding_metadata(evt)
954
- elif evt.toolCall:
955
- await self._handle_evt_tool_call(evt)
956
- elif False: # !!! todo: error events?
957
- await self._handle_evt_error(evt)
958
- # errors are fatal, so exit the receive loop
959
- return
960
-
961
- #
962
- #
963
- #
964
-
965
- async def _send_user_audio(self, frame):
966
- """Send user audio frame to Gemini Live API."""
967
- if self._audio_input_paused:
968
- return
969
- # Send all audio to Gemini
970
- evt = events.AudioInputMessage.from_raw_audio(frame.audio, frame.sample_rate)
971
- await self.send_client_event(evt)
972
- # Manage a buffer of audio to use for transcription
973
- audio = frame.audio
974
- if self._user_is_speaking:
975
- self._user_audio_buffer.extend(audio)
976
- else:
977
- # Keep 1/2 second of audio in the buffer even when not speaking.
978
- self._user_audio_buffer.extend(audio)
979
- length = int((frame.sample_rate * frame.num_channels * 2) * 0.5)
980
- self._user_audio_buffer = self._user_audio_buffer[-length:]
981
-
982
- async def _send_user_text(self, text: str):
983
- """Send user text via Gemini Live API's realtime input stream.
984
-
985
- This method sends text through the realtimeInput stream (via TextInputMessage)
986
- rather than the clientContent stream. This ensures text input is synchronized
987
- with audio and video inputs, preventing temporal misalignment that can occur
988
- when different modalities are processed through separate API pathways.
989
-
990
- For realtimeInput, turn completion is automatically inferred by the API based
991
- on user activity, so no explicit turnComplete signal is needed.
992
-
993
- Args:
994
- text: The text to send as user input.
995
- """
996
- evt = events.TextInputMessage.from_text(text)
997
- await self.send_client_event(evt)
998
-
999
- async def _send_user_video(self, frame):
1000
- """Send user video frame to Gemini Live API."""
1001
- if self._video_input_paused:
1002
- return
1003
-
1004
- now = time.time()
1005
- if now - self._last_sent_time < 1:
1006
- return # Ignore if less than 1 second has passed
1007
-
1008
- self._last_sent_time = now # Update last sent time
1009
- logger.debug(f"Sending video frame to Gemini: {frame}")
1010
- evt = events.VideoInputMessage.from_image_frame(frame)
1011
- await self.send_client_event(evt)
1012
-
1013
- async def _create_initial_response(self):
1014
- """Create initial response based on context history."""
1015
- if not self._api_session_ready:
1016
- self._run_llm_when_api_session_ready = True
1017
- return
1018
-
1019
- messages = self._context.get_messages_for_initializing_history()
1020
- if not messages:
1021
- return
1022
-
1023
- logger.debug(f"Creating initial response: {messages}")
1024
-
1025
- await self.start_ttfb_metrics()
1026
-
1027
- evt = events.ClientContentMessage.model_validate(
1028
- {
1029
- "clientContent": {
1030
- "turns": messages,
1031
- "turnComplete": self._inference_on_context_initialization,
1032
- }
1033
- }
1034
- )
1035
- await self.send_client_event(evt)
1036
- if not self._inference_on_context_initialization:
1037
- self._needs_turn_complete_message = True
1038
-
1039
- async def _create_single_response(self, messages_list):
1040
- """Create a single response from a list of messages."""
1041
- # Refactor to combine this logic with same logic in GeminiMultimodalLiveContext
1042
- messages = []
1043
- for item in messages_list:
1044
- role = item.get("role")
1045
-
1046
- if role == "system":
1047
- continue
1048
-
1049
- elif role == "assistant":
1050
- role = "model"
1051
-
1052
- content = item.get("content")
1053
- parts = []
1054
- if isinstance(content, str):
1055
- parts = [{"text": content}]
1056
- elif isinstance(content, list):
1057
- for part in content:
1058
- if part.get("type") == "text":
1059
- parts.append({"text": part.get("text")})
1060
- elif part.get("type") == "file_data":
1061
- file_data = part.get("file_data", {})
1062
-
1063
- parts.append(
1064
- {
1065
- "fileData": {
1066
- "mimeType": file_data.get("mime_type"),
1067
- "fileUri": file_data.get("file_uri"),
1068
- }
1069
- }
1070
- )
1071
- else:
1072
- logger.warning(f"Unsupported content type: {str(part)[:80]}")
1073
- else:
1074
- logger.warning(f"Unsupported content type: {str(content)[:80]}")
1075
- messages.append({"role": role, "parts": parts})
1076
- if not messages:
1077
- return
1078
- logger.debug(f"Creating response: {messages}")
1079
-
1080
- await self.start_ttfb_metrics()
1081
-
1082
- evt = events.ClientContentMessage.model_validate(
1083
- {
1084
- "clientContent": {
1085
- "turns": messages,
1086
- "turnComplete": True,
1087
- }
1088
- }
1089
- )
1090
- await self.send_client_event(evt)
1091
-
1092
- @traced_gemini_live(operation="llm_tool_result")
1093
- async def _tool_result(self, tool_result_message):
1094
- """Send tool result back to the API."""
1095
- # For now we're shoving the name into the tool_call_id field, so this
1096
- # will work until we revisit that.
1097
- id = tool_result_message.get("tool_call_id")
1098
- name = tool_result_message.get("tool_call_name")
1099
- result = json.loads(tool_result_message.get("content") or "")
1100
- response_message = json.dumps(
1101
- {
1102
- "toolResponse": {
1103
- "functionResponses": [
1104
- {
1105
- "id": id,
1106
- "name": name,
1107
- "response": {
1108
- "result": result,
1109
- },
1110
- }
1111
- ],
1112
- }
1113
- }
1114
- )
1115
- await self._websocket.send(response_message)
1116
- # await self._websocket.send(json.dumps({"clientContent": {"turnComplete": True}}))
1117
-
1118
- @traced_gemini_live(operation="llm_setup")
1119
- async def _handle_evt_setup_complete(self, evt):
1120
- """Handle the setup complete event."""
1121
- # If this is our first context frame, run the LLM
1122
- self._api_session_ready = True
1123
- # Now that we've configured the session, we can run the LLM if we need to.
1124
- if self._run_llm_when_api_session_ready:
1125
- self._run_llm_when_api_session_ready = False
1126
- await self._create_initial_response()
1127
-
1128
- async def _handle_evt_model_turn(self, evt):
1129
- """Handle the model turn event."""
1130
- part = evt.serverContent.modelTurn.parts[0]
1131
- if not part:
1132
- return
1133
-
1134
- await self.stop_ttfb_metrics()
1135
-
1136
- # part.text is added when `modalities` is set to TEXT; otherwise, it's None
1137
- text = part.text
1138
- if text:
1139
- if not self._bot_text_buffer:
1140
- await self.push_frame(LLMFullResponseStartFrame())
1141
-
1142
- self._bot_text_buffer += text
1143
- self._search_result_buffer += text # Also accumulate for grounding
1144
- await self.push_frame(LLMTextFrame(text=text))
1145
-
1146
- # Check for grounding metadata in server content
1147
- if evt.serverContent and evt.serverContent.groundingMetadata:
1148
- self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
1149
-
1150
- inline_data = part.inlineData
1151
- if not inline_data:
1152
- return
1153
- if inline_data.mimeType != f"audio/pcm;rate={self._sample_rate}":
1154
- logger.warning(f"Unrecognized server_content format {inline_data.mimeType}")
1155
- return
1156
-
1157
- audio = base64.b64decode(inline_data.data)
1158
- if not audio:
1159
- return
1160
-
1161
- if not self._bot_is_speaking:
1162
- self._bot_is_speaking = True
1163
- await self.push_frame(TTSStartedFrame())
1164
- await self.push_frame(LLMFullResponseStartFrame())
1165
-
1166
- self._bot_audio_buffer.extend(audio)
1167
- frame = TTSAudioRawFrame(
1168
- audio=audio,
1169
- sample_rate=self._sample_rate,
1170
- num_channels=1,
1171
- )
1172
- await self.push_frame(frame)
1173
-
1174
- @traced_gemini_live(operation="llm_tool_call")
1175
- async def _handle_evt_tool_call(self, evt):
1176
- """Handle tool call events."""
1177
- function_calls = evt.toolCall.functionCalls
1178
- if not function_calls:
1179
- return
1180
- if not self._context:
1181
- logger.error("Function calls are not supported without a context object.")
1182
-
1183
- function_calls_llm = [
1184
- FunctionCallFromLLM(
1185
- context=self._context,
1186
- tool_call_id=f.id,
1187
- function_name=f.name,
1188
- arguments=f.args,
1189
- )
1190
- for f in function_calls
1191
- ]
1192
-
1193
- await self.run_function_calls(function_calls_llm)
1194
-
1195
-     @traced_gemini_live(operation="llm_response")
-     async def _handle_evt_turn_complete(self, evt):
-         """Handle the turn complete event."""
-         self._bot_is_speaking = False
-         text = self._bot_text_buffer
-
-         # Determine output and modality for tracing
-         if text:
-             # TEXT modality
-             output_text = text
-             output_modality = "TEXT"
-         else:
-             # AUDIO modality
-             output_text = self._llm_output_buffer
-             output_modality = "AUDIO"
-
-         # Trace the complete LLM response (this will be handled by the decorator)
-         # The decorator will extract the output text and usage metadata from the event
-
-         self._bot_text_buffer = ""
-         self._llm_output_buffer = ""
-
-         # Process grounding metadata if we have accumulated any
-         if self._accumulated_grounding_metadata:
-             await self._process_grounding_metadata(
-                 self._accumulated_grounding_metadata, self._search_result_buffer
-             )
-
-         # Reset grounding tracking for next response
-         self._search_result_buffer = ""
-         self._accumulated_grounding_metadata = None
-
-         # Only push the TTSStoppedFrame if the bot is outputting audio
-         # when text is found, modalities is set to TEXT and no audio
-         # is produced.
-         if not text:
-             await self.push_frame(TTSStoppedFrame())
-
-         await self.push_frame(LLMFullResponseEndFrame())
-
-     @traced_stt
-     async def _handle_user_transcription(
-         self, transcript: str, is_final: bool, language: Optional[Language] = None
-     ):
-         """Handle a transcription result with tracing."""
-         pass
-
-     async def _handle_evt_input_transcription(self, evt):
-         """Handle the input transcription event.
-
-         Gemini Live sends user transcriptions in either single words or multi-word
-         phrases. As a result, we have to aggregate the input transcription. This handler
-         aggregates into sentences, splitting on end-of-sentence markers.
-         """
-         if not evt.serverContent.inputTranscription:
-             return
-
-         text = evt.serverContent.inputTranscription.text
-
-         if not text:
-             return
-
-         # Strip leading space from sentence starts if buffer is empty
-         if text.startswith(" ") and not self._user_transcription_buffer:
-             text = text.lstrip()
-
-         # Accumulate text in the buffer
-         self._user_transcription_buffer += text
-
-         # Check for complete sentences
-         while True:
-             eos_end_marker = match_endofsentence(self._user_transcription_buffer)
-             if not eos_end_marker:
-                 break
-
-             # Extract the complete sentence
-             complete_sentence = self._user_transcription_buffer[:eos_end_marker]
-             # Keep the remainder for the next chunk
-             self._user_transcription_buffer = self._user_transcription_buffer[eos_end_marker:]
-
-             # Send a TranscriptionFrame with the complete sentence
-             logger.debug(f"[Transcription:user] [{complete_sentence}]")
-             await self._handle_user_transcription(
-                 complete_sentence, True, self._settings["language"]
-             )
-             await self.push_frame(
-                 TranscriptionFrame(
-                     text=complete_sentence,
-                     user_id="",
-                     timestamp=time_now_iso8601(),
-                     result=evt,
-                 ),
-                 FrameDirection.UPSTREAM,
-             )
-
-     async def _handle_evt_output_transcription(self, evt):
-         """Handle the output transcription event."""
-         if not evt.serverContent.outputTranscription:
-             return
-
-         # This is the output transcription text when modalities is set to AUDIO.
-         # In this case, we push LLMTextFrame and TTSTextFrame to be handled by the
-         # downstream assistant context aggregator.
-         text = evt.serverContent.outputTranscription.text
-
-         if not text:
-             return
-
-         # Accumulate text for grounding as well
-         self._search_result_buffer += text
-
-         # Check for grounding metadata in server content
-         if evt.serverContent and evt.serverContent.groundingMetadata:
-             self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
-         # Collect text for tracing
-         self._llm_output_buffer += text
-
-         await self.push_frame(LLMTextFrame(text=text))
-         await self.push_frame(TTSTextFrame(text=text))
-
-     async def _handle_evt_grounding_metadata(self, evt):
-         """Handle dedicated grounding metadata events."""
-         if evt.serverContent and evt.serverContent.groundingMetadata:
-             grounding_metadata = evt.serverContent.groundingMetadata
-             # Process the grounding metadata immediately
-             await self._process_grounding_metadata(grounding_metadata, self._search_result_buffer)
-
-     async def _process_grounding_metadata(
-         self, grounding_metadata: events.GroundingMetadata, search_result: str = ""
-     ):
-         """Process grounding metadata and emit LLMSearchResponseFrame."""
-         if not grounding_metadata:
-             return
-
-         # Extract rendered content for search suggestions
-         rendered_content = None
-         if (
-             grounding_metadata.searchEntryPoint
-             and grounding_metadata.searchEntryPoint.renderedContent
-         ):
-             rendered_content = grounding_metadata.searchEntryPoint.renderedContent
-
-         # Convert grounding chunks and supports to LLMSearchOrigin format
-         origins = []
-
-         if grounding_metadata.groundingChunks and grounding_metadata.groundingSupports:
-             # Create a mapping of chunk indices to origins
-             chunk_to_origin = {}
-
-             for index, chunk in enumerate(grounding_metadata.groundingChunks):
-                 if chunk.web:
-                     origin = LLMSearchOrigin(
-                         site_uri=chunk.web.uri, site_title=chunk.web.title, results=[]
-                     )
-                     chunk_to_origin[index] = origin
-                     origins.append(origin)
-
-             # Add grounding support results to the appropriate origins
-             for support in grounding_metadata.groundingSupports:
-                 if support.segment and support.groundingChunkIndices:
-                     text = support.segment.text or ""
-                     confidence_scores = support.confidenceScores or []
-
-                     # Add this result to all origins referenced by this support
-                     for chunk_index in support.groundingChunkIndices:
-                         if chunk_index in chunk_to_origin:
-                             result = LLMSearchResult(text=text, confidence=confidence_scores)
-                             chunk_to_origin[chunk_index].results.append(result)
-
-         # Create and push the search response frame
-         search_frame = LLMSearchResponseFrame(
-             search_result=search_result, origins=origins, rendered_content=rendered_content
-         )
-
-         await self.push_frame(search_frame)
-
-     async def _handle_evt_usage_metadata(self, evt):
-         """Handle the usage metadata event."""
-         if not evt.usageMetadata:
-             return
-
-         usage = evt.usageMetadata
-
-         # Ensure we have valid integers for all token counts
-         prompt_tokens = usage.promptTokenCount or 0
-         completion_tokens = usage.responseTokenCount or 0
-         total_tokens = usage.totalTokenCount or (prompt_tokens + completion_tokens)
-
-         tokens = LLMTokenUsage(
-             prompt_tokens=prompt_tokens,
-             completion_tokens=completion_tokens,
-             total_tokens=total_tokens,
-         )
-
-         await self.start_llm_usage_metrics(tokens)
-
-     def create_context_aggregator(
-         self,
-         context: OpenAILLMContext,
-         *,
-         user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
-         assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
-     ) -> GeminiMultimodalLiveContextAggregatorPair:
-         """Create an instance of GeminiMultimodalLiveContextAggregatorPair from an OpenAILLMContext.
-
-         Constructor keyword arguments for both the user and assistant aggregators can be provided.
-
-         Args:
-             context: The LLM context to use.
-             user_params: User aggregator parameters. Defaults to LLMUserAggregatorParams().
-             assistant_params: Assistant aggregator parameters. Defaults to LLMAssistantAggregatorParams().
-
-         Returns:
-             GeminiMultimodalLiveContextAggregatorPair: A pair of context
-             aggregators, one for the user and one for the assistant,
-             encapsulated in a GeminiMultimodalLiveContextAggregatorPair.
-         """
-         context.set_llm_adapter(self.get_llm_adapter())
-
-         GeminiMultimodalLiveContext.upgrade(context)
-         user = GeminiMultimodalLiveUserContextAggregator(context, params=user_params)

-         assistant_params.expect_stripped_words = False
-         assistant = GeminiMultimodalLiveAssistantContextAggregator(context, params=assistant_params)
-         return GeminiMultimodalLiveContextAggregatorPair(_user=user, _assistant=assistant)
+ GeminiMultimodalLiveContext = GeminiLiveContext
+ GeminiMultimodalLiveUserContextAggregator = GeminiLiveUserContextAggregator
+ GeminiMultimodalLiveAssistantContextAggregator = GeminiLiveAssistantContextAggregator
+ GeminiMultimodalLiveContextAggregatorPair = GeminiLiveContextAggregatorPair
+ GeminiMultimodalModalities = GeminiModalities
+ GeminiMediaResolution = _GeminiMediaResolution
+ GeminiVADParams = _GeminiVADParams
+ ContextWindowCompressionParams = _ContextWindowCompressionParams
+ InputParams = _InputParams
+ GeminiMultimodalLiveLLMService = GeminiLiveLLMService
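
The added lines above rebind the old `GeminiMultimodalLive*` names to the renamed `GeminiLive*` classes, so code written against the previous API keeps importing and working. A minimal sketch of how such module-level aliases behave, using a hypothetical stand-in class (only the alias assignments themselves come from this diff):

```python
# Hypothetical stand-in for the renamed service class; the real class and its
# module path are not shown in this diff.
class GeminiLiveLLMService:
    pass


# Backward-compatibility alias, exactly as the diff adds it: the old name is
# bound to the same class object as the new name.
GeminiMultimodalLiveLLMService = GeminiLiveLLMService

# Existing code that refers to the old name still gets the renamed class.
assert GeminiMultimodalLiveLLMService is GeminiLiveLLMService
print(GeminiMultimodalLiveLLMService.__name__)  # -> GeminiLiveLLMService
```

Because the alias is a plain assignment rather than a subclass, `isinstance` checks and type annotations that reference either name resolve to the same class.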