dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
|
@@ -4,1416 +4,54 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
-
"""Google Gemini
|
|
7
|
+
"""Google Gemini Live API service implementation.
|
|
8
8
|
|
|
9
9
|
This module provides real-time conversational AI capabilities using Google's
|
|
10
|
-
Gemini
|
|
10
|
+
Gemini Live API, supporting both text and audio modalities with
|
|
11
11
|
voice transcription, streaming responses, and tool usage.
|
|
12
|
-
"""
|
|
13
12
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
from typing import Any, Dict, List, Optional, Union
|
|
13
|
+
.. deprecated:: 0.0.90
|
|
14
|
+
This module is deprecated. Please use the equivalent types from
|
|
15
|
+
pipecat.services.google.gemini_live.llm instead. Note that the new type names
|
|
16
|
+
do not include 'Multimodal'.
|
|
17
|
+
"""
|
|
20
18
|
|
|
21
|
-
|
|
22
|
-
from pydantic import BaseModel, Field
|
|
19
|
+
import warnings
|
|
23
20
|
|
|
24
|
-
from pipecat.
|
|
25
|
-
|
|
26
|
-
from pipecat.frames.frames import (
|
|
27
|
-
BotStartedSpeakingFrame,
|
|
28
|
-
BotStoppedSpeakingFrame,
|
|
29
|
-
CancelFrame,
|
|
30
|
-
EndFrame,
|
|
31
|
-
ErrorFrame,
|
|
32
|
-
Frame,
|
|
33
|
-
InputAudioRawFrame,
|
|
34
|
-
InputImageRawFrame,
|
|
35
|
-
InputTextRawFrame,
|
|
36
|
-
LLMContextFrame,
|
|
37
|
-
LLMFullResponseEndFrame,
|
|
38
|
-
LLMFullResponseStartFrame,
|
|
39
|
-
LLMMessagesAppendFrame,
|
|
40
|
-
LLMSetToolsFrame,
|
|
41
|
-
LLMTextFrame,
|
|
42
|
-
LLMUpdateSettingsFrame,
|
|
43
|
-
StartFrame,
|
|
44
|
-
StartInterruptionFrame,
|
|
45
|
-
TranscriptionFrame,
|
|
46
|
-
TTSAudioRawFrame,
|
|
47
|
-
TTSStartedFrame,
|
|
48
|
-
TTSStoppedFrame,
|
|
49
|
-
TTSTextFrame,
|
|
50
|
-
UserImageRawFrame,
|
|
51
|
-
UserStartedSpeakingFrame,
|
|
52
|
-
UserStoppedSpeakingFrame,
|
|
53
|
-
)
|
|
54
|
-
from pipecat.metrics.metrics import LLMTokenUsage
|
|
55
|
-
from pipecat.processors.aggregators.llm_response import (
|
|
56
|
-
LLMAssistantAggregatorParams,
|
|
57
|
-
LLMUserAggregatorParams,
|
|
58
|
-
)
|
|
59
|
-
from pipecat.processors.aggregators.openai_llm_context import (
|
|
60
|
-
OpenAILLMContext,
|
|
61
|
-
OpenAILLMContextFrame,
|
|
21
|
+
from pipecat.services.google.gemini_live.llm import (
|
|
22
|
+
ContextWindowCompressionParams as _ContextWindowCompressionParams,
|
|
62
23
|
)
|
|
63
|
-
from pipecat.
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
24
|
+
from pipecat.services.google.gemini_live.llm import (
|
|
25
|
+
GeminiLiveAssistantContextAggregator,
|
|
26
|
+
GeminiLiveContext,
|
|
27
|
+
GeminiLiveContextAggregatorPair,
|
|
28
|
+
GeminiLiveLLMService,
|
|
29
|
+
GeminiLiveUserContextAggregator,
|
|
30
|
+
GeminiModalities,
|
|
69
31
|
)
|
|
70
|
-
from pipecat.
|
|
71
|
-
from pipecat.
|
|
72
|
-
from pipecat.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
def language_to_gemini_language(language: Language) -> Optional[str]:
|
|
87
|
-
"""Maps a Language enum value to a Gemini Live supported language code.
|
|
88
|
-
|
|
89
|
-
Source:
|
|
90
|
-
https://ai.google.dev/api/generate-content#MediaResolution
|
|
91
|
-
|
|
92
|
-
Args:
|
|
93
|
-
language: The language enum value to convert.
|
|
94
|
-
|
|
95
|
-
Returns:
|
|
96
|
-
The Gemini language code string, or None if the language is not supported.
|
|
97
|
-
"""
|
|
98
|
-
language_map = {
|
|
99
|
-
# Arabic
|
|
100
|
-
Language.AR: "ar-XA",
|
|
101
|
-
# Bengali
|
|
102
|
-
Language.BN_IN: "bn-IN",
|
|
103
|
-
# Chinese (Mandarin)
|
|
104
|
-
Language.CMN: "cmn-CN",
|
|
105
|
-
Language.CMN_CN: "cmn-CN",
|
|
106
|
-
Language.ZH: "cmn-CN", # Map general Chinese to Mandarin for Gemini
|
|
107
|
-
Language.ZH_CN: "cmn-CN", # Map Simplified Chinese to Mandarin for Gemini
|
|
108
|
-
# German
|
|
109
|
-
Language.DE: "de-DE",
|
|
110
|
-
Language.DE_DE: "de-DE",
|
|
111
|
-
# English
|
|
112
|
-
Language.EN: "en-US", # Default to US English (though not explicitly listed in supported codes)
|
|
113
|
-
Language.EN_US: "en-US",
|
|
114
|
-
Language.EN_AU: "en-AU",
|
|
115
|
-
Language.EN_GB: "en-GB",
|
|
116
|
-
Language.EN_IN: "en-IN",
|
|
117
|
-
# Spanish
|
|
118
|
-
Language.ES: "es-ES", # Default to Spain Spanish
|
|
119
|
-
Language.ES_ES: "es-ES",
|
|
120
|
-
Language.ES_US: "es-US",
|
|
121
|
-
# French
|
|
122
|
-
Language.FR: "fr-FR", # Default to France French
|
|
123
|
-
Language.FR_FR: "fr-FR",
|
|
124
|
-
Language.FR_CA: "fr-CA",
|
|
125
|
-
# Gujarati
|
|
126
|
-
Language.GU: "gu-IN",
|
|
127
|
-
Language.GU_IN: "gu-IN",
|
|
128
|
-
# Hindi
|
|
129
|
-
Language.HI: "hi-IN",
|
|
130
|
-
Language.HI_IN: "hi-IN",
|
|
131
|
-
# Indonesian
|
|
132
|
-
Language.ID: "id-ID",
|
|
133
|
-
Language.ID_ID: "id-ID",
|
|
134
|
-
# Italian
|
|
135
|
-
Language.IT: "it-IT",
|
|
136
|
-
Language.IT_IT: "it-IT",
|
|
137
|
-
# Japanese
|
|
138
|
-
Language.JA: "ja-JP",
|
|
139
|
-
Language.JA_JP: "ja-JP",
|
|
140
|
-
# Kannada
|
|
141
|
-
Language.KN: "kn-IN",
|
|
142
|
-
Language.KN_IN: "kn-IN",
|
|
143
|
-
# Korean
|
|
144
|
-
Language.KO: "ko-KR",
|
|
145
|
-
Language.KO_KR: "ko-KR",
|
|
146
|
-
# Malayalam
|
|
147
|
-
Language.ML: "ml-IN",
|
|
148
|
-
Language.ML_IN: "ml-IN",
|
|
149
|
-
# Marathi
|
|
150
|
-
Language.MR: "mr-IN",
|
|
151
|
-
Language.MR_IN: "mr-IN",
|
|
152
|
-
# Dutch
|
|
153
|
-
Language.NL: "nl-NL",
|
|
154
|
-
Language.NL_NL: "nl-NL",
|
|
155
|
-
# Polish
|
|
156
|
-
Language.PL: "pl-PL",
|
|
157
|
-
Language.PL_PL: "pl-PL",
|
|
158
|
-
# Portuguese (Brazil)
|
|
159
|
-
Language.PT_BR: "pt-BR",
|
|
160
|
-
# Russian
|
|
161
|
-
Language.RU: "ru-RU",
|
|
162
|
-
Language.RU_RU: "ru-RU",
|
|
163
|
-
# Tamil
|
|
164
|
-
Language.TA: "ta-IN",
|
|
165
|
-
Language.TA_IN: "ta-IN",
|
|
166
|
-
# Telugu
|
|
167
|
-
Language.TE: "te-IN",
|
|
168
|
-
Language.TE_IN: "te-IN",
|
|
169
|
-
# Thai
|
|
170
|
-
Language.TH: "th-TH",
|
|
171
|
-
Language.TH_TH: "th-TH",
|
|
172
|
-
# Turkish
|
|
173
|
-
Language.TR: "tr-TR",
|
|
174
|
-
Language.TR_TR: "tr-TR",
|
|
175
|
-
# Vietnamese
|
|
176
|
-
Language.VI: "vi-VN",
|
|
177
|
-
Language.VI_VN: "vi-VN",
|
|
178
|
-
}
|
|
179
|
-
return language_map.get(language)
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
class GeminiMultimodalLiveContext(OpenAILLMContext):
|
|
183
|
-
"""Extended OpenAI context for Gemini Multimodal Live API.
|
|
184
|
-
|
|
185
|
-
Provides Gemini-specific context management including system instruction
|
|
186
|
-
extraction and message format conversion for the Live API.
|
|
187
|
-
"""
|
|
188
|
-
|
|
189
|
-
@staticmethod
|
|
190
|
-
def upgrade(obj: OpenAILLMContext) -> "GeminiMultimodalLiveContext":
|
|
191
|
-
"""Upgrade an OpenAI context to Gemini context.
|
|
192
|
-
|
|
193
|
-
Args:
|
|
194
|
-
obj: The OpenAI context to upgrade.
|
|
195
|
-
|
|
196
|
-
Returns:
|
|
197
|
-
The upgraded Gemini context instance.
|
|
198
|
-
"""
|
|
199
|
-
if isinstance(obj, OpenAILLMContext) and not isinstance(obj, GeminiMultimodalLiveContext):
|
|
200
|
-
logger.debug(f"Upgrading to Gemini Multimodal Live Context: {obj}")
|
|
201
|
-
obj.__class__ = GeminiMultimodalLiveContext
|
|
202
|
-
obj._restructure_from_openai_messages()
|
|
203
|
-
return obj
|
|
204
|
-
|
|
205
|
-
def _restructure_from_openai_messages(self):
|
|
206
|
-
pass
|
|
207
|
-
|
|
208
|
-
def extract_system_instructions(self):
|
|
209
|
-
"""Extract system instructions from context messages.
|
|
210
|
-
|
|
211
|
-
Returns:
|
|
212
|
-
Combined system instruction text from all system messages.
|
|
213
|
-
"""
|
|
214
|
-
system_instruction = ""
|
|
215
|
-
for item in self.messages:
|
|
216
|
-
if item.get("role") == "system":
|
|
217
|
-
content = item.get("content", "")
|
|
218
|
-
if content:
|
|
219
|
-
if system_instruction and not system_instruction.endswith("\n"):
|
|
220
|
-
system_instruction += "\n"
|
|
221
|
-
system_instruction += str(content)
|
|
222
|
-
return system_instruction
|
|
223
|
-
|
|
224
|
-
def add_file_reference(self, file_uri: str, mime_type: str, text: Optional[str] = None):
|
|
225
|
-
"""Add a file reference to the context.
|
|
226
|
-
|
|
227
|
-
This adds a user message with a file reference that will be sent during context initialization.
|
|
228
|
-
|
|
229
|
-
Args:
|
|
230
|
-
file_uri: URI of the uploaded file
|
|
231
|
-
mime_type: MIME type of the file
|
|
232
|
-
text: Optional text prompt to accompany the file
|
|
233
|
-
"""
|
|
234
|
-
# Create parts list with file reference
|
|
235
|
-
parts = []
|
|
236
|
-
if text:
|
|
237
|
-
parts.append({"type": "text", "text": text})
|
|
238
|
-
|
|
239
|
-
# Add file reference part
|
|
240
|
-
parts.append(
|
|
241
|
-
{"type": "file_data", "file_data": {"mime_type": mime_type, "file_uri": file_uri}}
|
|
242
|
-
)
|
|
243
|
-
|
|
244
|
-
# Add to messages
|
|
245
|
-
message = {"role": "user", "content": parts}
|
|
246
|
-
self.messages.append(message)
|
|
247
|
-
logger.info(f"Added file reference to context: {file_uri}")
|
|
248
|
-
|
|
249
|
-
def get_messages_for_initializing_history(self):
|
|
250
|
-
"""Get messages formatted for Gemini history initialization.
|
|
251
|
-
|
|
252
|
-
Returns:
|
|
253
|
-
List of messages in Gemini format for conversation history.
|
|
254
|
-
"""
|
|
255
|
-
messages = []
|
|
256
|
-
for item in self.messages:
|
|
257
|
-
role = item.get("role")
|
|
258
|
-
|
|
259
|
-
if role == "system":
|
|
260
|
-
continue
|
|
261
|
-
|
|
262
|
-
elif role == "assistant":
|
|
263
|
-
role = "model"
|
|
264
|
-
|
|
265
|
-
content = item.get("content")
|
|
266
|
-
parts = []
|
|
267
|
-
if isinstance(content, str):
|
|
268
|
-
parts = [{"text": content}]
|
|
269
|
-
elif isinstance(content, list):
|
|
270
|
-
for part in content:
|
|
271
|
-
if part.get("type") == "text":
|
|
272
|
-
parts.append({"text": part.get("text")})
|
|
273
|
-
elif part.get("type") == "file_data":
|
|
274
|
-
file_data = part.get("file_data", {})
|
|
275
|
-
|
|
276
|
-
parts.append(
|
|
277
|
-
{
|
|
278
|
-
"fileData": {
|
|
279
|
-
"mimeType": file_data.get("mime_type"),
|
|
280
|
-
"fileUri": file_data.get("file_uri"),
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
)
|
|
284
|
-
else:
|
|
285
|
-
logger.warning(f"Unsupported content type: {str(part)[:80]}")
|
|
286
|
-
else:
|
|
287
|
-
logger.warning(f"Unsupported content type: {str(content)[:80]}")
|
|
288
|
-
messages.append({"role": role, "parts": parts})
|
|
289
|
-
return messages
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
class GeminiMultimodalLiveUserContextAggregator(OpenAIUserContextAggregator):
|
|
293
|
-
"""User context aggregator for Gemini Multimodal Live.
|
|
294
|
-
|
|
295
|
-
Extends OpenAI user aggregator to handle Gemini-specific message passing
|
|
296
|
-
while maintaining compatibility with the standard aggregation pipeline.
|
|
297
|
-
"""
|
|
298
|
-
|
|
299
|
-
async def process_frame(self, frame, direction):
|
|
300
|
-
"""Process incoming frames for user context aggregation.
|
|
301
|
-
|
|
302
|
-
Args:
|
|
303
|
-
frame: The frame to process.
|
|
304
|
-
direction: The frame processing direction.
|
|
305
|
-
"""
|
|
306
|
-
await super().process_frame(frame, direction)
|
|
307
|
-
# kind of a hack just to pass the LLMMessagesAppendFrame through, but it's fine for now
|
|
308
|
-
if isinstance(frame, LLMMessagesAppendFrame):
|
|
309
|
-
await self.push_frame(frame, direction)
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
class GeminiMultimodalLiveAssistantContextAggregator(OpenAIAssistantContextAggregator):
|
|
313
|
-
"""Assistant context aggregator for Gemini Multimodal Live.
|
|
314
|
-
|
|
315
|
-
Handles assistant response aggregation while filtering out LLMTextFrames
|
|
316
|
-
to prevent duplicate context entries, as Gemini Live pushes both
|
|
317
|
-
LLMTextFrames and TTSTextFrames.
|
|
318
|
-
"""
|
|
319
|
-
|
|
320
|
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
321
|
-
"""Process incoming frames for assistant context aggregation.
|
|
322
|
-
|
|
323
|
-
Args:
|
|
324
|
-
frame: The frame to process.
|
|
325
|
-
direction: The frame processing direction.
|
|
326
|
-
"""
|
|
327
|
-
# The LLMAssistantContextAggregator uses TextFrames to aggregate the LLM output,
|
|
328
|
-
# but the GeminiMultimodalLiveAssistantContextAggregator pushes LLMTextFrames and TTSTextFrames. We
|
|
329
|
-
# need to override this proces_frame for LLMTextFrame, so that only the TTSTextFrames
|
|
330
|
-
# are process. This ensures that the context gets only one set of messages.
|
|
331
|
-
if not isinstance(frame, LLMTextFrame):
|
|
332
|
-
await super().process_frame(frame, direction)
|
|
333
|
-
|
|
334
|
-
async def handle_user_image_frame(self, frame: UserImageRawFrame):
|
|
335
|
-
"""Handle user image frames.
|
|
336
|
-
|
|
337
|
-
Args:
|
|
338
|
-
frame: The user image frame to handle.
|
|
339
|
-
"""
|
|
340
|
-
# We don't want to store any images in the context. Revisit this later
|
|
341
|
-
# when the API evolves.
|
|
342
|
-
pass
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
@dataclass
|
|
346
|
-
class GeminiMultimodalLiveContextAggregatorPair:
|
|
347
|
-
"""Pair of user and assistant context aggregators for Gemini Multimodal Live.
|
|
348
|
-
|
|
349
|
-
Parameters:
|
|
350
|
-
_user: The user context aggregator instance.
|
|
351
|
-
_assistant: The assistant context aggregator instance.
|
|
352
|
-
"""
|
|
353
|
-
|
|
354
|
-
_user: GeminiMultimodalLiveUserContextAggregator
|
|
355
|
-
_assistant: GeminiMultimodalLiveAssistantContextAggregator
|
|
356
|
-
|
|
357
|
-
def user(self) -> GeminiMultimodalLiveUserContextAggregator:
|
|
358
|
-
"""Get the user context aggregator.
|
|
359
|
-
|
|
360
|
-
Returns:
|
|
361
|
-
The user context aggregator instance.
|
|
362
|
-
"""
|
|
363
|
-
return self._user
|
|
364
|
-
|
|
365
|
-
def assistant(self) -> GeminiMultimodalLiveAssistantContextAggregator:
|
|
366
|
-
"""Get the assistant context aggregator.
|
|
367
|
-
|
|
368
|
-
Returns:
|
|
369
|
-
The assistant context aggregator instance.
|
|
370
|
-
"""
|
|
371
|
-
return self._assistant
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
class GeminiMultimodalModalities(Enum):
|
|
375
|
-
"""Supported modalities for Gemini Multimodal Live.
|
|
376
|
-
|
|
377
|
-
Parameters:
|
|
378
|
-
TEXT: Text responses.
|
|
379
|
-
AUDIO: Audio responses.
|
|
380
|
-
"""
|
|
381
|
-
|
|
382
|
-
TEXT = "TEXT"
|
|
383
|
-
AUDIO = "AUDIO"
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
class GeminiMediaResolution(str, Enum):
|
|
387
|
-
"""Media resolution options for Gemini Multimodal Live.
|
|
388
|
-
|
|
389
|
-
Parameters:
|
|
390
|
-
UNSPECIFIED: Use default resolution setting.
|
|
391
|
-
LOW: Low resolution with 64 tokens.
|
|
392
|
-
MEDIUM: Medium resolution with 256 tokens.
|
|
393
|
-
HIGH: High resolution with zoomed reframing and 256 tokens.
|
|
394
|
-
"""
|
|
395
|
-
|
|
396
|
-
UNSPECIFIED = "MEDIA_RESOLUTION_UNSPECIFIED" # Use default
|
|
397
|
-
LOW = "MEDIA_RESOLUTION_LOW" # 64 tokens
|
|
398
|
-
MEDIUM = "MEDIA_RESOLUTION_MEDIUM" # 256 tokens
|
|
399
|
-
HIGH = "MEDIA_RESOLUTION_HIGH" # Zoomed reframing with 256 tokens
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
class GeminiVADParams(BaseModel):
|
|
403
|
-
"""Voice Activity Detection parameters for Gemini Live.
|
|
404
|
-
|
|
405
|
-
Parameters:
|
|
406
|
-
disabled: Whether to disable VAD. Defaults to None.
|
|
407
|
-
start_sensitivity: Sensitivity for speech start detection. Defaults to None.
|
|
408
|
-
end_sensitivity: Sensitivity for speech end detection. Defaults to None.
|
|
409
|
-
prefix_padding_ms: Prefix padding in milliseconds. Defaults to None.
|
|
410
|
-
silence_duration_ms: Silence duration threshold in milliseconds. Defaults to None.
|
|
411
|
-
"""
|
|
412
|
-
|
|
413
|
-
disabled: Optional[bool] = Field(default=None)
|
|
414
|
-
start_sensitivity: Optional[events.StartSensitivity] = Field(default=None)
|
|
415
|
-
end_sensitivity: Optional[events.EndSensitivity] = Field(default=None)
|
|
416
|
-
prefix_padding_ms: Optional[int] = Field(default=None)
|
|
417
|
-
silence_duration_ms: Optional[int] = Field(default=None)
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
class ContextWindowCompressionParams(BaseModel):
|
|
421
|
-
"""Parameters for context window compression in Gemini Live.
|
|
422
|
-
|
|
423
|
-
Parameters:
|
|
424
|
-
enabled: Whether compression is enabled. Defaults to False.
|
|
425
|
-
trigger_tokens: Token count to trigger compression. None uses 80% of context window.
|
|
426
|
-
"""
|
|
427
|
-
|
|
428
|
-
enabled: bool = Field(default=False)
|
|
429
|
-
trigger_tokens: Optional[int] = Field(
|
|
430
|
-
default=None
|
|
431
|
-
) # None = use default (80% of context window)
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
class InputParams(BaseModel):
|
|
435
|
-
"""Input parameters for Gemini Multimodal Live generation.
|
|
436
|
-
|
|
437
|
-
Parameters:
|
|
438
|
-
frequency_penalty: Frequency penalty for generation (0.0-2.0). Defaults to None.
|
|
439
|
-
max_tokens: Maximum tokens to generate. Must be >= 1. Defaults to 4096.
|
|
440
|
-
presence_penalty: Presence penalty for generation (0.0-2.0). Defaults to None.
|
|
441
|
-
temperature: Sampling temperature (0.0-2.0). Defaults to None.
|
|
442
|
-
top_k: Top-k sampling parameter. Must be >= 0. Defaults to None.
|
|
443
|
-
top_p: Top-p sampling parameter (0.0-1.0). Defaults to None.
|
|
444
|
-
modalities: Response modalities. Defaults to AUDIO.
|
|
445
|
-
language: Language for generation. Defaults to EN_US.
|
|
446
|
-
media_resolution: Media resolution setting. Defaults to UNSPECIFIED.
|
|
447
|
-
vad: Voice activity detection parameters. Defaults to None.
|
|
448
|
-
context_window_compression: Context compression settings. Defaults to None.
|
|
449
|
-
extra: Additional parameters. Defaults to empty dict.
|
|
450
|
-
"""
|
|
451
|
-
|
|
452
|
-
frequency_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
|
|
453
|
-
max_tokens: Optional[int] = Field(default=4096, ge=1)
|
|
454
|
-
presence_penalty: Optional[float] = Field(default=None, ge=0.0, le=2.0)
|
|
455
|
-
temperature: Optional[float] = Field(default=None, ge=0.0, le=2.0)
|
|
456
|
-
top_k: Optional[int] = Field(default=None, ge=0)
|
|
457
|
-
top_p: Optional[float] = Field(default=None, ge=0.0, le=1.0)
|
|
458
|
-
modalities: Optional[GeminiMultimodalModalities] = Field(
|
|
459
|
-
default=GeminiMultimodalModalities.AUDIO
|
|
32
|
+
from pipecat.services.google.gemini_live.llm import GeminiMediaResolution as _GeminiMediaResolution
|
|
33
|
+
from pipecat.services.google.gemini_live.llm import GeminiVADParams as _GeminiVADParams
|
|
34
|
+
from pipecat.services.google.gemini_live.llm import InputParams as _InputParams
|
|
35
|
+
|
|
36
|
+
with warnings.catch_warnings():
|
|
37
|
+
warnings.simplefilter("always")
|
|
38
|
+
warnings.warn(
|
|
39
|
+
"Types in pipecat.services.gemini_multimodal_live.gemini are deprecated. "
|
|
40
|
+
"Please use the equivalent types from "
|
|
41
|
+
"pipecat.services.google.gemini_live.llm instead. Note that the new type "
|
|
42
|
+
"names do not include 'Multimodal' "
|
|
43
|
+
"(e.g. `GeminiMultimodalLiveLLMService` is now `GeminiLiveLLMService`).",
|
|
44
|
+
DeprecationWarning,
|
|
45
|
+
stacklevel=2,
|
|
460
46
|
)
|
|
461
|
-
language: Optional[Language] = Field(default=Language.EN_US)
|
|
462
|
-
media_resolution: Optional[GeminiMediaResolution] = Field(
|
|
463
|
-
default=GeminiMediaResolution.UNSPECIFIED
|
|
464
|
-
)
|
|
465
|
-
vad: Optional[GeminiVADParams] = Field(default=None)
|
|
466
|
-
context_window_compression: Optional[ContextWindowCompressionParams] = Field(default=None)
|
|
467
|
-
extra: Optional[Dict[str, Any]] = Field(default_factory=dict)
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
class GeminiMultimodalLiveLLMService(LLMService):
|
|
471
|
-
"""Provides access to Google's Gemini Multimodal Live API.
|
|
472
|
-
|
|
473
|
-
This service enables real-time conversations with Gemini, supporting both
|
|
474
|
-
text and audio modalities. It handles voice transcription, streaming audio
|
|
475
|
-
responses, and tool usage.
|
|
476
|
-
"""
|
|
477
|
-
|
|
478
|
-
# Overriding the default adapter to use the Gemini one.
|
|
479
|
-
adapter_class = GeminiLLMAdapter
|
|
480
|
-
|
|
481
|
-
def __init__(
|
|
482
|
-
self,
|
|
483
|
-
*,
|
|
484
|
-
api_key: str,
|
|
485
|
-
base_url: str = "generativelanguage.googleapis.com/ws/google.ai.generativelanguage.v1beta.GenerativeService.BidiGenerateContent",
|
|
486
|
-
model="models/gemini-2.0-flash-live-001",
|
|
487
|
-
voice_id: str = "Charon",
|
|
488
|
-
start_audio_paused: bool = False,
|
|
489
|
-
start_video_paused: bool = False,
|
|
490
|
-
system_instruction: Optional[str] = None,
|
|
491
|
-
tools: Optional[Union[List[dict], ToolsSchema]] = None,
|
|
492
|
-
params: Optional[InputParams] = None,
|
|
493
|
-
inference_on_context_initialization: bool = True,
|
|
494
|
-
file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
|
|
495
|
-
**kwargs,
|
|
496
|
-
):
|
|
497
|
-
"""Initialize the Gemini Multimodal Live LLM service.
|
|
498
|
-
|
|
499
|
-
Args:
|
|
500
|
-
api_key: Google AI API key for authentication.
|
|
501
|
-
base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
|
|
502
|
-
model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
|
|
503
|
-
voice_id: TTS voice identifier. Defaults to "Charon".
|
|
504
|
-
start_audio_paused: Whether to start with audio input paused. Defaults to False.
|
|
505
|
-
start_video_paused: Whether to start with video input paused. Defaults to False.
|
|
506
|
-
system_instruction: System prompt for the model. Defaults to None.
|
|
507
|
-
tools: Tools/functions available to the model. Defaults to None.
|
|
508
|
-
params: Configuration parameters for the model. Defaults to InputParams().
|
|
509
|
-
inference_on_context_initialization: Whether to generate a response when context
|
|
510
|
-
is first set. Defaults to True.
|
|
511
|
-
file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
|
|
512
|
-
**kwargs: Additional arguments passed to parent LLMService.
|
|
513
|
-
"""
|
|
514
|
-
super().__init__(base_url=base_url, **kwargs)
|
|
515
|
-
|
|
516
|
-
params = params or InputParams()
|
|
517
|
-
|
|
518
|
-
self._last_sent_time = 0
|
|
519
|
-
self._api_key = api_key
|
|
520
|
-
self._base_url = base_url
|
|
521
|
-
self.set_model_name(model)
|
|
522
|
-
self._voice_id = voice_id
|
|
523
|
-
self._language_code = params.language
|
|
524
|
-
|
|
525
|
-
self._system_instruction = system_instruction
|
|
526
|
-
self._tools = tools
|
|
527
|
-
self._inference_on_context_initialization = inference_on_context_initialization
|
|
528
|
-
self._needs_turn_complete_message = False
|
|
529
|
-
|
|
530
|
-
self._audio_input_paused = start_audio_paused
|
|
531
|
-
self._video_input_paused = start_video_paused
|
|
532
|
-
self._context = None
|
|
533
|
-
self._websocket = None
|
|
534
|
-
self._receive_task = None
|
|
535
|
-
|
|
536
|
-
self._disconnecting = False
|
|
537
|
-
self._api_session_ready = False
|
|
538
|
-
self._run_llm_when_api_session_ready = False
|
|
539
|
-
|
|
540
|
-
self._user_is_speaking = False
|
|
541
|
-
self._bot_is_speaking = False
|
|
542
|
-
self._user_audio_buffer = bytearray()
|
|
543
|
-
self._user_transcription_buffer = ""
|
|
544
|
-
self._last_transcription_sent = ""
|
|
545
|
-
self._bot_audio_buffer = bytearray()
|
|
546
|
-
self._bot_text_buffer = ""
|
|
547
|
-
self._llm_output_buffer = ""
|
|
548
|
-
|
|
549
|
-
self._sample_rate = 24000
|
|
550
|
-
|
|
551
|
-
self._language = params.language
|
|
552
|
-
self._language_code = (
|
|
553
|
-
language_to_gemini_language(params.language) if params.language else "en-US"
|
|
554
|
-
)
|
|
555
|
-
self._vad_params = params.vad
|
|
556
|
-
|
|
557
|
-
self._settings = {
|
|
558
|
-
"frequency_penalty": params.frequency_penalty,
|
|
559
|
-
"max_tokens": params.max_tokens,
|
|
560
|
-
"presence_penalty": params.presence_penalty,
|
|
561
|
-
"temperature": params.temperature,
|
|
562
|
-
"top_k": params.top_k,
|
|
563
|
-
"top_p": params.top_p,
|
|
564
|
-
"modalities": params.modalities,
|
|
565
|
-
"language": self._language_code,
|
|
566
|
-
"media_resolution": params.media_resolution,
|
|
567
|
-
"vad": params.vad,
|
|
568
|
-
"context_window_compression": params.context_window_compression.model_dump()
|
|
569
|
-
if params.context_window_compression
|
|
570
|
-
else {},
|
|
571
|
-
"extra": params.extra if isinstance(params.extra, dict) else {},
|
|
572
|
-
}
|
|
573
|
-
|
|
574
|
-
# Initialize the File API client
|
|
575
|
-
self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
|
|
576
|
-
|
|
577
|
-
# Grounding metadata tracking
|
|
578
|
-
self._search_result_buffer = ""
|
|
579
|
-
self._accumulated_grounding_metadata = None
|
|
580
|
-
|
|
581
|
-
def can_generate_metrics(self) -> bool:
|
|
582
|
-
"""Check if the service can generate usage metrics.
|
|
583
|
-
|
|
584
|
-
Returns:
|
|
585
|
-
True as Gemini Live supports token usage metrics.
|
|
586
|
-
"""
|
|
587
|
-
return True
|
|
588
|
-
|
|
589
|
-
def needs_mcp_alternate_schema(self) -> bool:
|
|
590
|
-
"""Check if this LLM service requires alternate MCP schema.
|
|
591
|
-
|
|
592
|
-
Google/Gemini has stricter JSON schema validation and requires
|
|
593
|
-
certain properties to be removed or modified for compatibility.
|
|
594
|
-
|
|
595
|
-
Returns:
|
|
596
|
-
True for Google/Gemini services.
|
|
597
|
-
"""
|
|
598
|
-
return True
|
|
599
|
-
|
|
600
|
-
def set_audio_input_paused(self, paused: bool):
|
|
601
|
-
"""Set the audio input pause state.
|
|
602
|
-
|
|
603
|
-
Args:
|
|
604
|
-
paused: Whether to pause audio input.
|
|
605
|
-
"""
|
|
606
|
-
self._audio_input_paused = paused
|
|
607
|
-
|
|
608
|
-
def set_video_input_paused(self, paused: bool):
|
|
609
|
-
"""Set the video input pause state.
|
|
610
|
-
|
|
611
|
-
Args:
|
|
612
|
-
paused: Whether to pause video input.
|
|
613
|
-
"""
|
|
614
|
-
self._video_input_paused = paused
|
|
615
|
-
|
|
616
|
-
def set_model_modalities(self, modalities: GeminiMultimodalModalities):
|
|
617
|
-
"""Set the model response modalities.
|
|
618
|
-
|
|
619
|
-
Args:
|
|
620
|
-
modalities: The modalities to use for responses.
|
|
621
|
-
"""
|
|
622
|
-
self._settings["modalities"] = modalities
|
|
623
|
-
|
|
624
|
-
def set_language(self, language: Language):
|
|
625
|
-
"""Set the language for generation.
|
|
626
|
-
|
|
627
|
-
Args:
|
|
628
|
-
language: The language to use for generation.
|
|
629
|
-
"""
|
|
630
|
-
self._language = language
|
|
631
|
-
self._language_code = language_to_gemini_language(language) or "en-US"
|
|
632
|
-
self._settings["language"] = self._language_code
|
|
633
|
-
logger.info(f"Set Gemini language to: {self._language_code}")
|
|
634
|
-
|
|
635
|
-
async def set_context(self, context: OpenAILLMContext):
|
|
636
|
-
"""Set the context explicitly from outside the pipeline.
|
|
637
|
-
|
|
638
|
-
This is useful when initializing a conversation because in server-side VAD mode we might not have a
|
|
639
|
-
way to trigger the pipeline. This sends the history to the server. The `inference_on_context_initialization`
|
|
640
|
-
flag controls whether to set the turnComplete flag when we do this. Without that flag, the model will
|
|
641
|
-
not respond. This is often what we want when setting the context at the beginning of a conversation.
|
|
642
|
-
|
|
643
|
-
Args:
|
|
644
|
-
context: The OpenAI LLM context to set.
|
|
645
|
-
"""
|
|
646
|
-
if self._context:
|
|
647
|
-
logger.error(
|
|
648
|
-
"Context already set. Can only set up Gemini Multimodal Live context once."
|
|
649
|
-
)
|
|
650
|
-
return
|
|
651
|
-
self._context = GeminiMultimodalLiveContext.upgrade(context)
|
|
652
|
-
await self._create_initial_response()
|
|
653
|
-
|
|
654
|
-
#
|
|
655
|
-
# standard AIService frame handling
|
|
656
|
-
#
|
|
657
|
-
|
|
658
|
-
async def start(self, frame: StartFrame):
|
|
659
|
-
"""Start the service and establish websocket connection.
|
|
660
|
-
|
|
661
|
-
Args:
|
|
662
|
-
frame: The start frame.
|
|
663
|
-
"""
|
|
664
|
-
await super().start(frame)
|
|
665
|
-
await self._connect()
|
|
666
|
-
|
|
667
|
-
async def stop(self, frame: EndFrame):
|
|
668
|
-
"""Stop the service and close connections.
|
|
669
|
-
|
|
670
|
-
Args:
|
|
671
|
-
frame: The end frame.
|
|
672
|
-
"""
|
|
673
|
-
await super().stop(frame)
|
|
674
|
-
await self._disconnect()
|
|
675
|
-
|
|
676
|
-
async def cancel(self, frame: CancelFrame):
|
|
677
|
-
"""Cancel the service and close connections.
|
|
678
|
-
|
|
679
|
-
Args:
|
|
680
|
-
frame: The cancel frame.
|
|
681
|
-
"""
|
|
682
|
-
await super().cancel(frame)
|
|
683
|
-
await self._disconnect()
|
|
684
|
-
|
|
685
|
-
#
|
|
686
|
-
# speech and interruption handling
|
|
687
|
-
#
|
|
688
|
-
|
|
689
|
-
async def _handle_interruption(self):
|
|
690
|
-
self._bot_is_speaking = False
|
|
691
|
-
await self.push_frame(TTSStoppedFrame())
|
|
692
|
-
await self.push_frame(LLMFullResponseEndFrame())
|
|
693
|
-
|
|
694
|
-
async def _handle_user_started_speaking(self, frame):
|
|
695
|
-
self._user_is_speaking = True
|
|
696
|
-
pass
|
|
697
|
-
|
|
698
|
-
async def _handle_user_stopped_speaking(self, frame):
|
|
699
|
-
self._user_is_speaking = False
|
|
700
|
-
self._user_audio_buffer = bytearray()
|
|
701
|
-
await self.start_ttfb_metrics()
|
|
702
|
-
if self._needs_turn_complete_message:
|
|
703
|
-
self._needs_turn_complete_message = False
|
|
704
|
-
evt = events.ClientContentMessage.model_validate(
|
|
705
|
-
{"clientContent": {"turnComplete": True}}
|
|
706
|
-
)
|
|
707
|
-
await self.send_client_event(evt)
|
|
708
|
-
|
|
709
|
-
#
|
|
710
|
-
# frame processing
|
|
711
|
-
#
|
|
712
|
-
# StartFrame, StopFrame, CancelFrame implemented in base class
|
|
713
|
-
#
|
|
714
|
-
|
|
715
|
-
async def process_frame(self, frame: Frame, direction: FrameDirection):
|
|
716
|
-
"""Process incoming frames for the Gemini Live service.
|
|
717
|
-
|
|
718
|
-
Args:
|
|
719
|
-
frame: The frame to process.
|
|
720
|
-
direction: The frame processing direction.
|
|
721
|
-
"""
|
|
722
|
-
await super().process_frame(frame, direction)
|
|
723
|
-
|
|
724
|
-
if isinstance(frame, TranscriptionFrame):
|
|
725
|
-
await self.push_frame(frame, direction)
|
|
726
|
-
elif isinstance(frame, OpenAILLMContextFrame):
|
|
727
|
-
context: GeminiMultimodalLiveContext = GeminiMultimodalLiveContext.upgrade(
|
|
728
|
-
frame.context
|
|
729
|
-
)
|
|
730
|
-
# For now, we'll only trigger inference here when either:
|
|
731
|
-
# 1. We have not seen a context frame before
|
|
732
|
-
# 2. The last message is a tool call result
|
|
733
|
-
if not self._context:
|
|
734
|
-
self._context = context
|
|
735
|
-
if frame.context.tools:
|
|
736
|
-
self._tools = frame.context.tools
|
|
737
|
-
await self._create_initial_response()
|
|
738
|
-
elif context.messages and context.messages[-1].get("role") == "tool":
|
|
739
|
-
# Support just one tool call per context frame for now
|
|
740
|
-
tool_result_message = context.messages[-1]
|
|
741
|
-
await self._tool_result(tool_result_message)
|
|
742
|
-
elif isinstance(frame, LLMContextFrame):
|
|
743
|
-
raise NotImplementedError(
|
|
744
|
-
"Universal LLMContext is not yet supported for Gemini Multimodal Live."
|
|
745
|
-
)
|
|
746
|
-
elif isinstance(frame, InputTextRawFrame):
|
|
747
|
-
await self._send_user_text(frame.text)
|
|
748
|
-
await self.push_frame(frame, direction)
|
|
749
|
-
elif isinstance(frame, InputAudioRawFrame):
|
|
750
|
-
await self._send_user_audio(frame)
|
|
751
|
-
await self.push_frame(frame, direction)
|
|
752
|
-
elif isinstance(frame, InputImageRawFrame):
|
|
753
|
-
await self._send_user_video(frame)
|
|
754
|
-
await self.push_frame(frame, direction)
|
|
755
|
-
elif isinstance(frame, StartInterruptionFrame):
|
|
756
|
-
await self._handle_interruption()
|
|
757
|
-
await self.push_frame(frame, direction)
|
|
758
|
-
elif isinstance(frame, UserStartedSpeakingFrame):
|
|
759
|
-
await self._handle_user_started_speaking(frame)
|
|
760
|
-
await self.push_frame(frame, direction)
|
|
761
|
-
elif isinstance(frame, UserStoppedSpeakingFrame):
|
|
762
|
-
await self._handle_user_stopped_speaking(frame)
|
|
763
|
-
await self.push_frame(frame, direction)
|
|
764
|
-
elif isinstance(frame, BotStartedSpeakingFrame):
|
|
765
|
-
# Ignore this frame. Use the serverContent API message instead
|
|
766
|
-
await self.push_frame(frame, direction)
|
|
767
|
-
elif isinstance(frame, BotStoppedSpeakingFrame):
|
|
768
|
-
# ignore this frame. Use the serverContent.turnComplete API message
|
|
769
|
-
await self.push_frame(frame, direction)
|
|
770
|
-
elif isinstance(frame, LLMMessagesAppendFrame):
|
|
771
|
-
await self._create_single_response(frame.messages)
|
|
772
|
-
elif isinstance(frame, LLMUpdateSettingsFrame):
|
|
773
|
-
await self._update_settings(frame.settings)
|
|
774
|
-
elif isinstance(frame, LLMSetToolsFrame):
|
|
775
|
-
await self._update_settings()
|
|
776
|
-
else:
|
|
777
|
-
await self.push_frame(frame, direction)
|
|
778
|
-
|
|
779
|
-
#
|
|
780
|
-
# websocket communication
|
|
781
|
-
#
|
|
782
|
-
|
|
783
|
-
async def send_client_event(self, event):
|
|
784
|
-
"""Send a client event to the Gemini Live API.
|
|
785
|
-
|
|
786
|
-
Args:
|
|
787
|
-
event: The event to send.
|
|
788
|
-
"""
|
|
789
|
-
await self._ws_send(event.model_dump(exclude_none=True))
|
|
790
|
-
|
|
791
|
-
async def _connect(self):
|
|
792
|
-
"""Establish WebSocket connection to Gemini Live API."""
|
|
793
|
-
if self._websocket:
|
|
794
|
-
# Here we assume that if we have a websocket, we are connected. We
|
|
795
|
-
# handle disconnections in the send/recv code paths.
|
|
796
|
-
return
|
|
797
|
-
|
|
798
|
-
logger.info("Connecting to Gemini service")
|
|
799
|
-
try:
|
|
800
|
-
logger.info(f"Connecting to wss://{self._base_url}")
|
|
801
|
-
uri = f"wss://{self._base_url}?key={self._api_key}"
|
|
802
|
-
self._websocket = await websocket_connect(uri=uri)
|
|
803
|
-
self._receive_task = self.create_task(self._receive_task_handler())
|
|
804
|
-
|
|
805
|
-
# Create the basic configuration
|
|
806
|
-
config_data = {
|
|
807
|
-
"setup": {
|
|
808
|
-
"model": self._model_name,
|
|
809
|
-
"generation_config": {
|
|
810
|
-
"frequency_penalty": self._settings["frequency_penalty"],
|
|
811
|
-
"max_output_tokens": self._settings["max_tokens"],
|
|
812
|
-
"presence_penalty": self._settings["presence_penalty"],
|
|
813
|
-
"temperature": self._settings["temperature"],
|
|
814
|
-
"top_k": self._settings["top_k"],
|
|
815
|
-
"top_p": self._settings["top_p"],
|
|
816
|
-
"response_modalities": self._settings["modalities"].value,
|
|
817
|
-
"speech_config": {
|
|
818
|
-
"voice_config": {
|
|
819
|
-
"prebuilt_voice_config": {"voice_name": self._voice_id}
|
|
820
|
-
},
|
|
821
|
-
"language_code": self._settings["language"],
|
|
822
|
-
},
|
|
823
|
-
"media_resolution": self._settings["media_resolution"].value,
|
|
824
|
-
},
|
|
825
|
-
"input_audio_transcription": {},
|
|
826
|
-
"output_audio_transcription": {},
|
|
827
|
-
}
|
|
828
|
-
}
|
|
829
|
-
|
|
830
|
-
# Add context window compression if enabled
|
|
831
|
-
if self._settings.get("context_window_compression", {}).get("enabled", False):
|
|
832
|
-
compression_config = {}
|
|
833
|
-
# Add sliding window (always true if compression is enabled)
|
|
834
|
-
compression_config["sliding_window"] = {}
|
|
835
|
-
|
|
836
|
-
# Add trigger_tokens if specified
|
|
837
|
-
trigger_tokens = self._settings.get("context_window_compression", {}).get(
|
|
838
|
-
"trigger_tokens"
|
|
839
|
-
)
|
|
840
|
-
if trigger_tokens is not None:
|
|
841
|
-
compression_config["trigger_tokens"] = trigger_tokens
|
|
842
|
-
|
|
843
|
-
config_data["setup"]["context_window_compression"] = compression_config
|
|
844
|
-
|
|
845
|
-
# Add VAD configuration if provided
|
|
846
|
-
if self._settings.get("vad"):
|
|
847
|
-
vad_config = {}
|
|
848
|
-
vad_params = self._settings["vad"]
|
|
849
|
-
|
|
850
|
-
# Only add parameters that are explicitly set
|
|
851
|
-
if vad_params.disabled is not None:
|
|
852
|
-
vad_config["disabled"] = vad_params.disabled
|
|
853
|
-
|
|
854
|
-
if vad_params.start_sensitivity:
|
|
855
|
-
vad_config["start_of_speech_sensitivity"] = vad_params.start_sensitivity.value
|
|
856
|
-
|
|
857
|
-
if vad_params.end_sensitivity:
|
|
858
|
-
vad_config["end_of_speech_sensitivity"] = vad_params.end_sensitivity.value
|
|
859
|
-
|
|
860
|
-
if vad_params.prefix_padding_ms is not None:
|
|
861
|
-
vad_config["prefix_padding_ms"] = vad_params.prefix_padding_ms
|
|
862
|
-
|
|
863
|
-
if vad_params.silence_duration_ms is not None:
|
|
864
|
-
vad_config["silence_duration_ms"] = vad_params.silence_duration_ms
|
|
865
|
-
|
|
866
|
-
# Only add automatic_activity_detection if we have VAD settings
|
|
867
|
-
if vad_config:
|
|
868
|
-
realtime_config = {"automatic_activity_detection": vad_config}
|
|
869
|
-
|
|
870
|
-
config_data["setup"]["realtime_input_config"] = realtime_config
|
|
871
|
-
|
|
872
|
-
config = events.Config.model_validate(config_data)
|
|
873
|
-
|
|
874
|
-
# Add system instruction if available
|
|
875
|
-
system_instruction = self._system_instruction or ""
|
|
876
|
-
if self._context and hasattr(self._context, "extract_system_instructions"):
|
|
877
|
-
system_instruction += "\n" + self._context.extract_system_instructions()
|
|
878
|
-
if system_instruction:
|
|
879
|
-
logger.debug(f"Setting system instruction: {system_instruction}")
|
|
880
|
-
config.setup.system_instruction = events.SystemInstruction(
|
|
881
|
-
parts=[events.ContentPart(text=system_instruction)]
|
|
882
|
-
)
|
|
883
|
-
|
|
884
|
-
# Add tools if available
|
|
885
|
-
if self._tools:
|
|
886
|
-
logger.debug(f"Gemini is configuring to use tools{self._tools}")
|
|
887
|
-
config.setup.tools = self.get_llm_adapter().from_standard_tools(self._tools)
|
|
888
|
-
|
|
889
|
-
# Send the configuration
|
|
890
|
-
await self.send_client_event(config)
|
|
891
|
-
|
|
892
|
-
except Exception as e:
|
|
893
|
-
logger.error(f"{self} initialization error: {e}")
|
|
894
|
-
self._websocket = None
|
|
895
|
-
|
|
896
|
-
async def _disconnect(self):
|
|
897
|
-
"""Disconnect from Gemini Live API and clean up resources."""
|
|
898
|
-
logger.info("Disconnecting from Gemini service")
|
|
899
|
-
try:
|
|
900
|
-
self._disconnecting = True
|
|
901
|
-
self._api_session_ready = False
|
|
902
|
-
await self.stop_all_metrics()
|
|
903
|
-
if self._websocket:
|
|
904
|
-
await self._websocket.close()
|
|
905
|
-
self._websocket = None
|
|
906
|
-
if self._receive_task:
|
|
907
|
-
await self.cancel_task(self._receive_task, timeout=1.0)
|
|
908
|
-
self._receive_task = None
|
|
909
|
-
self._disconnecting = False
|
|
910
|
-
except Exception as e:
|
|
911
|
-
logger.error(f"{self} error disconnecting: {e}")
|
|
912
|
-
|
|
913
|
-
async def _ws_send(self, message):
|
|
914
|
-
"""Send a message to the WebSocket connection."""
|
|
915
|
-
# logger.debug(f"Sending message to websocket: {message}")
|
|
916
|
-
try:
|
|
917
|
-
if self._websocket:
|
|
918
|
-
await self._websocket.send(json.dumps(message))
|
|
919
|
-
except Exception as e:
|
|
920
|
-
if self._disconnecting:
|
|
921
|
-
return
|
|
922
|
-
logger.error(f"Error sending message to websocket: {e}")
|
|
923
|
-
# In server-to-server contexts, a WebSocket error should be quite rare. Given how hard
|
|
924
|
-
# it is to recover from a send-side error with proper state management, and that exponential
|
|
925
|
-
# backoff for retries can have cost/stability implications for a service cluster, let's just
|
|
926
|
-
# treat a send-side error as fatal.
|
|
927
|
-
await self.push_error(ErrorFrame(error=f"Error sending client event: {e}", fatal=True))
|
|
928
|
-
|
|
929
|
-
#
|
|
930
|
-
# inbound server event handling
|
|
931
|
-
# todo: docs link here
|
|
932
|
-
#
|
|
933
|
-
|
|
934
|
-
async def _receive_task_handler(self):
|
|
935
|
-
"""Handle incoming messages from the WebSocket connection."""
|
|
936
|
-
async for message in self._websocket:
|
|
937
|
-
evt = events.parse_server_event(message)
|
|
938
|
-
# logger.debug(f"Received event: {message[:500]}")
|
|
939
|
-
# logger.debug(f"Received event: {evt}")
|
|
940
|
-
|
|
941
|
-
if evt.setupComplete:
|
|
942
|
-
await self._handle_evt_setup_complete(evt)
|
|
943
|
-
elif evt.serverContent and evt.serverContent.modelTurn:
|
|
944
|
-
await self._handle_evt_model_turn(evt)
|
|
945
|
-
elif evt.serverContent and evt.serverContent.turnComplete and evt.usageMetadata:
|
|
946
|
-
await self._handle_evt_turn_complete(evt)
|
|
947
|
-
await self._handle_evt_usage_metadata(evt)
|
|
948
|
-
elif evt.serverContent and evt.serverContent.inputTranscription:
|
|
949
|
-
await self._handle_evt_input_transcription(evt)
|
|
950
|
-
elif evt.serverContent and evt.serverContent.outputTranscription:
|
|
951
|
-
await self._handle_evt_output_transcription(evt)
|
|
952
|
-
elif evt.serverContent and evt.serverContent.groundingMetadata:
|
|
953
|
-
await self._handle_evt_grounding_metadata(evt)
|
|
954
|
-
elif evt.toolCall:
|
|
955
|
-
await self._handle_evt_tool_call(evt)
|
|
956
|
-
elif False: # !!! todo: error events?
|
|
957
|
-
await self._handle_evt_error(evt)
|
|
958
|
-
# errors are fatal, so exit the receive loop
|
|
959
|
-
return
|
|
960
|
-
|
|
961
|
-
#
|
|
962
|
-
#
|
|
963
|
-
#
|
|
964
|
-
|
|
965
|
-
async def _send_user_audio(self, frame):
|
|
966
|
-
"""Send user audio frame to Gemini Live API."""
|
|
967
|
-
if self._audio_input_paused:
|
|
968
|
-
return
|
|
969
|
-
# Send all audio to Gemini
|
|
970
|
-
evt = events.AudioInputMessage.from_raw_audio(frame.audio, frame.sample_rate)
|
|
971
|
-
await self.send_client_event(evt)
|
|
972
|
-
# Manage a buffer of audio to use for transcription
|
|
973
|
-
audio = frame.audio
|
|
974
|
-
if self._user_is_speaking:
|
|
975
|
-
self._user_audio_buffer.extend(audio)
|
|
976
|
-
else:
|
|
977
|
-
# Keep 1/2 second of audio in the buffer even when not speaking.
|
|
978
|
-
self._user_audio_buffer.extend(audio)
|
|
979
|
-
length = int((frame.sample_rate * frame.num_channels * 2) * 0.5)
|
|
980
|
-
self._user_audio_buffer = self._user_audio_buffer[-length:]
|
|
981
|
-
|
|
982
|
-
async def _send_user_text(self, text: str):
|
|
983
|
-
"""Send user text via Gemini Live API's realtime input stream.
|
|
984
|
-
|
|
985
|
-
This method sends text through the realtimeInput stream (via TextInputMessage)
|
|
986
|
-
rather than the clientContent stream. This ensures text input is synchronized
|
|
987
|
-
with audio and video inputs, preventing temporal misalignment that can occur
|
|
988
|
-
when different modalities are processed through separate API pathways.
|
|
989
|
-
|
|
990
|
-
For realtimeInput, turn completion is automatically inferred by the API based
|
|
991
|
-
on user activity, so no explicit turnComplete signal is needed.
|
|
992
|
-
|
|
993
|
-
Args:
|
|
994
|
-
text: The text to send as user input.
|
|
995
|
-
"""
|
|
996
|
-
evt = events.TextInputMessage.from_text(text)
|
|
997
|
-
await self.send_client_event(evt)
|
|
998
|
-
|
|
999
|
-
async def _send_user_video(self, frame):
|
|
1000
|
-
"""Send user video frame to Gemini Live API."""
|
|
1001
|
-
if self._video_input_paused:
|
|
1002
|
-
return
|
|
1003
|
-
|
|
1004
|
-
now = time.time()
|
|
1005
|
-
if now - self._last_sent_time < 1:
|
|
1006
|
-
return # Ignore if less than 1 second has passed
|
|
1007
|
-
|
|
1008
|
-
self._last_sent_time = now # Update last sent time
|
|
1009
|
-
logger.debug(f"Sending video frame to Gemini: {frame}")
|
|
1010
|
-
evt = events.VideoInputMessage.from_image_frame(frame)
|
|
1011
|
-
await self.send_client_event(evt)
|
|
1012
|
-
|
|
1013
|
-
async def _create_initial_response(self):
|
|
1014
|
-
"""Create initial response based on context history."""
|
|
1015
|
-
if not self._api_session_ready:
|
|
1016
|
-
self._run_llm_when_api_session_ready = True
|
|
1017
|
-
return
|
|
1018
|
-
|
|
1019
|
-
messages = self._context.get_messages_for_initializing_history()
|
|
1020
|
-
if not messages:
|
|
1021
|
-
return
|
|
1022
|
-
|
|
1023
|
-
logger.debug(f"Creating initial response: {messages}")
|
|
1024
|
-
|
|
1025
|
-
await self.start_ttfb_metrics()
|
|
1026
|
-
|
|
1027
|
-
evt = events.ClientContentMessage.model_validate(
|
|
1028
|
-
{
|
|
1029
|
-
"clientContent": {
|
|
1030
|
-
"turns": messages,
|
|
1031
|
-
"turnComplete": self._inference_on_context_initialization,
|
|
1032
|
-
}
|
|
1033
|
-
}
|
|
1034
|
-
)
|
|
1035
|
-
await self.send_client_event(evt)
|
|
1036
|
-
if not self._inference_on_context_initialization:
|
|
1037
|
-
self._needs_turn_complete_message = True
|
|
1038
|
-
|
|
1039
|
-
async def _create_single_response(self, messages_list):
|
|
1040
|
-
"""Create a single response from a list of messages."""
|
|
1041
|
-
# Refactor to combine this logic with same logic in GeminiMultimodalLiveContext
|
|
1042
|
-
messages = []
|
|
1043
|
-
for item in messages_list:
|
|
1044
|
-
role = item.get("role")
|
|
1045
|
-
|
|
1046
|
-
if role == "system":
|
|
1047
|
-
continue
|
|
1048
|
-
|
|
1049
|
-
elif role == "assistant":
|
|
1050
|
-
role = "model"
|
|
1051
|
-
|
|
1052
|
-
content = item.get("content")
|
|
1053
|
-
parts = []
|
|
1054
|
-
if isinstance(content, str):
|
|
1055
|
-
parts = [{"text": content}]
|
|
1056
|
-
elif isinstance(content, list):
|
|
1057
|
-
for part in content:
|
|
1058
|
-
if part.get("type") == "text":
|
|
1059
|
-
parts.append({"text": part.get("text")})
|
|
1060
|
-
elif part.get("type") == "file_data":
|
|
1061
|
-
file_data = part.get("file_data", {})
|
|
1062
|
-
|
|
1063
|
-
parts.append(
|
|
1064
|
-
{
|
|
1065
|
-
"fileData": {
|
|
1066
|
-
"mimeType": file_data.get("mime_type"),
|
|
1067
|
-
"fileUri": file_data.get("file_uri"),
|
|
1068
|
-
}
|
|
1069
|
-
}
|
|
1070
|
-
)
|
|
1071
|
-
else:
|
|
1072
|
-
logger.warning(f"Unsupported content type: {str(part)[:80]}")
|
|
1073
|
-
else:
|
|
1074
|
-
logger.warning(f"Unsupported content type: {str(content)[:80]}")
|
|
1075
|
-
messages.append({"role": role, "parts": parts})
|
|
1076
|
-
if not messages:
|
|
1077
|
-
return
|
|
1078
|
-
logger.debug(f"Creating response: {messages}")
|
|
1079
|
-
|
|
1080
|
-
await self.start_ttfb_metrics()
|
|
1081
|
-
|
|
1082
|
-
evt = events.ClientContentMessage.model_validate(
|
|
1083
|
-
{
|
|
1084
|
-
"clientContent": {
|
|
1085
|
-
"turns": messages,
|
|
1086
|
-
"turnComplete": True,
|
|
1087
|
-
}
|
|
1088
|
-
}
|
|
1089
|
-
)
|
|
1090
|
-
await self.send_client_event(evt)
|
|
1091
|
-
|
-    @traced_gemini_live(operation="llm_tool_result")
-    async def _tool_result(self, tool_result_message):
-        """Send tool result back to the API."""
-        # For now we're shoving the name into the tool_call_id field, so this
-        # will work until we revisit that.
-        id = tool_result_message.get("tool_call_id")
-        name = tool_result_message.get("tool_call_name")
-        result = json.loads(tool_result_message.get("content") or "")
-        response_message = json.dumps(
-            {
-                "toolResponse": {
-                    "functionResponses": [
-                        {
-                            "id": id,
-                            "name": name,
-                            "response": {
-                                "result": result,
-                            },
-                        }
-                    ],
-                }
-            }
-        )
-        await self._websocket.send(response_message)
-        # await self._websocket.send(json.dumps({"clientContent": {"turnComplete": True}}))
-
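The removed `_tool_result` serializes a function result into the Gemini Live `toolResponse` envelope before writing it to the websocket. A sketch of just the payload construction (the helper name is illustrative, and the empty-content fallback is simplified to `null`):

```python
import json


def build_tool_response(tool_call_id, tool_call_name, content_json):
    """Hypothetical helper showing the toolResponse payload shape the removed
    _tool_result method sent over the Gemini Live websocket."""
    result = json.loads(content_json or "null")  # simplified fallback (assumption)
    return json.dumps(
        {
            "toolResponse": {
                "functionResponses": [
                    {"id": tool_call_id, "name": tool_call_name, "response": {"result": result}}
                ]
            }
        }
    )


print(build_tool_response("call-1", "get_weather", '{"temp_c": 21}'))
```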
-    @traced_gemini_live(operation="llm_setup")
-    async def _handle_evt_setup_complete(self, evt):
-        """Handle the setup complete event."""
-        # If this is our first context frame, run the LLM
-        self._api_session_ready = True
-        # Now that we've configured the session, we can run the LLM if we need to.
-        if self._run_llm_when_api_session_ready:
-            self._run_llm_when_api_session_ready = False
-            await self._create_initial_response()
-
-    async def _handle_evt_model_turn(self, evt):
-        """Handle the model turn event."""
-        part = evt.serverContent.modelTurn.parts[0]
-        if not part:
-            return
-
-        await self.stop_ttfb_metrics()
-
-        # part.text is added when `modalities` is set to TEXT; otherwise, it's None
-        text = part.text
-        if text:
-            if not self._bot_text_buffer:
-                await self.push_frame(LLMFullResponseStartFrame())
-
-            self._bot_text_buffer += text
-            self._search_result_buffer += text  # Also accumulate for grounding
-            await self.push_frame(LLMTextFrame(text=text))
-
-        # Check for grounding metadata in server content
-        if evt.serverContent and evt.serverContent.groundingMetadata:
-            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
-
-        inline_data = part.inlineData
-        if not inline_data:
-            return
-        if inline_data.mimeType != f"audio/pcm;rate={self._sample_rate}":
-            logger.warning(f"Unrecognized server_content format {inline_data.mimeType}")
-            return
-
-        audio = base64.b64decode(inline_data.data)
-        if not audio:
-            return
-
-        if not self._bot_is_speaking:
-            self._bot_is_speaking = True
-            await self.push_frame(TTSStartedFrame())
-            await self.push_frame(LLMFullResponseStartFrame())
-
-        self._bot_audio_buffer.extend(audio)
-        frame = TTSAudioRawFrame(
-            audio=audio,
-            sample_rate=self._sample_rate,
-            num_channels=1,
-        )
-        await self.push_frame(frame)
-
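The removed `_handle_evt_model_turn` validates the `inlineData` mime type against the negotiated sample rate and base64-decodes the PCM payload before wrapping it in a `TTSAudioRawFrame`. A stdlib-only sketch of that decode step; the 24000 Hz default is an illustrative assumption, not a value taken from the diff:

```python
import base64


def decode_model_audio(inline_data_b64, mime_type, expected_rate=24000):
    """Hypothetical helper: reject unexpected mime types and return raw PCM bytes,
    as the removed handler did before building an audio frame."""
    if mime_type != f"audio/pcm;rate={expected_rate}":
        raise ValueError(f"Unrecognized server_content format {mime_type}")
    return base64.b64decode(inline_data_b64)


pcm = decode_model_audio(base64.b64encode(b"\x00\x01" * 4).decode(), "audio/pcm;rate=24000")
assert len(pcm) == 8
```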
-    @traced_gemini_live(operation="llm_tool_call")
-    async def _handle_evt_tool_call(self, evt):
-        """Handle tool call events."""
-        function_calls = evt.toolCall.functionCalls
-        if not function_calls:
-            return
-        if not self._context:
-            logger.error("Function calls are not supported without a context object.")
-
-        function_calls_llm = [
-            FunctionCallFromLLM(
-                context=self._context,
-                tool_call_id=f.id,
-                function_name=f.name,
-                arguments=f.args,
-            )
-            for f in function_calls
-        ]
-
-        await self.run_function_calls(function_calls_llm)
-
-    @traced_gemini_live(operation="llm_response")
-    async def _handle_evt_turn_complete(self, evt):
-        """Handle the turn complete event."""
-        self._bot_is_speaking = False
-        text = self._bot_text_buffer
-
-        # Determine output and modality for tracing
-        if text:
-            # TEXT modality
-            output_text = text
-            output_modality = "TEXT"
-        else:
-            # AUDIO modality
-            output_text = self._llm_output_buffer
-            output_modality = "AUDIO"
-
-        # Trace the complete LLM response (this will be handled by the decorator)
-        # The decorator will extract the output text and usage metadata from the event
-
-        self._bot_text_buffer = ""
-        self._llm_output_buffer = ""
-
-        # Process grounding metadata if we have accumulated any
-        if self._accumulated_grounding_metadata:
-            await self._process_grounding_metadata(
-                self._accumulated_grounding_metadata, self._search_result_buffer
-            )
-
-        # Reset grounding tracking for next response
-        self._search_result_buffer = ""
-        self._accumulated_grounding_metadata = None
-
-        # Only push the TTSStoppedFrame if the bot is outputting audio
-        # when text is found, modalities is set to TEXT and no audio
-        # is produced.
-        if not text:
-            await self.push_frame(TTSStoppedFrame())
-
-        await self.push_frame(LLMFullResponseEndFrame())
-
-    @traced_stt
-    async def _handle_user_transcription(
-        self, transcript: str, is_final: bool, language: Optional[Language] = None
-    ):
-        """Handle a transcription result with tracing."""
-        pass
-
-    async def _handle_evt_input_transcription(self, evt):
-        """Handle the input transcription event.
-
-        Gemini Live sends user transcriptions in either single words or multi-word
-        phrases. As a result, we have to aggregate the input transcription. This handler
-        aggregates into sentences, splitting on the end of sentence markers.
-        """
-        if not evt.serverContent.inputTranscription:
-            return
-
-        text = evt.serverContent.inputTranscription.text
-
-        if not text:
-            return
-
-        # Strip leading space from sentence starts if buffer is empty
-        if text.startswith(" ") and not self._user_transcription_buffer:
-            text = text.lstrip()
-
-        # Accumulate text in the buffer
-        self._user_transcription_buffer += text
-
-        # Check for complete sentences
-        while True:
-            eos_end_marker = match_endofsentence(self._user_transcription_buffer)
-            if not eos_end_marker:
-                break
-
-            # Extract the complete sentence
-            complete_sentence = self._user_transcription_buffer[:eos_end_marker]
-            # Keep the remainder for the next chunk
-            self._user_transcription_buffer = self._user_transcription_buffer[eos_end_marker:]
-
-            # Send a TranscriptionFrame with the complete sentence
-            logger.debug(f"[Transcription:user] [{complete_sentence}]")
-            await self._handle_user_transcription(
-                complete_sentence, True, self._settings["language"]
-            )
-            await self.push_frame(
-                TranscriptionFrame(
-                    text=complete_sentence,
-                    user_id="",
-                    timestamp=time_now_iso8601(),
-                    result=evt,
-                ),
-                FrameDirection.UPSTREAM,
-            )
-
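The removed input-transcription handler buffers the word- and phrase-sized fragments Gemini Live sends and only emits complete sentences. A self-contained sketch of that aggregation, using a crude regex stand-in for pipecat's `match_endofsentence` (class and function names here are illustrative):

```python
import re


def match_end_of_sentence(text: str) -> int:
    """Crude stand-in: index just past the first sentence-ending punctuation, or 0."""
    m = re.search(r"[.!?](\s|$)", text)
    return m.end() if m else 0


class TranscriptAggregator:
    """Minimal sketch of the removed aggregation: buffer fragments, emit sentences."""

    def __init__(self):
        self._buffer = ""

    def push(self, fragment: str):
        # Drop the leading space Gemini prepends to a new utterance.
        if fragment.startswith(" ") and not self._buffer:
            fragment = fragment.lstrip()
        self._buffer += fragment
        sentences = []
        while True:
            end = match_end_of_sentence(self._buffer)
            if not end:
                break
            sentences.append(self._buffer[:end].strip())
            self._buffer = self._buffer[end:]
        return sentences


agg = TranscriptAggregator()
assert agg.push("Hello") == []
assert agg.push(" there. How") == ["Hello there."]
```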
-    async def _handle_evt_output_transcription(self, evt):
-        """Handle the output transcription event."""
-        if not evt.serverContent.outputTranscription:
-            return
-
-        # This is the output transcription text when modalities is set to AUDIO.
-        # In this case, we push LLMTextFrame and TTSTextFrame to be handled by the
-        # downstream assistant context aggregator.
-        text = evt.serverContent.outputTranscription.text
-
-        if not text:
-            return
-
-        # Accumulate text for grounding as well
-        self._search_result_buffer += text
-
-        # Check for grounding metadata in server content
-        if evt.serverContent and evt.serverContent.groundingMetadata:
-            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
-        # Collect text for tracing
-        self._llm_output_buffer += text
-
-        await self.push_frame(LLMTextFrame(text=text))
-        await self.push_frame(TTSTextFrame(text=text))
-
-    async def _handle_evt_grounding_metadata(self, evt):
-        """Handle dedicated grounding metadata events."""
-        if evt.serverContent and evt.serverContent.groundingMetadata:
-            grounding_metadata = evt.serverContent.groundingMetadata
-            # Process the grounding metadata immediately
-            await self._process_grounding_metadata(grounding_metadata, self._search_result_buffer)
-
-    async def _process_grounding_metadata(
-        self, grounding_metadata: events.GroundingMetadata, search_result: str = ""
-    ):
-        """Process grounding metadata and emit LLMSearchResponseFrame."""
-        if not grounding_metadata:
-            return
-
-        # Extract rendered content for search suggestions
-        rendered_content = None
-        if (
-            grounding_metadata.searchEntryPoint
-            and grounding_metadata.searchEntryPoint.renderedContent
-        ):
-            rendered_content = grounding_metadata.searchEntryPoint.renderedContent
-
-        # Convert grounding chunks and supports to LLMSearchOrigin format
-        origins = []
-
-        if grounding_metadata.groundingChunks and grounding_metadata.groundingSupports:
-            # Create a mapping of chunk indices to origins
-            chunk_to_origin = {}
-
-            for index, chunk in enumerate(grounding_metadata.groundingChunks):
-                if chunk.web:
-                    origin = LLMSearchOrigin(
-                        site_uri=chunk.web.uri, site_title=chunk.web.title, results=[]
-                    )
-                    chunk_to_origin[index] = origin
-                    origins.append(origin)
-
-            # Add grounding support results to the appropriate origins
-            for support in grounding_metadata.groundingSupports:
-                if support.segment and support.groundingChunkIndices:
-                    text = support.segment.text or ""
-                    confidence_scores = support.confidenceScores or []
-
-                    # Add this result to all origins referenced by this support
-                    for chunk_index in support.groundingChunkIndices:
-                        if chunk_index in chunk_to_origin:
-                            result = LLMSearchResult(text=text, confidence=confidence_scores)
-                            chunk_to_origin[chunk_index].results.append(result)
-
-        # Create and push the search response frame
-        search_frame = LLMSearchResponseFrame(
-            search_result=search_result, origins=origins, rendered_content=rendered_content
-        )
-
-        await self.push_frame(search_frame)
-
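The removed `_process_grounding_metadata` maps Google Search grounding chunks and supports onto per-site origins before emitting an `LLMSearchResponseFrame`. A sketch of that mapping with plain dicts standing in for `LLMSearchOrigin`/`LLMSearchResult`:

```python
def grounding_to_origins(grounding_chunks, grounding_supports):
    """Sketch of the removed conversion: each web chunk becomes an origin, and
    each support segment is attached to every origin it references."""
    origins = []
    chunk_to_origin = {}
    for index, chunk in enumerate(grounding_chunks):
        web = chunk.get("web")
        if web:
            origin = {"site_uri": web["uri"], "site_title": web["title"], "results": []}
            chunk_to_origin[index] = origin
            origins.append(origin)

    for support in grounding_supports:
        segment = support.get("segment") or {}
        for chunk_index in support.get("groundingChunkIndices", []):
            if chunk_index in chunk_to_origin:
                chunk_to_origin[chunk_index]["results"].append(
                    {"text": segment.get("text", ""), "confidence": support.get("confidenceScores", [])}
                )
    return origins


origins = grounding_to_origins(
    [{"web": {"uri": "https://example.com", "title": "Example"}}],
    [{"segment": {"text": "Example snippet"}, "groundingChunkIndices": [0], "confidenceScores": [0.9]}],
)
assert origins[0]["results"][0]["confidence"] == [0.9]
```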
-    async def _handle_evt_usage_metadata(self, evt):
-        """Handle the usage metadata event."""
-        if not evt.usageMetadata:
-            return
-
-        usage = evt.usageMetadata
-
-        # Ensure we have valid integers for all token counts
-        prompt_tokens = usage.promptTokenCount or 0
-        completion_tokens = usage.responseTokenCount or 0
-        total_tokens = usage.totalTokenCount or (prompt_tokens + completion_tokens)
-
-        tokens = LLMTokenUsage(
-            prompt_tokens=prompt_tokens,
-            completion_tokens=completion_tokens,
-            total_tokens=total_tokens,
-        )
-
-        await self.start_llm_usage_metrics(tokens)
-
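The removed usage-metadata handler tolerates missing token counts by falling back to zero and deriving the total when it is absent. A tiny sketch of that normalization (function name is illustrative):

```python
def normalize_token_usage(prompt_count, response_count, total_count):
    """Missing counts fall back to zero; a missing total falls back to the sum."""
    prompt_tokens = prompt_count or 0
    completion_tokens = response_count or 0
    total_tokens = total_count or (prompt_tokens + completion_tokens)
    return prompt_tokens, completion_tokens, total_tokens


assert normalize_token_usage(None, 42, None) == (0, 42, 42)
```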
-    def create_context_aggregator(
-        self,
-        context: OpenAILLMContext,
-        *,
-        user_params: LLMUserAggregatorParams = LLMUserAggregatorParams(),
-        assistant_params: LLMAssistantAggregatorParams = LLMAssistantAggregatorParams(),
-    ) -> GeminiMultimodalLiveContextAggregatorPair:
-        """Create an instance of GeminiMultimodalLiveContextAggregatorPair from an OpenAILLMContext.
-
-        Constructor keyword arguments for both the user and assistant aggregators can be provided.
-
-        Args:
-            context: The LLM context to use.
-            user_params: User aggregator parameters. Defaults to LLMUserAggregatorParams().
-            assistant_params: Assistant aggregator parameters. Defaults to LLMAssistantAggregatorParams().
-
-        Returns:
-            GeminiMultimodalLiveContextAggregatorPair: A pair of context
-            aggregators, one for the user and one for the assistant,
-            encapsulated in an GeminiMultimodalLiveContextAggregatorPair.
-        """
-        context.set_llm_adapter(self.get_llm_adapter())
-
-        GeminiMultimodalLiveContext.upgrade(context)
-        user = GeminiMultimodalLiveUserContextAggregator(context, params=user_params)
 
-
-
-
+GeminiMultimodalLiveContext = GeminiLiveContext
+GeminiMultimodalLiveUserContextAggregator = GeminiLiveUserContextAggregator
+GeminiMultimodalLiveAssistantContextAggregator = GeminiLiveAssistantContextAggregator
+GeminiMultimodalLiveContextAggregatorPair = GeminiLiveContextAggregatorPair
+GeminiMultimodalModalities = GeminiModalities
+GeminiMediaResolution = _GeminiMediaResolution
+GeminiVADParams = _GeminiVADParams
+ContextWindowCompressionParams = _ContextWindowCompressionParams
+InputParams = _InputParams
+GeminiMultimodalLiveLLMService = GeminiLiveLLMService
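The added assignments keep the old `GeminiMultimodalLive*` public names importable as aliases for the renamed `GeminiLive*` classes, so existing pipelines should not need code changes. A sketch of the unchanged caller side; the module path is an assumption based on the package's prior layout and should be verified against the installed wheel:

```python
# Backwards-compatible import through the alias added above; the module path
# pipecat.services.gemini_multimodal_live.gemini is assumed, not confirmed by this diff.
from pipecat.services.gemini_multimodal_live.gemini import (
    GeminiMultimodalLiveLLMService,  # now resolves to GeminiLiveLLMService
)

print(GeminiMultimodalLiveLLMService)  # same class object under its old name
```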