dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of dv-pipecat-ai has been flagged as potentially problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/gemini_multimodal_live/file_api.py (new file, +189 -0)

```diff
@@ -0,0 +1,189 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Gemini File API client for uploading and managing files.
+
+This module provides a client for Google's Gemini File API, enabling file
+uploads, metadata retrieval, listing, and deletion. Files uploaded through
+this API can be referenced in Gemini generative model calls.
+"""
+
+import mimetypes
+from typing import Any, Dict, Optional
+
+import aiohttp
+from loguru import logger
+
+
+class GeminiFileAPI:
+    """Client for the Gemini File API.
+
+    This class provides methods for uploading, fetching, listing, and deleting files
+    through Google's Gemini File API.
+
+    Files uploaded through this API remain available for 48 hours and can be referenced
+    in calls to the Gemini generative models. Maximum file size is 2GB, with total
+    project storage limited to 20GB.
+    """
+
+    def __init__(
+        self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
+    ):
+        """Initialize the Gemini File API client.
+
+        Args:
+            api_key: Google AI API key
+            base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
+        """
+        self._api_key = api_key
+        self._base_url = base_url
+        # Upload URL uses the /upload/ path
+        self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
+
+    async def upload_file(
+        self, file_path: str, display_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Upload a file to the Gemini File API using the correct resumable upload protocol.
+
+        Args:
+            file_path: Path to the file to upload
+            display_name: Optional display name for the file
+
+        Returns:
+            File metadata including uri, name, and display_name
+        """
+        logger.info(f"Uploading file: {file_path}")
+
+        async with aiohttp.ClientSession() as session:
+            # Determine the file's MIME type
+            mime_type, _ = mimetypes.guess_type(file_path)
+            if not mime_type:
+                mime_type = "application/octet-stream"
+
+            # Read the file
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+
+            # Create the metadata payload
+            metadata = {}
+            if display_name:
+                metadata = {"file": {"display_name": display_name}}
+
+            # Step 1: Initial resumable request to get upload URL
+            headers = {
+                "X-Goog-Upload-Protocol": "resumable",
+                "X-Goog-Upload-Command": "start",
+                "X-Goog-Upload-Header-Content-Length": str(len(file_data)),
+                "X-Goog-Upload-Header-Content-Type": mime_type,
+                "Content-Type": "application/json",
+            }
+
+            logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
+            async with session.post(
+                f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error initiating file upload: {error_text}")
+                    raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
+
+                # Get the upload URL from the response header
+                upload_url = response.headers.get("X-Goog-Upload-URL")
+                if not upload_url:
+                    logger.error(f"Response headers: {dict(response.headers)}")
+                    raise Exception("No upload URL in response headers")
+
+                logger.debug(f"Got upload URL: {upload_url}")
+
+            # Step 2: Upload the actual file data
+            upload_headers = {
+                "Content-Length": str(len(file_data)),
+                "X-Goog-Upload-Offset": "0",
+                "X-Goog-Upload-Command": "upload, finalize",
+            }
+
+            logger.debug(f"Step 2: Uploading file data to {upload_url}")
+            async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error uploading file data: {error_text}")
+                    raise Exception(f"Failed to upload file: {response.status} - {error_text}")
+
+                file_info = await response.json()
+                logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
+                return file_info
+
+    async def get_file(self, name: str) -> Dict[str, Any]:
+        """Get metadata for a file.
+
+        Args:
+            name: File name (or full path)
+
+        Returns:
+            File metadata
+        """
+        # Extract just the name part if a full path is provided
+        if "/" in name:
+            name = name.split("/")[-1]
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error getting file metadata: {error_text}")
+                    raise Exception(f"Failed to get file metadata: {response.status}")
+
+                file_info = await response.json()
+                return file_info
+
+    async def list_files(
+        self, page_size: int = 10, page_token: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """List uploaded files.
+
+        Args:
+            page_size: Number of files to return per page
+            page_token: Token for pagination
+
+        Returns:
+            List of files and next page token if available
+        """
+        params = {"key": self._api_key, "pageSize": page_size}
+
+        if page_token:
+            params["pageToken"] = page_token
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(self._base_url, params=params) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error listing files: {error_text}")
+                    raise Exception(f"Failed to list files: {response.status}")
+
+                result = await response.json()
+                return result
+
+    async def delete_file(self, name: str) -> bool:
+        """Delete a file.
+
+        Args:
+            name: File name (or full path)
+
+        Returns:
+            True if deleted successfully
+        """
+        # Extract just the name part if a full path is provided
+        if "/" in name:
+            name = name.split("/")[-1]
+
+        async with aiohttp.ClientSession() as session:
+            async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error deleting file: {error_text}")
+                    raise Exception(f"Failed to delete file: {response.status}")
+
+                return True
```
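For orientation, a minimal usage sketch of the new client. The API key, file name, and display name are placeholders; the `file.name` response field matches the logging inside `upload_file`:

```python
import asyncio

from pipecat.services.gemini_multimodal_live.file_api import GeminiFileAPI


async def main():
    file_api = GeminiFileAPI(api_key="YOUR_GOOGLE_API_KEY")  # placeholder key

    # Two-step resumable upload: a "start" request returns an upload URL,
    # then the bytes are posted with "upload, finalize" in a single shot.
    file_info = await file_api.upload_file("report.pdf", display_name="Quarterly report")
    name = file_info["file"]["name"]

    print(await file_api.get_file(name))  # metadata lookup
    print(await file_api.list_files(page_size=5))  # paginated listing

    # Uploads expire after 48 hours; delete earlier if no longer needed.
    await file_api.delete_file(name)


asyncio.run(main())
```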
pipecat/services/gemini_multimodal_live/gemini.py (+218 -21)

```diff
@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     Frame,
     InputAudioRawFrame,
     InputImageRawFrame,
+    InputTextRawFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesAppendFrame,
@@ -59,21 +60,22 @@ from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContextFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame, LLMSearchResult
 from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
 from pipecat.services.openai.llm import (
     OpenAIAssistantContextAggregator,
     OpenAIUserContextAggregator,
 )
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.string import match_endofsentence
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt
 
 from . import events
+from .file_api import GeminiFileAPI
 
 try:
-    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -218,6 +220,31 @@ class GeminiMultimodalLiveContext(OpenAILLMContext):
             system_instruction += str(content)
         return system_instruction
 
+    def add_file_reference(self, file_uri: str, mime_type: str, text: Optional[str] = None):
+        """Add a file reference to the context.
+
+        This adds a user message with a file reference that will be sent during context initialization.
+
+        Args:
+            file_uri: URI of the uploaded file
+            mime_type: MIME type of the file
+            text: Optional text prompt to accompany the file
+        """
+        # Create parts list with file reference
+        parts = []
+        if text:
+            parts.append({"type": "text", "text": text})
+
+        # Add file reference part
+        parts.append(
+            {"type": "file_data", "file_data": {"mime_type": mime_type, "file_uri": file_uri}}
+        )
+
+        # Add to messages
+        message = {"role": "user", "content": parts}
+        self.messages.append(message)
+        logger.info(f"Added file reference to context: {file_uri}")
+
     def get_messages_for_initializing_history(self):
         """Get messages formatted for Gemini history initialization.
 
```
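Together with `GeminiFileAPI`, this lets an uploaded file be attached to a live conversation. A sketch, under the assumption that the Files API response carries `file.uri` and `file.mimeType` (the public Files API response shape; not shown in this diff) and that `llm.file_api` is the client instance created in a later hunk:

```python
from pipecat.services.gemini_multimodal_live.gemini import (
    GeminiMultimodalLiveContext,
    GeminiMultimodalLiveLLMService,
)


async def attach_file(llm: GeminiMultimodalLiveLLMService, context: GeminiMultimodalLiveContext):
    # Upload via the service's GeminiFileAPI client, then reference the file
    # in the context so it is sent during history initialization.
    uploaded = (await llm.file_api.upload_file("diagram.png"))["file"]
    context.add_file_reference(
        file_uri=uploaded["uri"],
        mime_type=uploaded["mimeType"],
        text="Describe this diagram.",
    )
```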
```diff
@@ -242,6 +269,17 @@ class GeminiMultimodalLiveContext(OpenAILLMContext):
                 for part in content:
                     if part.get("type") == "text":
                         parts.append({"text": part.get("text")})
+                    elif part.get("type") == "file_data":
+                        file_data = part.get("file_data", {})
+
+                        parts.append(
+                            {
+                                "fileData": {
+                                    "mimeType": file_data.get("mime_type"),
+                                    "fileUri": file_data.get("file_uri"),
+                                }
+                            }
+                        )
                     else:
                         logger.warning(f"Unsupported content type: {str(part)[:80]}")
             else:
@@ -333,14 +371,26 @@ class GeminiMultimodalLiveContextAggregatorPair:
 
 
 class GeminiMultimodalModalities(Enum):
-    """Supported modalities for Gemini Multimodal Live."""
+    """Supported modalities for Gemini Multimodal Live.
+
+    Parameters:
+        TEXT: Text responses.
+        AUDIO: Audio responses.
+    """
 
     TEXT = "TEXT"
     AUDIO = "AUDIO"
 
 
 class GeminiMediaResolution(str, Enum):
-    """Media resolution options for Gemini Multimodal Live."""
+    """Media resolution options for Gemini Multimodal Live.
+
+    Parameters:
+        UNSPECIFIED: Use default resolution setting.
+        LOW: Low resolution with 64 tokens.
+        MEDIUM: Medium resolution with 256 tokens.
+        HIGH: High resolution with zoomed reframing and 256 tokens.
+    """
 
     UNSPECIFIED = "MEDIA_RESOLUTION_UNSPECIFIED"  # Use default
     LOW = "MEDIA_RESOLUTION_LOW"  # 64 tokens
@@ -422,20 +472,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
     This service enables real-time conversations with Gemini, supporting both
     text and audio modalities. It handles voice transcription, streaming audio
     responses, and tool usage.
-
-    Args:
-        api_key: Google AI API key for authentication.
-        base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
-        model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
-        voice_id: TTS voice identifier. Defaults to "Charon".
-        start_audio_paused: Whether to start with audio input paused. Defaults to False.
-        start_video_paused: Whether to start with video input paused. Defaults to False.
-        system_instruction: System prompt for the model. Defaults to None.
-        tools: Tools/functions available to the model. Defaults to None.
-        params: Configuration parameters for the model. Defaults to InputParams().
-        inference_on_context_initialization: Whether to generate a response when context
-            is first set. Defaults to True.
-        **kwargs: Additional arguments passed to parent LLMService.
     """
 
     # Overriding the default adapter to use the Gemini one.
@@ -454,8 +490,26 @@ class GeminiMultimodalLiveLLMService(LLMService):
         tools: Optional[Union[List[dict], ToolsSchema]] = None,
         params: Optional[InputParams] = None,
         inference_on_context_initialization: bool = True,
+        file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
         **kwargs,
     ):
+        """Initialize the Gemini Multimodal Live LLM service.
+
+        Args:
+            api_key: Google AI API key for authentication.
+            base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
+            model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
+            voice_id: TTS voice identifier. Defaults to "Charon".
+            start_audio_paused: Whether to start with audio input paused. Defaults to False.
+            start_video_paused: Whether to start with video input paused. Defaults to False.
+            system_instruction: System prompt for the model. Defaults to None.
+            tools: Tools/functions available to the model. Defaults to None.
+            params: Configuration parameters for the model. Defaults to InputParams().
+            inference_on_context_initialization: Whether to generate a response when context
+                is first set. Defaults to True.
+            file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
+            **kwargs: Additional arguments passed to parent LLMService.
+        """
        super().__init__(base_url=base_url, **kwargs)
 
         params = params or InputParams()
```
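The constructor docstring moves from the class body (previous hunk) into `__init__`, and the new `file_api_base_url` keyword is added alongside it. A minimal construction sketch using only parameters documented above; the explicit `file_api_base_url` simply restates the default:

```python
import os

from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService

llm = GeminiMultimodalLiveLLMService(
    api_key=os.environ["GOOGLE_API_KEY"],
    voice_id="Charon",  # the documented default voice
    # New in this release; shown here with its default value.
    file_api_base_url="https://generativelanguage.googleapis.com/v1beta/files",
)
```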
```diff
@@ -516,6 +570,13 @@ class GeminiMultimodalLiveLLMService(LLMService):
             "extra": params.extra if isinstance(params.extra, dict) else {},
         }
 
+        # Initialize the File API client
+        self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
+
+        # Grounding metadata tracking
+        self._search_result_buffer = ""
+        self._accumulated_grounding_metadata = None
+
     def can_generate_metrics(self) -> bool:
         """Check if the service can generate usage metrics.
 
@@ -524,6 +585,17 @@ class GeminiMultimodalLiveLLMService(LLMService):
         """
         return True
 
+    def needs_mcp_alternate_schema(self) -> bool:
+        """Check if this LLM service requires alternate MCP schema.
+
+        Google/Gemini has stricter JSON schema validation and requires
+        certain properties to be removed or modified for compatibility.
+
+        Returns:
+            True for Google/Gemini services.
+        """
+        return True
+
     def set_audio_input_paused(self, paused: bool):
         """Set the audio input pause state.
 
@@ -666,6 +738,9 @@ class GeminiMultimodalLiveLLMService(LLMService):
             # Support just one tool call per context frame for now
             tool_result_message = context.messages[-1]
             await self._tool_result(tool_result_message)
+        elif isinstance(frame, InputTextRawFrame):
+            await self._send_user_text(frame.text)
+            await self.push_frame(frame, direction)
         elif isinstance(frame, InputAudioRawFrame):
             await self._send_user_audio(frame)
             await self.push_frame(frame, direction)
```
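With the new branch above, any `InputTextRawFrame` that reaches the service is forwarded to Gemini and then re-pushed downstream. A sketch of a pass-through processor that injects typed user input; it assumes the frame's dataclass field is `text`, as the `frame.text` access above implies, and uses pipecat's standard processor pattern:

```python
from pipecat.frames.frames import Frame, InputTextRawFrame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


class TypedInputInjector(FrameProcessor):
    """Pass-through processor with a helper for injecting typed user input."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        await self.push_frame(frame, direction)

    async def send_text(self, text: str):
        # Downstream, GeminiMultimodalLiveLLMService calls _send_user_text(),
        # which wraps the text in a realtimeInput TextInputMessage.
        await self.push_frame(InputTextRawFrame(text=text), FrameDirection.DOWNSTREAM)
```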
```diff
@@ -709,6 +784,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self._ws_send(event.model_dump(exclude_none=True))
 
     async def _connect(self):
+        """Establish WebSocket connection to Gemini Live API."""
         if self._websocket:
             # Here we assume that if we have a websocket, we are connected. We
             # handle disconnections in the send/recv code paths.
@@ -718,7 +794,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         try:
             logger.info(f"Connecting to wss://{self._base_url}")
             uri = f"wss://{self._base_url}?key={self._api_key}"
-            self._websocket = await websockets.connect(uri=uri)
+            self._websocket = await websocket_connect(uri=uri)
             self._receive_task = self.create_task(self._receive_task_handler())
 
             # Create the basic configuration
@@ -813,6 +889,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         self._websocket = None
 
     async def _disconnect(self):
+        """Disconnect from Gemini Live API and clean up resources."""
         logger.info("Disconnecting from Gemini service")
         try:
             self._disconnecting = True
@@ -829,6 +906,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             logger.error(f"{self} error disconnecting: {e}")
 
     async def _ws_send(self, message):
+        """Send a message to the WebSocket connection."""
         # logger.debug(f"Sending message to websocket: {message}")
         try:
             if self._websocket:
@@ -849,7 +927,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
     #
 
     async def _receive_task_handler(self):
-        async for message in WatchdogAsyncIterator(self._websocket, manager=self.task_manager):
+        """Handle incoming messages from the WebSocket connection."""
+        async for message in self._websocket:
             evt = events.parse_server_event(message)
             # logger.debug(f"Received event: {message[:500]}")
             # logger.debug(f"Received event: {evt}")
@@ -865,6 +944,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
                 await self._handle_evt_input_transcription(evt)
             elif evt.serverContent and evt.serverContent.outputTranscription:
                 await self._handle_evt_output_transcription(evt)
+            elif evt.serverContent and evt.serverContent.groundingMetadata:
+                await self._handle_evt_grounding_metadata(evt)
             elif evt.toolCall:
                 await self._handle_evt_tool_call(evt)
             elif False:  # !!! todo: error events?
@@ -877,6 +958,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
     #
 
     async def _send_user_audio(self, frame):
+        """Send user audio frame to Gemini Live API."""
         if self._audio_input_paused:
             return
         # Send all audio to Gemini
@@ -892,7 +974,25 @@ class GeminiMultimodalLiveLLMService(LLMService):
             length = int((frame.sample_rate * frame.num_channels * 2) * 0.5)
             self._user_audio_buffer = self._user_audio_buffer[-length:]
 
+    async def _send_user_text(self, text: str):
+        """Send user text via Gemini Live API's realtime input stream.
+
+        This method sends text through the realtimeInput stream (via TextInputMessage)
+        rather than the clientContent stream. This ensures text input is synchronized
+        with audio and video inputs, preventing temporal misalignment that can occur
+        when different modalities are processed through separate API pathways.
+
+        For realtimeInput, turn completion is automatically inferred by the API based
+        on user activity, so no explicit turnComplete signal is needed.
+
+        Args:
+            text: The text to send as user input.
+        """
+        evt = events.TextInputMessage.from_text(text)
+        await self.send_client_event(evt)
+
     async def _send_user_video(self, frame):
+        """Send user video frame to Gemini Live API."""
         if self._video_input_paused:
             return
```
```diff
@@ -906,6 +1006,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self.send_client_event(evt)
 
     async def _create_initial_response(self):
+        """Create initial response based on context history."""
         if not self._api_session_ready:
             self._run_llm_when_api_session_ready = True
             return
@@ -931,7 +1032,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
         self._needs_turn_complete_message = True
 
     async def _create_single_response(self, messages_list):
-        # Refactor to combine this logic with same logic in GeminiMultimodalLiveContext
+        """Create a single response from a list of messages."""
+        # Refactor to combine this logic with same logic in GeminiMultimodalLiveContext
         messages = []
         for item in messages_list:
             role = item.get("role")
@@ -950,6 +1052,17 @@ class GeminiMultimodalLiveLLMService(LLMService):
                 for part in content:
                     if part.get("type") == "text":
                         parts.append({"text": part.get("text")})
+                    elif part.get("type") == "file_data":
+                        file_data = part.get("file_data", {})
+
+                        parts.append(
+                            {
+                                "fileData": {
+                                    "mimeType": file_data.get("mime_type"),
+                                    "fileUri": file_data.get("file_uri"),
+                                }
+                            }
+                        )
                     else:
                         logger.warning(f"Unsupported content type: {str(part)[:80]}")
             else:
@@ -973,6 +1086,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
 
     @traced_gemini_live(operation="llm_tool_result")
     async def _tool_result(self, tool_result_message):
+        """Send tool result back to the API."""
         # For now we're shoving the name into the tool_call_id field, so this
         # will work until we revisit that.
         id = tool_result_message.get("tool_call_id")
@@ -998,6 +1112,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
 
     @traced_gemini_live(operation="llm_setup")
     async def _handle_evt_setup_complete(self, evt):
+        """Handle the setup complete event."""
         # If this is our first context frame, run the LLM
         self._api_session_ready = True
         # Now that we've configured the session, we can run the LLM if we need to.
@@ -1006,6 +1121,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self._create_initial_response()
 
     async def _handle_evt_model_turn(self, evt):
+        """Handle the model turn event."""
         part = evt.serverContent.modelTurn.parts[0]
         if not part:
             return
@@ -1019,8 +1135,13 @@ class GeminiMultimodalLiveLLMService(LLMService):
             await self.push_frame(LLMFullResponseStartFrame())
 
         self._bot_text_buffer += text
+        self._search_result_buffer += text  # Also accumulate for grounding
         await self.push_frame(LLMTextFrame(text=text))
 
+        # Check for grounding metadata in server content
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
+
         inline_data = part.inlineData
         if not inline_data:
             return
@@ -1047,6 +1168,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
 
     @traced_gemini_live(operation="llm_tool_call")
     async def _handle_evt_tool_call(self, evt):
+        """Handle tool call events."""
         function_calls = evt.toolCall.functionCalls
         if not function_calls:
             return
@@ -1067,6 +1189,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
 
     @traced_gemini_live(operation="llm_response")
     async def _handle_evt_turn_complete(self, evt):
+        """Handle the turn complete event."""
         self._bot_is_speaking = False
         text = self._bot_text_buffer
 
@@ -1086,6 +1209,16 @@ class GeminiMultimodalLiveLLMService(LLMService):
         self._bot_text_buffer = ""
         self._llm_output_buffer = ""
 
+        # Process grounding metadata if we have accumulated any
+        if self._accumulated_grounding_metadata:
+            await self._process_grounding_metadata(
+                self._accumulated_grounding_metadata, self._search_result_buffer
+            )
+
+            # Reset grounding tracking for next response
+            self._search_result_buffer = ""
+            self._accumulated_grounding_metadata = None
+
         # Only push the TTSStoppedFrame if the bot is outputting audio
         # when text is found, modalities is set to TEXT and no audio
         # is produced.
@@ -1150,6 +1283,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         )
 
     async def _handle_evt_output_transcription(self, evt):
+        """Handle the output transcription event."""
         if not evt.serverContent.outputTranscription:
             return
 
@@ -1161,13 +1295,76 @@ class GeminiMultimodalLiveLLMService(LLMService):
         if not text:
             return
 
+        # Accumulate text for grounding as well
+        self._search_result_buffer += text
+
+        # Check for grounding metadata in server content
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
         # Collect text for tracing
         self._llm_output_buffer += text
 
         await self.push_frame(LLMTextFrame(text=text))
         await self.push_frame(TTSTextFrame(text=text))
 
+    async def _handle_evt_grounding_metadata(self, evt):
+        """Handle dedicated grounding metadata events."""
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            grounding_metadata = evt.serverContent.groundingMetadata
+            # Process the grounding metadata immediately
+            await self._process_grounding_metadata(grounding_metadata, self._search_result_buffer)
+
+    async def _process_grounding_metadata(
+        self, grounding_metadata: events.GroundingMetadata, search_result: str = ""
+    ):
+        """Process grounding metadata and emit LLMSearchResponseFrame."""
+        if not grounding_metadata:
+            return
+
+        # Extract rendered content for search suggestions
+        rendered_content = None
+        if (
+            grounding_metadata.searchEntryPoint
+            and grounding_metadata.searchEntryPoint.renderedContent
+        ):
+            rendered_content = grounding_metadata.searchEntryPoint.renderedContent
+
+        # Convert grounding chunks and supports to LLMSearchOrigin format
+        origins = []
+
+        if grounding_metadata.groundingChunks and grounding_metadata.groundingSupports:
+            # Create a mapping of chunk indices to origins
+            chunk_to_origin = {}
+
+            for index, chunk in enumerate(grounding_metadata.groundingChunks):
+                if chunk.web:
+                    origin = LLMSearchOrigin(
+                        site_uri=chunk.web.uri, site_title=chunk.web.title, results=[]
+                    )
+                    chunk_to_origin[index] = origin
+                    origins.append(origin)
+
+            # Add grounding support results to the appropriate origins
+            for support in grounding_metadata.groundingSupports:
+                if support.segment and support.groundingChunkIndices:
+                    text = support.segment.text or ""
+                    confidence_scores = support.confidenceScores or []
+
+                    # Add this result to all origins referenced by this support
+                    for chunk_index in support.groundingChunkIndices:
+                        if chunk_index in chunk_to_origin:
+                            result = LLMSearchResult(text=text, confidence=confidence_scores)
+                            chunk_to_origin[chunk_index].results.append(result)
+
+        # Create and push the search response frame
+        search_frame = LLMSearchResponseFrame(
+            search_result=search_result, origins=origins, rendered_content=rendered_content
+        )
+
+        await self.push_frame(search_frame)
+
     async def _handle_evt_usage_metadata(self, evt):
+        """Handle the usage metadata event."""
         if not evt.usageMetadata:
             return
 
```
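End to end: `_handle_evt_model_turn` and `_handle_evt_output_transcription` accumulate response text into `_search_result_buffer`, grounding metadata is captured from server events, and `_process_grounding_metadata` emits one `LLMSearchResponseFrame` per turn. A sketch of a downstream processor that logs the citations; the frame and field names come from this diff, while the logging format is illustrative:

```python
from pipecat.frames.frames import Frame
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
from pipecat.services.google.frames import LLMSearchResponseFrame


class SearchCitationLogger(FrameProcessor):
    """Logs grounded-search citations emitted by the Gemini Live service."""

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)
        if isinstance(frame, LLMSearchResponseFrame):
            for origin in frame.origins:
                print(f"source: {origin.site_title} ({origin.site_uri})")
                for result in origin.results:
                    print(f"  cited: {result.text[:60]!r} confidence={result.confidence}")
        await self.push_frame(frame, direction)
```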