dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/gemini_multimodal_live/file_api.py
@@ -0,0 +1,189 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Gemini File API client for uploading and managing files.
+
+This module provides a client for Google's Gemini File API, enabling file
+uploads, metadata retrieval, listing, and deletion. Files uploaded through
+this API can be referenced in Gemini generative model calls.
+"""
+
+import mimetypes
+from typing import Any, Dict, Optional
+
+import aiohttp
+from loguru import logger
+
+
+class GeminiFileAPI:
+    """Client for the Gemini File API.
+
+    This class provides methods for uploading, fetching, listing, and deleting files
+    through Google's Gemini File API.
+
+    Files uploaded through this API remain available for 48 hours and can be referenced
+    in calls to the Gemini generative models. Maximum file size is 2GB, with total
+    project storage limited to 20GB.
+    """
+
+    def __init__(
+        self, api_key: str, base_url: str = "https://generativelanguage.googleapis.com/v1beta/files"
+    ):
+        """Initialize the Gemini File API client.
+
+        Args:
+            api_key: Google AI API key
+            base_url: Base URL for the Gemini File API (default is the v1beta endpoint)
+        """
+        self._api_key = api_key
+        self._base_url = base_url
+        # Upload URL uses the /upload/ path
+        self.upload_base_url = "https://generativelanguage.googleapis.com/upload/v1beta/files"
+
+    async def upload_file(
+        self, file_path: str, display_name: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """Upload a file to the Gemini File API using the resumable upload protocol.
+
+        Args:
+            file_path: Path to the file to upload
+            display_name: Optional display name for the file
+
+        Returns:
+            File metadata including uri, name, and display_name
+        """
+        logger.info(f"Uploading file: {file_path}")
+
+        async with aiohttp.ClientSession() as session:
+            # Determine the file's MIME type
+            mime_type, _ = mimetypes.guess_type(file_path)
+            if not mime_type:
+                mime_type = "application/octet-stream"
+
+            # Read the file
+            with open(file_path, "rb") as f:
+                file_data = f.read()
+
+            # Create the metadata payload
+            metadata = {}
+            if display_name:
+                metadata = {"file": {"display_name": display_name}}
+
+            # Step 1: Initial resumable request to get the upload URL
+            headers = {
+                "X-Goog-Upload-Protocol": "resumable",
+                "X-Goog-Upload-Command": "start",
+                "X-Goog-Upload-Header-Content-Length": str(len(file_data)),
+                "X-Goog-Upload-Header-Content-Type": mime_type,
+                "Content-Type": "application/json",
+            }
+
+            logger.debug(f"Step 1: Getting upload URL from {self.upload_base_url}")
+            async with session.post(
+                f"{self.upload_base_url}?key={self._api_key}", headers=headers, json=metadata
+            ) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error initiating file upload: {error_text}")
+                    raise Exception(f"Failed to initiate upload: {response.status} - {error_text}")
+
+                # Get the upload URL from the response header
+                upload_url = response.headers.get("X-Goog-Upload-URL")
+                if not upload_url:
+                    logger.error(f"Response headers: {dict(response.headers)}")
+                    raise Exception("No upload URL in response headers")
+
+            logger.debug(f"Got upload URL: {upload_url}")
+
+            # Step 2: Upload the actual file data
+            upload_headers = {
+                "Content-Length": str(len(file_data)),
+                "X-Goog-Upload-Offset": "0",
+                "X-Goog-Upload-Command": "upload, finalize",
+            }
+
+            logger.debug(f"Step 2: Uploading file data to {upload_url}")
+            async with session.post(upload_url, headers=upload_headers, data=file_data) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error uploading file data: {error_text}")
+                    raise Exception(f"Failed to upload file: {response.status} - {error_text}")
+
+                file_info = await response.json()
+                logger.info(f"File uploaded successfully: {file_info.get('file', {}).get('name')}")
+                return file_info
+
+    async def get_file(self, name: str) -> Dict[str, Any]:
+        """Get metadata for a file.
+
+        Args:
+            name: File name (or full path)
+
+        Returns:
+            File metadata
+        """
+        # Extract just the name part if a full path is provided
+        if "/" in name:
+            name = name.split("/")[-1]
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self._base_url}/{name}?key={self._api_key}") as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error getting file metadata: {error_text}")
+                    raise Exception(f"Failed to get file metadata: {response.status}")
+
+                file_info = await response.json()
+                return file_info
+
+    async def list_files(
+        self, page_size: int = 10, page_token: Optional[str] = None
+    ) -> Dict[str, Any]:
+        """List uploaded files.
+
+        Args:
+            page_size: Number of files to return per page
+            page_token: Token for pagination
+
+        Returns:
+            List of files and next page token if available
+        """
+        params = {"key": self._api_key, "pageSize": page_size}
+
+        if page_token:
+            params["pageToken"] = page_token
+
+        async with aiohttp.ClientSession() as session:
+            async with session.get(self._base_url, params=params) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error listing files: {error_text}")
+                    raise Exception(f"Failed to list files: {response.status}")
+
+                result = await response.json()
+                return result
+
+    async def delete_file(self, name: str) -> bool:
+        """Delete a file.
+
+        Args:
+            name: File name (or full path)
+
+        Returns:
+            True if deleted successfully
+        """
+        # Extract just the name part if a full path is provided
+        if "/" in name:
+            name = name.split("/")[-1]
+
+        async with aiohttp.ClientSession() as session:
+            async with session.delete(f"{self._base_url}/{name}?key={self._api_key}") as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    logger.error(f"Error deleting file: {error_text}")
+                    raise Exception(f"Failed to delete file: {response.status}")
+
+        return True
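Note: for orientation, a minimal usage sketch of the new GeminiFileAPI client (the file path, display name, and environment variable are illustrative, not part of this release):

    import asyncio
    import os

    from pipecat.services.gemini_multimodal_live.file_api import GeminiFileAPI

    async def main():
        # Upload a local file; the response nests metadata under "file".
        file_api = GeminiFileAPI(api_key=os.environ["GOOGLE_API_KEY"])
        file_info = await file_api.upload_file("meeting_notes.pdf", display_name="Meeting notes")
        print(file_info["file"]["uri"])  # URI to reference in model calls

        # Housekeeping: list and delete uploads (files expire after 48 hours anyway).
        print(await file_api.list_files(page_size=5))
        await file_api.delete_file(file_info["file"]["name"])

    asyncio.run(main())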
pipecat/services/gemini_multimodal_live/gemini.py
@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     Frame,
     InputAudioRawFrame,
     InputImageRawFrame,
+    InputTextRawFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesAppendFrame,
@@ -59,21 +60,22 @@ from pipecat.processors.aggregators.openai_llm_context import (
     OpenAILLMContextFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.google.frames import LLMSearchOrigin, LLMSearchResponseFrame, LLMSearchResult
 from pipecat.services.llm_service import FunctionCallFromLLM, LLMService
 from pipecat.services.openai.llm import (
     OpenAIAssistantContextAggregator,
     OpenAIUserContextAggregator,
 )
 from pipecat.transcriptions.language import Language
-from pipecat.utils.asyncio.watchdog_async_iterator import WatchdogAsyncIterator
 from pipecat.utils.string import match_endofsentence
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_gemini_live, traced_stt

 from . import events
+from .file_api import GeminiFileAPI

 try:
-    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use Google AI, you need to `pip install pipecat-ai[google]`.")
@@ -218,6 +220,31 @@ class GeminiMultimodalLiveContext(OpenAILLMContext):
             system_instruction += str(content)
         return system_instruction

+    def add_file_reference(self, file_uri: str, mime_type: str, text: Optional[str] = None):
+        """Add a file reference to the context.
+
+        This adds a user message with a file reference that will be sent during
+        context initialization.
+
+        Args:
+            file_uri: URI of the uploaded file
+            mime_type: MIME type of the file
+            text: Optional text prompt to accompany the file
+        """
+        # Create parts list with file reference
+        parts = []
+        if text:
+            parts.append({"type": "text", "text": text})
+
+        # Add file reference part
+        parts.append(
+            {"type": "file_data", "file_data": {"mime_type": mime_type, "file_uri": file_uri}}
+        )
+
+        # Add to messages
+        message = {"role": "user", "content": parts}
+        self.messages.append(message)
+        logger.info(f"Added file reference to context: {file_uri}")
+
     def get_messages_for_initializing_history(self):
         """Get messages formatted for Gemini history initialization.

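A companion sketch showing how an uploaded file is wired into the Live context (the upload dict is assumed to come from a prior GeminiFileAPI.upload_file() call; the prompt text is illustrative):

    from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveContext

    def build_context(uploaded: dict) -> GeminiMultimodalLiveContext:
        context = GeminiMultimodalLiveContext(messages=[])
        context.add_file_reference(
            file_uri=uploaded["file"]["uri"],
            mime_type=uploaded["file"]["mimeType"],
            text="Summarize this document for the caller.",
        )
        # get_messages_for_initializing_history() later converts this part into
        # Gemini's {"fileData": {"mimeType": ..., "fileUri": ...}} shape.
        return context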
@@ -242,6 +269,17 @@ class GeminiMultimodalLiveContext(OpenAILLMContext):
                 for part in content:
                     if part.get("type") == "text":
                         parts.append({"text": part.get("text")})
+                    elif part.get("type") == "file_data":
+                        file_data = part.get("file_data", {})
+
+                        parts.append(
+                            {
+                                "fileData": {
+                                    "mimeType": file_data.get("mime_type"),
+                                    "fileUri": file_data.get("file_uri"),
+                                }
+                            }
+                        )
                     else:
                         logger.warning(f"Unsupported content type: {str(part)[:80]}")
             else:
@@ -333,14 +371,26 @@ class GeminiMultimodalLiveContextAggregatorPair:


 class GeminiMultimodalModalities(Enum):
-    """Supported modalities for Gemini Multimodal Live."""
+    """Supported modalities for Gemini Multimodal Live.
+
+    Parameters:
+        TEXT: Text responses.
+        AUDIO: Audio responses.
+    """

     TEXT = "TEXT"
     AUDIO = "AUDIO"


 class GeminiMediaResolution(str, Enum):
-    """Media resolution options for Gemini Multimodal Live."""
+    """Media resolution options for Gemini Multimodal Live.
+
+    Parameters:
+        UNSPECIFIED: Use default resolution setting.
+        LOW: Low resolution with 64 tokens.
+        MEDIUM: Medium resolution with 256 tokens.
+        HIGH: High resolution with zoomed reframing and 256 tokens.
+    """

     UNSPECIFIED = "MEDIA_RESOLUTION_UNSPECIFIED"  # Use default
     LOW = "MEDIA_RESOLUTION_LOW"  # 64 tokens
@@ -422,20 +472,6 @@ class GeminiMultimodalLiveLLMService(LLMService):
     This service enables real-time conversations with Gemini, supporting both
     text and audio modalities. It handles voice transcription, streaming audio
     responses, and tool usage.
-
-    Args:
-        api_key: Google AI API key for authentication.
-        base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
-        model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
-        voice_id: TTS voice identifier. Defaults to "Charon".
-        start_audio_paused: Whether to start with audio input paused. Defaults to False.
-        start_video_paused: Whether to start with video input paused. Defaults to False.
-        system_instruction: System prompt for the model. Defaults to None.
-        tools: Tools/functions available to the model. Defaults to None.
-        params: Configuration parameters for the model. Defaults to InputParams().
-        inference_on_context_initialization: Whether to generate a response when context
-            is first set. Defaults to True.
-        **kwargs: Additional arguments passed to parent LLMService.
     """

     # Overriding the default adapter to use the Gemini one.
@@ -454,8 +490,26 @@ class GeminiMultimodalLiveLLMService(LLMService):
         tools: Optional[Union[List[dict], ToolsSchema]] = None,
         params: Optional[InputParams] = None,
         inference_on_context_initialization: bool = True,
+        file_api_base_url: str = "https://generativelanguage.googleapis.com/v1beta/files",
         **kwargs,
     ):
+        """Initialize the Gemini Multimodal Live LLM service.
+
+        Args:
+            api_key: Google AI API key for authentication.
+            base_url: API endpoint base URL. Defaults to the official Gemini Live endpoint.
+            model: Model identifier to use. Defaults to "models/gemini-2.0-flash-live-001".
+            voice_id: TTS voice identifier. Defaults to "Charon".
+            start_audio_paused: Whether to start with audio input paused. Defaults to False.
+            start_video_paused: Whether to start with video input paused. Defaults to False.
+            system_instruction: System prompt for the model. Defaults to None.
+            tools: Tools/functions available to the model. Defaults to None.
+            params: Configuration parameters for the model. Defaults to InputParams().
+            inference_on_context_initialization: Whether to generate a response when context
+                is first set. Defaults to True.
+            file_api_base_url: Base URL for the Gemini File API. Defaults to the official endpoint.
+            **kwargs: Additional arguments passed to parent LLMService.
+        """
         super().__init__(base_url=base_url, **kwargs)

         params = params or InputParams()
@@ -516,6 +570,13 @@ class GeminiMultimodalLiveLLMService(LLMService):
             "extra": params.extra if isinstance(params.extra, dict) else {},
         }

+        # Initialize the File API client
+        self.file_api = GeminiFileAPI(api_key=api_key, base_url=file_api_base_url)
+
+        # Grounding metadata tracking
+        self._search_result_buffer = ""
+        self._accumulated_grounding_metadata = None
+
     def can_generate_metrics(self) -> bool:
         """Check if the service can generate usage metrics.

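For reference, a construction sketch using the new file_api_base_url parameter (the environment variable and file path are placeholders):

    import os

    from pipecat.services.gemini_multimodal_live.gemini import GeminiMultimodalLiveLLMService

    async def make_service() -> GeminiMultimodalLiveLLMService:
        llm = GeminiMultimodalLiveLLMService(
            api_key=os.environ["GOOGLE_API_KEY"],
            voice_id="Charon",
            # Default shown explicitly; override only if you proxy the File API.
            file_api_base_url="https://generativelanguage.googleapis.com/v1beta/files",
        )
        # The File API client is exposed on the service itself.
        await llm.file_api.upload_file("meeting_notes.pdf")
        return llm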
@@ -524,6 +585,17 @@ class GeminiMultimodalLiveLLMService(LLMService):
         """
         return True

+    def needs_mcp_alternate_schema(self) -> bool:
+        """Check if this LLM service requires an alternate MCP schema.
+
+        Google/Gemini has stricter JSON schema validation and requires
+        certain properties to be removed or modified for compatibility.
+
+        Returns:
+            True for Google/Gemini services.
+        """
+        return True
+
     def set_audio_input_paused(self, paused: bool):
         """Set the audio input pause state.

@@ -666,6 +738,9 @@ class GeminiMultimodalLiveLLMService(LLMService):
             # Support just one tool call per context frame for now
             tool_result_message = context.messages[-1]
             await self._tool_result(tool_result_message)
+        elif isinstance(frame, InputTextRawFrame):
+            await self._send_user_text(frame.text)
+            await self.push_frame(frame, direction)
         elif isinstance(frame, InputAudioRawFrame):
             await self._send_user_audio(frame)
             await self.push_frame(frame, direction)
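With the new InputTextRawFrame branch, typed input can be injected into a running Live session; a sketch assuming a pipecat PipelineTask named task whose pipeline contains this service:

    from pipecat.frames.frames import InputTextRawFrame

    async def send_typed_input(task):
        # Forwarded over the realtimeInput stream by _send_user_text(), then
        # pushed downstream unchanged.
        await task.queue_frames([InputTextRawFrame(text="What's on my calendar today?")])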
@@ -709,6 +784,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self._ws_send(event.model_dump(exclude_none=True))

     async def _connect(self):
+        """Establish WebSocket connection to Gemini Live API."""
         if self._websocket:
             # Here we assume that if we have a websocket, we are connected. We
             # handle disconnections in the send/recv code paths.
@@ -718,7 +794,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         try:
             logger.info(f"Connecting to wss://{self._base_url}")
             uri = f"wss://{self._base_url}?key={self._api_key}"
-            self._websocket = await websockets.connect(uri=uri)
+            self._websocket = await websocket_connect(uri=uri)
             self._receive_task = self.create_task(self._receive_task_handler())

             # Create the basic configuration
@@ -813,6 +889,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             self._websocket = None

     async def _disconnect(self):
+        """Disconnect from Gemini Live API and clean up resources."""
         logger.info("Disconnecting from Gemini service")
         try:
             self._disconnecting = True
@@ -829,6 +906,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             logger.error(f"{self} error disconnecting: {e}")

     async def _ws_send(self, message):
+        """Send a message to the WebSocket connection."""
         # logger.debug(f"Sending message to websocket: {message}")
         try:
             if self._websocket:
@@ -849,7 +927,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
     #

     async def _receive_task_handler(self):
-        async for message in WatchdogAsyncIterator(self._websocket, manager=self.task_manager):
+        """Handle incoming messages from the WebSocket connection."""
+        async for message in self._websocket:
             evt = events.parse_server_event(message)
             # logger.debug(f"Received event: {message[:500]}")
             # logger.debug(f"Received event: {evt}")
@@ -865,6 +944,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
                 await self._handle_evt_input_transcription(evt)
             elif evt.serverContent and evt.serverContent.outputTranscription:
                 await self._handle_evt_output_transcription(evt)
+            elif evt.serverContent and evt.serverContent.groundingMetadata:
+                await self._handle_evt_grounding_metadata(evt)
             elif evt.toolCall:
                 await self._handle_evt_tool_call(evt)
             elif False:  # !!! todo: error events?
@@ -877,6 +958,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
     #

     async def _send_user_audio(self, frame):
+        """Send user audio frame to Gemini Live API."""
         if self._audio_input_paused:
             return
         # Send all audio to Gemini
@@ -892,7 +974,25 @@ class GeminiMultimodalLiveLLMService(LLMService):
             length = int((frame.sample_rate * frame.num_channels * 2) * 0.5)
             self._user_audio_buffer = self._user_audio_buffer[-length:]

+    async def _send_user_text(self, text: str):
+        """Send user text via Gemini Live API's realtime input stream.
+
+        This method sends text through the realtimeInput stream (via TextInputMessage)
+        rather than the clientContent stream. This ensures text input is synchronized
+        with audio and video inputs, preventing temporal misalignment that can occur
+        when different modalities are processed through separate API pathways.
+
+        For realtimeInput, turn completion is automatically inferred by the API based
+        on user activity, so no explicit turnComplete signal is needed.
+
+        Args:
+            text: The text to send as user input.
+        """
+        evt = events.TextInputMessage.from_text(text)
+        await self.send_client_event(evt)
+
     async def _send_user_video(self, frame):
+        """Send user video frame to Gemini Live API."""
         if self._video_input_paused:
             return

@@ -906,6 +1006,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         await self.send_client_event(evt)

     async def _create_initial_response(self):
+        """Create initial response based on context history."""
         if not self._api_session_ready:
             self._run_llm_when_api_session_ready = True
             return
@@ -931,7 +1032,8 @@ class GeminiMultimodalLiveLLMService(LLMService):
             self._needs_turn_complete_message = True

     async def _create_single_response(self, messages_list):
-        # refactor to combine this logic with same logic in GeminiMultimodalLiveContext
+        """Create a single response from a list of messages."""
+        # Refactor to combine this logic with same logic in GeminiMultimodalLiveContext
         messages = []
         for item in messages_list:
             role = item.get("role")
@@ -950,6 +1052,17 @@ class GeminiMultimodalLiveLLMService(LLMService):
                 for part in content:
                     if part.get("type") == "text":
                         parts.append({"text": part.get("text")})
+                    elif part.get("type") == "file_data":
+                        file_data = part.get("file_data", {})
+
+                        parts.append(
+                            {
+                                "fileData": {
+                                    "mimeType": file_data.get("mime_type"),
+                                    "fileUri": file_data.get("file_uri"),
+                                }
+                            }
+                        )
                     else:
                         logger.warning(f"Unsupported content type: {str(part)[:80]}")
             else:
@@ -973,6 +1086,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_tool_result")
     async def _tool_result(self, tool_result_message):
+        """Send tool result back to the API."""
         # For now we're shoving the name into the tool_call_id field, so this
         # will work until we revisit that.
         id = tool_result_message.get("tool_call_id")
@@ -998,6 +1112,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_setup")
     async def _handle_evt_setup_complete(self, evt):
+        """Handle the setup complete event."""
         # If this is our first context frame, run the LLM
         self._api_session_ready = True
         # Now that we've configured the session, we can run the LLM if we need to.
@@ -1006,6 +1121,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
             await self._create_initial_response()

     async def _handle_evt_model_turn(self, evt):
+        """Handle the model turn event."""
         part = evt.serverContent.modelTurn.parts[0]
         if not part:
             return
@@ -1019,8 +1135,13 @@ class GeminiMultimodalLiveLLMService(LLMService):
             await self.push_frame(LLMFullResponseStartFrame())

         self._bot_text_buffer += text
+        self._search_result_buffer += text  # Also accumulate for grounding
         await self.push_frame(LLMTextFrame(text=text))

+        # Check for grounding metadata in server content
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
+
         inline_data = part.inlineData
         if not inline_data:
             return
@@ -1047,6 +1168,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_tool_call")
     async def _handle_evt_tool_call(self, evt):
+        """Handle tool call events."""
         function_calls = evt.toolCall.functionCalls
         if not function_calls:
             return
@@ -1067,6 +1189,7 @@ class GeminiMultimodalLiveLLMService(LLMService):

     @traced_gemini_live(operation="llm_response")
     async def _handle_evt_turn_complete(self, evt):
+        """Handle the turn complete event."""
         self._bot_is_speaking = False
         text = self._bot_text_buffer

@@ -1086,6 +1209,16 @@ class GeminiMultimodalLiveLLMService(LLMService):
         self._bot_text_buffer = ""
         self._llm_output_buffer = ""

+        # Process grounding metadata if we have accumulated any
+        if self._accumulated_grounding_metadata:
+            await self._process_grounding_metadata(
+                self._accumulated_grounding_metadata, self._search_result_buffer
+            )
+
+        # Reset grounding tracking for next response
+        self._search_result_buffer = ""
+        self._accumulated_grounding_metadata = None
+
         # Only push the TTSStoppedFrame if the bot is outputting audio
         # when text is found, modalities is set to TEXT and no audio
         # is produced.
@@ -1150,6 +1283,7 @@ class GeminiMultimodalLiveLLMService(LLMService):
         )

     async def _handle_evt_output_transcription(self, evt):
+        """Handle the output transcription event."""
         if not evt.serverContent.outputTranscription:
             return

@@ -1161,13 +1295,76 @@ class GeminiMultimodalLiveLLMService(LLMService):
         if not text:
             return

+        # Accumulate text for grounding as well
+        self._search_result_buffer += text
+
+        # Check for grounding metadata in server content
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            self._accumulated_grounding_metadata = evt.serverContent.groundingMetadata
         # Collect text for tracing
         self._llm_output_buffer += text

         await self.push_frame(LLMTextFrame(text=text))
         await self.push_frame(TTSTextFrame(text=text))

+    async def _handle_evt_grounding_metadata(self, evt):
+        """Handle dedicated grounding metadata events."""
+        if evt.serverContent and evt.serverContent.groundingMetadata:
+            grounding_metadata = evt.serverContent.groundingMetadata
+            # Process the grounding metadata immediately
+            await self._process_grounding_metadata(grounding_metadata, self._search_result_buffer)
+
+    async def _process_grounding_metadata(
+        self, grounding_metadata: events.GroundingMetadata, search_result: str = ""
+    ):
+        """Process grounding metadata and emit LLMSearchResponseFrame."""
+        if not grounding_metadata:
+            return
+
+        # Extract rendered content for search suggestions
+        rendered_content = None
+        if (
+            grounding_metadata.searchEntryPoint
+            and grounding_metadata.searchEntryPoint.renderedContent
+        ):
+            rendered_content = grounding_metadata.searchEntryPoint.renderedContent
+
+        # Convert grounding chunks and supports to LLMSearchOrigin format
+        origins = []
+
+        if grounding_metadata.groundingChunks and grounding_metadata.groundingSupports:
+            # Create a mapping of chunk indices to origins
+            chunk_to_origin = {}
+
+            for index, chunk in enumerate(grounding_metadata.groundingChunks):
+                if chunk.web:
+                    origin = LLMSearchOrigin(
+                        site_uri=chunk.web.uri, site_title=chunk.web.title, results=[]
+                    )
+                    chunk_to_origin[index] = origin
+                    origins.append(origin)
+
+            # Add grounding support results to the appropriate origins
+            for support in grounding_metadata.groundingSupports:
+                if support.segment and support.groundingChunkIndices:
+                    text = support.segment.text or ""
+                    confidence_scores = support.confidenceScores or []
+
+                    # Add this result to all origins referenced by this support
+                    for chunk_index in support.groundingChunkIndices:
+                        if chunk_index in chunk_to_origin:
+                            result = LLMSearchResult(text=text, confidence=confidence_scores)
+                            chunk_to_origin[chunk_index].results.append(result)
+
+        # Create and push the search response frame
+        search_frame = LLMSearchResponseFrame(
+            search_result=search_result, origins=origins, rendered_content=rendered_content
+        )
+
+        await self.push_frame(search_frame)
+
     async def _handle_evt_usage_metadata(self, evt):
+        """Handle the usage metadata event."""
         if not evt.usageMetadata:
             return
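Downstream processors can consume the new LLMSearchResponseFrame emitted above; a minimal sketch (the processor name and logging are hypothetical, not part of this release):

    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
    from pipecat.services.google.frames import LLMSearchResponseFrame

    class SearchResultLogger(FrameProcessor):
        """Hypothetical processor that logs grounding origins as they arrive."""

        async def process_frame(self, frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, LLMSearchResponseFrame):
                for origin in frame.origins:
                    print(f"Grounded by {origin.site_title}: {origin.site_uri}")
            await self.push_frame(frame, direction)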