dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff shows the changes between publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -15,9 +15,10 @@ service-specific adapter.
 """
 
 import base64
+import copy
 import io
 from dataclasses import dataclass
-from typing import Any, List, Optional, TypeAlias, Union
+from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
 
 from loguru import logger
 from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
@@ -31,6 +32,9 @@ from PIL import Image
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.frames.frames import AudioRawFrame
 
+if TYPE_CHECKING:
+    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+
 # "Re-export" types from OpenAI that we're using as universal context types.
 # NOTE: if universal message types need to someday diverge from OpenAI's, we
 # should consider managing our own definitions. But we should do so carefully,
@@ -65,6 +69,26 @@ class LLMContext:
     and content formatting.
     """
 
+    @staticmethod
+    def from_openai_context(openai_context: "OpenAILLMContext") -> "LLMContext":
+        """Create a universal LLM context from an OpenAI-specific context.
+
+        NOTE: this should only be used internally, for facilitating migration
+        from OpenAILLMContext to LLMContext. New user code should use
+        LLMContext directly.
+
+        Args:
+            openai_context: The OpenAI LLM context to convert.
+
+        Returns:
+            New LLMContext instance with converted messages and settings.
+        """
+        return LLMContext(
+            messages=openai_context.get_messages(),
+            tools=openai_context.tools,
+            tool_choice=openai_context.tool_choice,
+        )
+
     def __init__(
         self,
         messages: Optional[List[LLMContextMessage]] = None,
@@ -82,6 +106,19 @@ class LLMContext:
         self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
         self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
 
+    @property
+    def messages(self) -> List[LLMContextMessage]:
+        """Get the current messages list.
+
+        NOTE: This is equivalent to calling `get_messages()` with no filter. If
+        you want to filter out LLM-specific messages that don't pertain to your
+        LLM, use `get_messages()` directly.
+
+        Returns:
+            List of conversation messages.
+        """
+        return self.get_messages()
+
     def get_messages(self, llm_specific_filter: Optional[str] = None) -> List[LLMContextMessage]:
         """Get the current messages list.
 
@@ -89,7 +126,8 @@ class LLMContext:
             llm_specific_filter: Optional filter to return LLM-specific
                 messages for the given LLM, in addition to the standard
                 messages. If messages end up being filtered, an error will be
-                logged
+                logged; this is intended to catch accidental use of
+                incompatible LLM-specific messages.
 
         Returns:
             List of conversation messages.
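Note on the hunks above: the new `from_openai_context()` helper and the `messages` property give a migration path from `OpenAILLMContext` to the universal `LLMContext`. A minimal sketch of that conversion, assuming an `OpenAILLMContext` built elsewhere (its constructor arguments here are illustrative, not taken from this diff):

    from pipecat.processors.aggregators.llm_context import LLMContext
    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext

    # Assumed: an OpenAI-specific context created elsewhere in the pipeline.
    openai_context = OpenAILLMContext(messages=[{"role": "system", "content": "You are helpful."}])

    # Convert to the universal context (intended for internal migration, per the docstring above).
    context = LLMContext.from_openai_context(openai_context)

    # The new `messages` property is equivalent to get_messages() with no filter.
    assert context.messages == context.get_messages()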
@@ -23,7 +23,6 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -37,6 +36,7 @@ from pipecat.frames.frames import (
     FunctionCallsStartedFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesAppendFrame,
@@ -49,8 +49,8 @@ from pipecat.frames.frames import (
     OpenAILLMContextAssistantTimestampFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
+    TranscriptDropFrame,
     TranscriptionFrame,
     UserImageRawFrame,
     UserStartedSpeakingFrame,
@@ -139,7 +139,7 @@ class LLMFullResponseAggregator(FrameProcessor):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame,
+        if isinstance(frame, InterruptionFrame):
             await self._call_event_handler("on_completion", self._aggregation, False)
             self._aggregation = ""
             self._started = False
@@ -446,6 +446,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         self._latest_final_transcript = ""
         self._last_user_speaking_time = 0
         self._last_aggregation_push_time = 0
+        self._pending_transcription_ids: List[int] = []
 
     async def reset(self):
         """Reset the aggregation state and interruption strategies."""
@@ -453,6 +454,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         self._was_bot_speaking = False
         self._seen_interim_results = False
         self._waiting_for_aggregation = False
+        self._pending_transcription_ids.clear()
         [await s.reset() for s in self._interruption_strategies]
 
     async def handle_aggregation(self, aggregation: str):
@@ -470,8 +472,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             frame: The frame to process.
             direction: The direction of frame flow in the pipeline.
         """
-        if isinstance(frame,
-            self.logger.debug("Received
+        if isinstance(frame, InterruptionFrame):
+            self.logger.debug("Received InterruptionFrame")
         await super().process_frame(frame, direction)
 
         if isinstance(frame, StartFrame):
@@ -516,9 +518,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             self.set_tools(frame.tools)
         elif isinstance(frame, LLMSetToolChoiceFrame):
             self.set_tool_choice(frame.tool_choice)
-        elif isinstance(frame, LLMFullResponseStartFrame):
-            self._last_llm_response_start_time = time.time()
-            self._latest_final_transcript = ""
         elif isinstance(frame, SpeechControlParamsFrame):
             self._vad_params = frame.vad_params
             self._turn_params = frame.turn_params
@@ -545,13 +544,14 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
 
                 if should_interrupt:
                     self.logger.debug(
-                        "Interruption conditions met - pushing
+                        "Interruption conditions met - pushing interruption and aggregation"
                     )
-                    await self.
+                    await self.push_interruption_task_frame_and_wait()
                     await self._process_aggregation()
                 else:
                     self.logger.debug("Interruption conditions not met - not pushing aggregation")
-                    # Don't process aggregation,
+                    # Don't process aggregation, discard pending transcriptions and reset
+                    await self._discard_pending_transcriptions("interruption_conditions_not_met")
                     await self.reset()
             else:
                 if trigger_interruption:
@@ -559,7 +559,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
                         "Triggering interruption - pushing BotInterruptionFrame and aggregation"
                     )
                     # await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
-                    await self.push_frame(
+                    await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)
                     self.logger.debug("Pushed BotInterruptionFrame")
                 # No interruption config - normal behavior (always push aggregation)
                 await self._process_aggregation()
@@ -591,6 +591,13 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
 
         return any([await should_interrupt(s) for s in self._interruption_strategies])
 
+    async def _discard_pending_transcriptions(self, reason: str):
+        """Notify upstream processors that pending transcripts should be dropped."""
+        if self._pending_transcription_ids:
+            drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
+            await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
+            self._pending_transcription_ids.clear()
+
     async def _start(self, frame: StartFrame):
         self._create_aggregation_task()
 
@@ -617,10 +624,19 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         for s in self.interruption_strategies:
             await s.append_audio(frame.audio, frame.sample_rate)
 
+    async def _discard_pending_transcriptions(self, reason: str):
+        """Notify upstream processors that pending transcripts should be dropped."""
+        if self._pending_transcription_ids:
+            drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
+            await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
+            self._pending_transcription_ids.clear()
+
     async def _handle_user_started_speaking(self, frame: UserStartedSpeakingFrame):
         if len(self._aggregation) > 0:
             self.logger.debug(f"Dropping {self._aggregation}")
             self._aggregation = ""
+            await self._discard_pending_transcriptions("user_started_speaking")
+        self._latest_final_transcript = ""
         self._last_user_speaking_time = time.time()
         self._user_speaking = True
         self._waiting_for_aggregation = True
@@ -664,6 +680,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             return
 
         self._aggregation += f" {text}" if self._aggregation else text
+        self._pending_transcription_ids.append(frame.id)
         # We just got a final result, so let's reset interim results.
         self._seen_interim_results = False
 
@@ -686,7 +703,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         elif (
             not self._bot_speaking
            and time_since_stopped < 3.0
-            and time.time() - self._last_llm_response_start_time > 3.0
             and self._latest_final_transcript != text
         ):
             self.logger.debug(
@@ -794,6 +810,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         if self._bot_speaking and not self._params.enable_emulated_vad_interruptions:
             # If emulated VAD interruptions are disabled and bot is speaking, ignore
             logger.debug("Ignoring user speaking emulation, bot is speaking.")
+            await self._discard_pending_transcriptions("emulated_vad_ignored")
             await self.reset()
         else:
             # Either bot is not speaking, or emulated VAD interruptions are enabled
@@ -908,7 +925,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame,
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, LLMFullResponseStartFrame):
@@ -974,7 +991,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
             if frame.run_llm:
                 await self.push_context_frame(FrameDirection.UPSTREAM)
 
-    async def _handle_interruptions(self, frame:
+    async def _handle_interruptions(self, frame: InterruptionFrame):
         await self.push_aggregation()
         self._started = 0
         await self.reset()
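Note on the hunks above: `StartInterruptionFrame` and `BotInterruptionFrame` handling is replaced by the new `InterruptionFrame`. A minimal sketch of a custom processor following the same pattern (the processor itself is hypothetical; the frame and base classes are the ones imported in this diff):

    from pipecat.frames.frames import Frame, InterruptionFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

    class PartialTextAggregator(FrameProcessor):
        """Hypothetical processor that drops partial state when an interruption arrives."""

        def __init__(self):
            super().__init__()
            self._partial = ""

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, InterruptionFrame):
                # Reset partial aggregation on interruption, mirroring the aggregators above.
                self._partial = ""
            await self.push_frame(frame, direction)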
@@ -13,7 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
 
 import asyncio
 import json
-from
+from abc import abstractmethod
 from typing import Any, Dict, List, Literal, Optional, Set
 
 from loguru import logger
@@ -23,7 +23,6 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -37,6 +36,7 @@ from pipecat.frames.frames import (
     FunctionCallsStartedFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMContextAssistantTimestampFrame,
     LLMContextFrame,
     LLMFullResponseEndFrame,
@@ -48,7 +48,6 @@ from pipecat.frames.frames import (
     LLMSetToolsFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
     TranscriptionFrame,
     UserImageRawFrame,
@@ -171,6 +170,11 @@ class LLMContextAggregator(FrameProcessor):
         """Reset the aggregation state."""
         self._aggregation = ""
 
+    @abstractmethod
+    async def push_aggregation(self):
+        """Push the current aggregation downstream."""
+        pass
+
 
 class LLMUserAggregator(LLMContextAggregator):
     """User LLM aggregator that processes speech-to-text transcriptions.
@@ -303,7 +307,7 @@ class LLMUserAggregator(LLMContextAggregator):
         frame = LLMContextFrame(self._context)
         await self.push_frame(frame)
 
-    async def
+    async def push_aggregation(self):
         """Push the current aggregation based on interruption strategies and conditions."""
         if len(self._aggregation) > 0:
             if self.interruption_strategies and self._bot_speaking:
@@ -311,9 +315,9 @@ class LLMUserAggregator(LLMContextAggregator):
 
                 if should_interrupt:
                     logger.debug(
-                        "Interruption conditions met - pushing
+                        "Interruption conditions met - pushing interruption and aggregation"
                     )
-                    await self.
+                    await self.push_interruption_task_frame_and_wait()
                     await self._process_aggregation()
                 else:
                     logger.debug("Interruption conditions not met - not pushing aggregation")
@@ -394,7 +398,7 @@ class LLMUserAggregator(LLMContextAggregator):
             # pushing the aggregation as we will probably get a final transcription.
             if len(self._aggregation) > 0:
                 if not self._seen_interim_results:
-                    await self.
+                    await self.push_aggregation()
             # Handles the case where both the user and the bot are not speaking,
             # and the bot was previously speaking before the user interruption.
             # So in this case we are resetting the aggregation timer
@@ -473,7 +477,7 @@ class LLMUserAggregator(LLMContextAggregator):
                 await self._maybe_emulate_user_speaking()
             except asyncio.TimeoutError:
                 if not self._user_speaking:
-                    await self.
+                    await self.push_aggregation()
 
                 # If we are emulating VAD we still need to send the user stopped
                 # speaking frame.
@@ -579,7 +583,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame,
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, LLMFullResponseStartFrame):
@@ -609,12 +613,12 @@ class LLMAssistantAggregator(LLMContextAggregator):
         elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
             await self._handle_user_image_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self.
+            await self.push_aggregation()
             await self.push_frame(frame, direction)
         else:
             await self.push_frame(frame, direction)
 
-    async def
+    async def push_aggregation(self):
         """Push the current assistant aggregation with timestamp."""
         if not self._aggregation:
             return
@@ -645,8 +649,8 @@ class LLMAssistantAggregator(LLMContextAggregator):
             if frame.run_llm:
                 await self.push_context_frame(FrameDirection.UPSTREAM)
 
-    async def _handle_interruptions(self, frame:
-        await self.
+    async def _handle_interruptions(self, frame: InterruptionFrame):
+        await self.push_aggregation()
         self._started = 0
         await self.reset()
 
@@ -780,7 +784,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
             text=frame.request.context,
         )
 
-        await self.
+        await self.push_aggregation()
         await self.push_context_frame(FrameDirection.UPSTREAM)
 
     async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
@@ -788,7 +792,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
 
     async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
         self._started -= 1
-        await self.
+        await self.push_aggregation()
 
     async def _handle_text(self, frame: TextFrame):
         if not self._started:
@@ -12,14 +12,14 @@ in conversational pipelines.
 """
 
 from pipecat.frames.frames import TextFrame
-from pipecat.processors.aggregators.
-from pipecat.processors.aggregators.
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response_universal import LLMUserAggregator
 
 
-class UserResponseAggregator(
+class UserResponseAggregator(LLMUserAggregator):
     """Aggregates user responses into TextFrame objects.
 
-    This aggregator extends
+    This aggregator extends LLMUserAggregator to specifically handle
     user input by collecting text responses and outputting them as TextFrame
     objects when the aggregation is complete.
     """
@@ -28,9 +28,9 @@ class UserResponseAggregator(LLMUserContextAggregator):
         """Initialize the user response aggregator.
 
         Args:
-            **kwargs: Additional arguments passed to parent
+            **kwargs: Additional arguments passed to parent LLMUserAggregator.
         """
-        super().__init__(context=
+        super().__init__(context=LLMContext(), **kwargs)
 
     async def push_aggregation(self):
         """Push the aggregated user response as a TextFrame.
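Note on the hunks above: `UserResponseAggregator` now builds on `LLMUserAggregator` and supplies its own `LLMContext()`, so no context argument is needed at construction time. A minimal sketch (the placement after an STT service is an assumption about typical usage, not shown in this diff):

    from pipecat.processors.aggregators.user_response import UserResponseAggregator

    # The aggregator creates an LLMContext() internally; construct it with no context argument.
    user_response = UserResponseAggregator()
    # Assumed typical placement: STT -> user_response -> consumers of the aggregated TextFrame.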
@@ -10,13 +10,22 @@ This module provides frame aggregation functionality to combine text and image
 frames into vision frames for multimodal processing.
 """
 
-from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
 
 class VisionImageFrameAggregator(FrameProcessor):
     """Aggregates consecutive text and image frames into vision frames.
 
+    .. deprecated:: 0.0.85
+        VisionImageRawFrame has been removed in favor of context frames
+        (LLMContextFrame or OpenAILLMContextFrame), so this aggregator is not
+        needed anymore. See the 12* examples for the new recommended pattern.
+
     This aggregator waits for a consecutive TextFrame and an InputImageRawFrame.
     After the InputImageRawFrame arrives it will output a VisionImageRawFrame
     combining both the text and image data for multimodal processing.
@@ -28,6 +37,17 @@ class VisionImageFrameAggregator(FrameProcessor):
         The aggregator starts with no cached text, waiting for the first
         TextFrame to arrive before it can create vision frames.
         """
+        import warnings
+
+        warnings.warn(
+            "VisionImageFrameAggregator is deprecated. "
+            "VisionImageRawFrame has been removed in favor of context frames "
+            "(LLMContextFrame or OpenAILLMContextFrame), so this aggregator is "
+            "not needed anymore. See the 12* examples for the new recommended "
+            "pattern.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__()
         self._describe_text = None
 
@@ -47,12 +67,14 @@ class VisionImageFrameAggregator(FrameProcessor):
             self._describe_text = frame.text
         elif isinstance(frame, InputImageRawFrame):
             if self._describe_text:
-
+                context = OpenAILLMContext()
+                context.add_image_frame_message(
                     text=self._describe_text,
                     image=frame.image,
                     size=frame.size,
                     format=frame.format,
                 )
+                frame = OpenAILLMContextFrame(context)
                 await self.push_frame(frame)
                 self._describe_text = None
             else:
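Note on the deprecation above: context frames replace `VisionImageRawFrame`, and the aggregator's new body shows the pattern. A minimal sketch of building such a context frame directly (the image placeholder values are illustrative):

    from pipecat.processors.aggregators.openai_llm_context import (
        OpenAILLMContext,
        OpenAILLMContextFrame,
    )

    image_bytes = b"\x00" * (64 * 64 * 3)  # placeholder raw RGB image data
    context = OpenAILLMContext()
    context.add_image_frame_message(
        text="Describe this image",
        image=image_bytes,
        size=(64, 64),
        format="RGB",
    )
    frame = OpenAILLMContextFrame(context)
    # Push `frame` into the pipeline instead of relying on VisionImageFrameAggregator.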
@@ -137,12 +137,12 @@ class AudioBufferProcessor(FrameProcessor):
         return self._num_channels
 
     def has_audio(self) -> bool:
-        """Check if
+        """Check if either user or bot audio buffers contain data.
 
         Returns:
-            True if
+            True if either buffer contains audio data.
         """
-        return self._buffer_has_audio(self._user_audio_buffer)
+        return self._buffer_has_audio(self._user_audio_buffer) or self._buffer_has_audio(
             self._bot_audio_buffer
         )
 
@@ -229,9 +229,12 @@ class AudioBufferProcessor(FrameProcessor):
             # Save time of frame so we can compute silence.
             self._last_bot_frame_at = time.time()
 
-        if self._buffer_size > 0 and
+        if self._buffer_size > 0 and (
+            len(self._user_audio_buffer) >= self._buffer_size
+            or len(self._bot_audio_buffer) >= self._buffer_size
+        ):
             await self._call_on_audio_data_handler()
-            self.
+            self._reset_primary_audio_buffers()
 
         # Process turn recording with preprocessed data.
         if self._enable_turn_audio:
@@ -272,9 +275,15 @@ class AudioBufferProcessor(FrameProcessor):
 
     async def _call_on_audio_data_handler(self):
         """Call the audio data event handlers with buffered audio."""
-        if not self.
+        if not self._recording:
             return
 
+        if len(self._user_audio_buffer) == 0 and len(self._bot_audio_buffer) == 0:
+            return
+
+        self._align_track_buffers()
+        flush_time = time.time()
+
         # Call original handler with merged audio
         merged_audio = self.merge_audio_buffers()
         await self._call_event_handler(
@@ -290,23 +299,49 @@ class AudioBufferProcessor(FrameProcessor):
             self._num_channels,
         )
 
+        self._last_user_frame_at = flush_time
+        self._last_bot_frame_at = flush_time
+
     def _buffer_has_audio(self, buffer: bytearray) -> bool:
         """Check if a buffer contains audio data."""
         return buffer is not None and len(buffer) > 0
 
     def _reset_recording(self):
         """Reset recording state and buffers."""
-        self.
+        self._reset_all_audio_buffers()
         self._last_user_frame_at = time.time()
         self._last_bot_frame_at = time.time()
 
-    def
+    def _reset_all_audio_buffers(self):
         """Reset all audio buffers to empty state."""
+        self._reset_primary_audio_buffers()
+        self._reset_turn_audio_buffers()
+
+    def _reset_primary_audio_buffers(self):
+        """Clear user and bot buffers while preserving turn buffers and timestamps."""
         self._user_audio_buffer = bytearray()
         self._bot_audio_buffer = bytearray()
+
+    def _reset_turn_audio_buffers(self):
+        """Clear user and bot turn buffers while preserving primary buffers and timestamps."""
         self._user_turn_audio_buffer = bytearray()
         self._bot_turn_audio_buffer = bytearray()
 
+    def _align_track_buffers(self):
+        """Pad the shorter track with silence so both tracks stay in sync."""
+        user_len = len(self._user_audio_buffer)
+        bot_len = len(self._bot_audio_buffer)
+        if user_len == bot_len:
+            return
+
+        target_len = max(user_len, bot_len)
+        if user_len < target_len:
+            self._user_audio_buffer.extend(b"\x00" * (target_len - user_len))
+            self._last_user_frame_at = max(self._last_user_frame_at, self._last_bot_frame_at)
+        if bot_len < target_len:
+            self._bot_audio_buffer.extend(b"\x00" * (target_len - bot_len))
+            self._last_bot_frame_at = max(self._last_bot_frame_at, self._last_user_frame_at)
+
     async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
         """Resample audio frame to the target sample rate."""
         return await self._input_resampler.resample(