dv-pipecat-ai 0.0.85.dev7__py3-none-any.whl → 0.0.85.dev699__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic.

Files changed (158)
  1. {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/METADATA +78 -117
  2. {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/RECORD +158 -122
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  11. pipecat/audio/filters/noisereduce_filter.py +15 -0
  12. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  13. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  14. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  15. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  16. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  17. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  18. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  19. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  20. pipecat/audio/vad/data/README.md +10 -0
  21. pipecat/audio/vad/vad_analyzer.py +13 -1
  22. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  23. pipecat/frames/frames.py +120 -87
  24. pipecat/observers/loggers/debug_log_observer.py +3 -3
  25. pipecat/observers/loggers/llm_log_observer.py +7 -3
  26. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  27. pipecat/pipeline/runner.py +12 -4
  28. pipecat/pipeline/service_switcher.py +64 -36
  29. pipecat/pipeline/task.py +85 -24
  30. pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
  31. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  32. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  33. pipecat/processors/aggregators/llm_response.py +6 -7
  34. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  35. pipecat/processors/aggregators/user_response.py +6 -6
  36. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  37. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  38. pipecat/processors/filters/stt_mute_filter.py +2 -0
  39. pipecat/processors/frame_processor.py +103 -17
  40. pipecat/processors/frameworks/langchain.py +8 -2
  41. pipecat/processors/frameworks/rtvi.py +209 -68
  42. pipecat/processors/frameworks/strands_agents.py +170 -0
  43. pipecat/processors/logger.py +2 -2
  44. pipecat/processors/transcript_processor.py +4 -4
  45. pipecat/processors/user_idle_processor.py +3 -6
  46. pipecat/runner/run.py +270 -50
  47. pipecat/runner/types.py +2 -0
  48. pipecat/runner/utils.py +51 -10
  49. pipecat/serializers/exotel.py +5 -5
  50. pipecat/serializers/livekit.py +20 -0
  51. pipecat/serializers/plivo.py +6 -9
  52. pipecat/serializers/protobuf.py +6 -5
  53. pipecat/serializers/telnyx.py +2 -2
  54. pipecat/serializers/twilio.py +43 -23
  55. pipecat/services/ai_service.py +2 -6
  56. pipecat/services/anthropic/llm.py +2 -25
  57. pipecat/services/asyncai/tts.py +2 -3
  58. pipecat/services/aws/__init__.py +1 -0
  59. pipecat/services/aws/llm.py +122 -97
  60. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  61. pipecat/services/aws/nova_sonic/context.py +367 -0
  62. pipecat/services/aws/nova_sonic/frames.py +25 -0
  63. pipecat/services/aws/nova_sonic/llm.py +1155 -0
  64. pipecat/services/aws/stt.py +1 -3
  65. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  66. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  67. pipecat/services/aws_nova_sonic/context.py +13 -355
  68. pipecat/services/aws_nova_sonic/frames.py +13 -17
  69. pipecat/services/azure/realtime/__init__.py +0 -0
  70. pipecat/services/azure/realtime/llm.py +65 -0
  71. pipecat/services/azure/stt.py +15 -0
  72. pipecat/services/cartesia/tts.py +2 -2
  73. pipecat/services/deepgram/__init__.py +1 -0
  74. pipecat/services/deepgram/flux/__init__.py +0 -0
  75. pipecat/services/deepgram/flux/stt.py +636 -0
  76. pipecat/services/elevenlabs/__init__.py +2 -1
  77. pipecat/services/elevenlabs/stt.py +254 -276
  78. pipecat/services/elevenlabs/tts.py +5 -5
  79. pipecat/services/fish/tts.py +2 -2
  80. pipecat/services/gemini_multimodal_live/events.py +38 -524
  81. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  82. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  83. pipecat/services/gladia/stt.py +56 -72
  84. pipecat/services/google/__init__.py +1 -0
  85. pipecat/services/google/gemini_live/__init__.py +3 -0
  86. pipecat/services/google/gemini_live/file_api.py +189 -0
  87. pipecat/services/google/gemini_live/llm.py +1582 -0
  88. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  89. pipecat/services/google/llm.py +15 -11
  90. pipecat/services/google/llm_openai.py +3 -3
  91. pipecat/services/google/llm_vertex.py +86 -16
  92. pipecat/services/google/tts.py +7 -3
  93. pipecat/services/heygen/api.py +2 -0
  94. pipecat/services/heygen/client.py +8 -4
  95. pipecat/services/heygen/video.py +2 -0
  96. pipecat/services/hume/__init__.py +5 -0
  97. pipecat/services/hume/tts.py +220 -0
  98. pipecat/services/inworld/tts.py +6 -6
  99. pipecat/services/llm_service.py +15 -5
  100. pipecat/services/lmnt/tts.py +2 -2
  101. pipecat/services/mcp_service.py +4 -2
  102. pipecat/services/mem0/memory.py +6 -5
  103. pipecat/services/mistral/llm.py +29 -8
  104. pipecat/services/moondream/vision.py +42 -16
  105. pipecat/services/neuphonic/tts.py +2 -2
  106. pipecat/services/openai/__init__.py +1 -0
  107. pipecat/services/openai/base_llm.py +27 -20
  108. pipecat/services/openai/realtime/__init__.py +0 -0
  109. pipecat/services/openai/realtime/context.py +272 -0
  110. pipecat/services/openai/realtime/events.py +1106 -0
  111. pipecat/services/openai/realtime/frames.py +37 -0
  112. pipecat/services/openai/realtime/llm.py +829 -0
  113. pipecat/services/openai/tts.py +16 -8
  114. pipecat/services/openai_realtime/__init__.py +27 -0
  115. pipecat/services/openai_realtime/azure.py +21 -0
  116. pipecat/services/openai_realtime/context.py +21 -0
  117. pipecat/services/openai_realtime/events.py +21 -0
  118. pipecat/services/openai_realtime/frames.py +21 -0
  119. pipecat/services/openai_realtime_beta/azure.py +16 -0
  120. pipecat/services/openai_realtime_beta/openai.py +17 -5
  121. pipecat/services/playht/tts.py +31 -4
  122. pipecat/services/rime/tts.py +3 -4
  123. pipecat/services/salesforce/__init__.py +9 -0
  124. pipecat/services/salesforce/llm.py +465 -0
  125. pipecat/services/sarvam/tts.py +2 -6
  126. pipecat/services/simli/video.py +2 -2
  127. pipecat/services/speechmatics/stt.py +1 -7
  128. pipecat/services/stt_service.py +34 -0
  129. pipecat/services/tavus/video.py +2 -2
  130. pipecat/services/tts_service.py +9 -9
  131. pipecat/services/vision_service.py +7 -6
  132. pipecat/tests/utils.py +4 -4
  133. pipecat/transcriptions/language.py +41 -1
  134. pipecat/transports/base_input.py +17 -42
  135. pipecat/transports/base_output.py +42 -26
  136. pipecat/transports/daily/transport.py +199 -26
  137. pipecat/transports/heygen/__init__.py +0 -0
  138. pipecat/transports/heygen/transport.py +381 -0
  139. pipecat/transports/livekit/transport.py +228 -63
  140. pipecat/transports/local/audio.py +6 -1
  141. pipecat/transports/local/tk.py +11 -2
  142. pipecat/transports/network/fastapi_websocket.py +1 -1
  143. pipecat/transports/smallwebrtc/connection.py +98 -19
  144. pipecat/transports/smallwebrtc/request_handler.py +204 -0
  145. pipecat/transports/smallwebrtc/transport.py +65 -23
  146. pipecat/transports/tavus/transport.py +23 -12
  147. pipecat/transports/websocket/client.py +41 -5
  148. pipecat/transports/websocket/fastapi.py +21 -11
  149. pipecat/transports/websocket/server.py +14 -7
  150. pipecat/transports/whatsapp/api.py +8 -0
  151. pipecat/transports/whatsapp/client.py +47 -0
  152. pipecat/utils/base_object.py +54 -22
  153. pipecat/utils/string.py +12 -1
  154. pipecat/utils/tracing/service_decorators.py +21 -21
  155. {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/WHEEL +0 -0
  156. {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/licenses/LICENSE +0 -0
  157. {dv_pipecat_ai-0.0.85.dev7.dist-info → dv_pipecat_ai-0.0.85.dev699.dist-info}/top_level.txt +0 -0
  158. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/tts_service.py CHANGED
@@ -20,10 +20,10 @@ from pipecat.frames.frames import (
  ErrorFrame,
  Frame,
  InterimTranscriptionFrame,
+ InterruptionFrame,
  LLMFullResponseEndFrame,
  LLMFullResponseStartFrame,
  StartFrame,
- StartInterruptionFrame,
  TextFrame,
  TranscriptionFrame,
  TTSAudioRawFrame,
@@ -319,7 +319,7 @@ class TTSService(AIService):
  and not isinstance(frame, TranscriptionFrame)
  ):
  await self._process_text_frame(frame)
- elif isinstance(frame, StartInterruptionFrame):
+ elif isinstance(frame, InterruptionFrame):
  await self._handle_interruption(frame, direction)
  await self.push_frame(frame, direction)
  elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
@@ -377,14 +377,14 @@ class TTSService(AIService):
  await super().push_frame(frame, direction)

  if self._push_stop_frames and (
- isinstance(frame, StartInterruptionFrame)
+ isinstance(frame, InterruptionFrame)
  or isinstance(frame, TTSStartedFrame)
  or isinstance(frame, TTSAudioRawFrame)
  or isinstance(frame, TTSStoppedFrame)
  ):
  await self._stop_frame_queue.put(frame)

- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
  self._processing_text = False
  await self._text_aggregator.handle_interruption()
  for filter in self._text_filters:
@@ -465,7 +465,7 @@ class TTSService(AIService):
  )
  if isinstance(frame, TTSStartedFrame):
  has_started = True
- elif isinstance(frame, (TTSStoppedFrame, StartInterruptionFrame)):
+ elif isinstance(frame, (TTSStoppedFrame, InterruptionFrame)):
  has_started = False
  except asyncio.TimeoutError:
  if has_started:
@@ -550,7 +550,7 @@ class WordTTSService(TTSService):
  elif isinstance(frame, (LLMFullResponseEndFrame, EndFrame)):
  await self.flush_audio()

- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
  await super()._handle_interruption(frame, direction)
  self._llm_response_started = False
  self.reset_word_timestamps()
@@ -640,7 +640,7 @@ class InterruptibleTTSService(WebsocketTTSService):
  # user interrupts we need to reconnect.
  self._bot_speaking = False

- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
  await super()._handle_interruption(frame, direction)
  if self._bot_speaking:
  await self._disconnect()
@@ -712,7 +712,7 @@ class InterruptibleWordTTSService(WebsocketWordTTSService):
  # user interrupts we need to reconnect.
  self._bot_speaking = False

- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
  await super()._handle_interruption(frame, direction)
  if self._bot_speaking:
  await self._disconnect()
@@ -840,7 +840,7 @@ class AudioContextWordTTSService(WebsocketWordTTSService):
  await super().cancel(frame)
  await self._stop_audio_context_task()

- async def _handle_interruption(self, frame: StartInterruptionFrame, direction: FrameDirection):
+ async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
  await super()._handle_interruption(frame, direction)
  await self._stop_audio_context_task()
  self._create_audio_context_task()
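Taken together, the hunks above rename StartInterruptionFrame to InterruptionFrame throughout the TTS service hierarchy. A minimal migration sketch for downstream code that overrides _handle_interruption, assuming only the renamed frame type from this diff (the subclass and its cleanup attribute are hypothetical):

```python
from pipecat.frames.frames import InterruptionFrame  # was: StartInterruptionFrame
from pipecat.processors.frame_processor import FrameDirection
from pipecat.services.tts_service import TTSService


class MyTTSService(TTSService):
    async def _handle_interruption(self, frame: InterruptionFrame, direction: FrameDirection):
        # The base class resets its text aggregation and filters first.
        await super()._handle_interruption(frame, direction)
        # Hypothetical subclass-specific cleanup.
        self._pending_audio = b""
```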
pipecat/services/vision_service.py CHANGED
@@ -14,7 +14,8 @@ visual content.
  from abc import abstractmethod
  from typing import AsyncGenerator

- from pipecat.frames.frames import Frame, VisionImageRawFrame
+ from pipecat.frames.frames import Frame, LLMContextFrame
+ from pipecat.processors.aggregators.llm_context import LLMContext
  from pipecat.processors.frame_processor import FrameDirection
  from pipecat.services.ai_service import AIService

@@ -37,15 +38,15 @@ class VisionService(AIService):
  self._describe_text = None

  @abstractmethod
- async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
- """Process a vision image frame and generate results.
+ async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
+ """Process the latest image in the context and generate results.

  This method must be implemented by subclasses to provide actual computer
  vision functionality such as image description, object detection, or
  visual question answering.

  Args:
- frame: The vision image frame to process, containing image data.
+ context: The context to process, containing image data.

  Yields:
  Frame: Frames containing the vision analysis results, typically TextFrame
@@ -65,9 +66,9 @@ class VisionService(AIService):
  """
  await super().process_frame(frame, direction)

- if isinstance(frame, VisionImageRawFrame):
+ if isinstance(frame, LLMContextFrame):
  await self.start_processing_metrics()
- await self.process_generator(self.run_vision(frame))
+ await self.process_generator(self.run_vision(frame.context))
  await self.stop_processing_metrics()
  else:
  await self.push_frame(frame, direction)
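run_vision now receives the aggregated LLMContext instead of a single VisionImageRawFrame, and the service triggers on LLMContextFrame. A hedged sketch of a subclass under the new signature; the image-description helper is a hypothetical stand-in, not a real pipecat API:

```python
from typing import AsyncGenerator

from pipecat.frames.frames import Frame, TextFrame
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.services.vision_service import VisionService


class MyVisionService(VisionService):
    async def run_vision(self, context: LLMContext) -> AsyncGenerator[Frame, None]:
        # Hypothetical helper: pull the most recent image out of the context
        # and run it through whatever vision model this service wraps.
        description = await self._describe_latest_image(context)
        yield TextFrame(text=description)
```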
pipecat/tests/utils.py CHANGED
@@ -128,7 +128,7 @@ async def run_test(
  expected_up_frames: Optional[Sequence[type]] = None,
  ignore_start: bool = True,
  observers: Optional[List[BaseObserver]] = None,
- start_metadata: Optional[Dict[str, Any]] = None,
+ pipeline_params: Optional[PipelineParams] = None,
  send_end_frame: bool = True,
  ) -> Tuple[Sequence[Frame], Sequence[Frame]]:
  """Run a test pipeline with the specified processor and validate frame flow.
@@ -144,7 +144,7 @@ async def run_test(
  expected_up_frames: Expected frame types flowing upstream (optional).
  ignore_start: Whether to ignore StartFrames in frame validation.
  observers: Optional list of observers to attach to the pipeline.
- start_metadata: Optional metadata to include with the StartFrame.
+ pipeline_params: Optional pipeline parameters.
  send_end_frame: Whether to send an EndFrame at the end of the test.

  Returns:
@@ -154,7 +154,7 @@ async def run_test(
  AssertionError: If the received frames don't match the expected frame types.
  """
  observers = observers or []
- start_metadata = start_metadata or {}
+ pipeline_params = pipeline_params or PipelineParams()

  received_up = asyncio.Queue()
  received_down = asyncio.Queue()
@@ -173,7 +173,7 @@

  task = PipelineTask(
  pipeline,
- params=PipelineParams(start_metadata=start_metadata),
+ params=pipeline_params,
  observers=observers,
  cancel_on_idle_timeout=False,
  )
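For test authors, the start_metadata keyword is gone; the same metadata now travels inside a PipelineParams object. A hedged before/after sketch (frames_to_send and expected_down_frames are assumed parameter names that this diff only shows in part):

```python
from pipecat.pipeline.task import PipelineParams
from pipecat.tests.utils import run_test


async def test_my_processor(processor, frames, expected):
    # Before: await run_test(processor, ..., start_metadata={"session": "abc"})
    # After: wrap the metadata in PipelineParams yourself.
    return await run_test(
        processor,
        frames_to_send=frames,
        expected_down_frames=expected,
        pipeline_params=PipelineParams(start_metadata={"session": "abc"}),
    )
```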
pipecat/transcriptions/language.py CHANGED
@@ -68,6 +68,9 @@ class Language(StrEnum):
  AS = "as"
  AS_IN = "as-IN"

+ # Asturian
+ AST = "ast"
+
  # Azerbaijani
  AZ = "az"
  AZ_AZ = "az-AZ"
@@ -101,6 +104,9 @@ class Language(StrEnum):
  CA = "ca"
  CA_ES = "ca-ES"

+ # Cebuano
+ CEB = "ceb"
+
  # Mandarin Chinese
  CMN = "cmn"
  CMN_CN = "cmn-CN"
@@ -185,6 +191,9 @@ class Language(StrEnum):
  FA = "fa"
  FA_IR = "fa-IR"

+ # Fulah
+ FF = "ff"
+
  # Finnish
  FI = "fi"
  FI_FI = "fi-FI"
@@ -251,6 +260,9 @@ class Language(StrEnum):
  ID = "id"
  ID_ID = "id-ID"

+ # Igbo
+ IG = "ig"
+
  # Icelandic
  IS = "is"
  IS_IS = "is-IS"
@@ -279,6 +291,9 @@ class Language(StrEnum):
  KA = "ka"
  KA_GE = "ka-GE"

+ # Kabuverdianu
+ KEA = "kea"
+
  # Kazakh
  KK = "kk"
  KK_KZ = "kk-KZ"
@@ -295,6 +310,13 @@ class Language(StrEnum):
  KO = "ko"
  KO_KR = "ko-KR"

+ # Kurdish
+ KU = "ku"
+
+ # Kyrgyz
+ KY = "ky"
+ KY_KG = "ky-KG"
+
  # Latin
  LA = "la"

@@ -312,6 +334,12 @@ class Language(StrEnum):
  LT = "lt"
  LT_LT = "lt-LT"

+ # Ganda
+ LG = "lg"
+
+ # Luo
+ LUO = "luo"
+
  # Latvian
  LV = "lv"
  LV_LV = "lv-LV"
@@ -366,6 +394,12 @@ class Language(StrEnum):
  NL_BE = "nl-BE"
  NL_NL = "nl-NL"

+ # Northern Sotho
+ NSO = "nso"
+
+ # Chichewa
+ NY = "ny"
+
  # Occitan
  OC = "oc"

@@ -484,6 +518,9 @@ class Language(StrEnum):
  UK = "uk"
  UK_UA = "uk-UA"

+ # Umbundu
+ UMB = "umb"
+
  # Urdu
  UR = "ur"
  UR_IN = "ur-IN"
@@ -497,6 +534,9 @@ class Language(StrEnum):
  VI = "vi"
  VI_VN = "vi-VN"

+ # Wolof
+ WO = "wo"
+
  # Wu Chinese
  WUU = "wuu"
  WUU_CN = "wuu-CN"
@@ -507,7 +547,7 @@ class Language(StrEnum):
  # Yoruba
  YO = "yo"

- # Yue Chinese
+ # Yue Chinese (Cantonese)
  YUE = "yue"
  YUE_CN = "yue-CN"

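Because Language is a StrEnum, the newly added codes compare equal to their plain string values, so existing string-based lookups keep working:

```python
from pipecat.transcriptions.language import Language

assert Language.AST == "ast"  # new: Asturian
assert Language.KY_KG == "ky-KG"  # new: Kyrgyz (Kyrgyzstan)
assert Language("yue") is Language.YUE  # comment is now "Yue Chinese (Cantonese)"
```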
pipecat/transports/base_input.py CHANGED
@@ -11,7 +11,6 @@ input processing, including VAD, turn analysis, and interruption management.
  """

  import asyncio
- from concurrent.futures import ThreadPoolExecutor
  from typing import Optional

  from loguru import logger
@@ -22,7 +21,6 @@ from pipecat.audio.turn.base_turn_analyzer import (
  )
  from pipecat.audio.vad.vad_analyzer import VADAnalyzer, VADState
  from pipecat.frames.frames import (
- BotInterruptionFrame,
  BotStartedSpeakingFrame,
  BotStoppedSpeakingFrame,
  CancelFrame,
@@ -36,7 +34,6 @@ from pipecat.frames.frames import (
  MetricsFrame,
  SpeechControlParamsFrame,
  StartFrame,
- StartInterruptionFrame,
  StopFrame,
  SystemFrame,
  UserSpeakingFrame,
@@ -81,10 +78,6 @@ class BaseInputTransport(FrameProcessor):
  # Track user speaking state for interruption logic
  self._user_speaking = False

- # We read audio from a single queue one at a time and we then run VAD in
- # a thread. Therefore, only one thread should be necessary.
- self._executor = ThreadPoolExecutor(max_workers=1)
-
  # Task to process incoming audio (VAD) and push audio frames downstream
  # if passthrough is enabled.
  self._audio_task = None
@@ -289,8 +282,6 @@
  elif isinstance(frame, CancelFrame):
  await self.cancel(frame)
  await self.push_frame(frame, direction)
- elif isinstance(frame, BotInterruptionFrame):
- await self._handle_bot_interruption(frame)
  elif isinstance(frame, BotStartedSpeakingFrame):
  await self._handle_bot_started_speaking(frame)
  await self.push_frame(frame, direction)
@@ -298,22 +289,12 @@
  await self._handle_bot_stopped_speaking(frame)
  await self.push_frame(frame, direction)
  elif isinstance(frame, EmulateUserStartedSpeakingFrame):
- logger.debug("Emulating user started speaking")
+ self.logger.debug("Emulating user started speaking")
  await self._handle_user_interruption(VADState.SPEAKING, emulated=True)
  elif isinstance(frame, EmulateUserStoppedSpeakingFrame):
- logger.debug("Emulating user stopped speaking")
+ self.logger.debug("Emulating user stopped speaking")
  await self._handle_user_interruption(VADState.QUIET, emulated=True)
  # All other system frames
- elif isinstance(frame, VADParamsUpdateFrame):
- if self.vad_analyzer:
- self.vad_analyzer.set_params(frame.params, bot_logger=self.logger)
- speech_frame = SpeechControlParamsFrame(
- vad_params=frame.params,
- turn_params=self._params.turn_analyzer.params
- if self._params.turn_analyzer
- else None,
- )
- await self.push_frame(speech_frame)
  elif isinstance(frame, SystemFrame):
  await self.push_frame(frame, direction)
  # Control frames
@@ -325,6 +306,16 @@
  elif isinstance(frame, StopFrame):
  await self.push_frame(frame, direction)
  await self.pause(frame)
+ elif isinstance(frame, VADParamsUpdateFrame):
+ if self.vad_analyzer:
+ self.vad_analyzer.set_params(frame.params)
+ speech_frame = SpeechControlParamsFrame(
+ vad_params=frame.params,
+ turn_params=self._params.turn_analyzer.params
+ if self._params.turn_analyzer
+ else None,
+ )
+ await self.push_frame(speech_frame)
  elif isinstance(frame, FilterUpdateSettingsFrame) and self._params.audio_in_filter:
  await self._params.audio_in_filter.process_frame(frame)
  # Other frames
@@ -335,13 +326,6 @@
  # Handle interruptions
  #

- async def _handle_bot_interruption(self, frame: BotInterruptionFrame):
- """Handle bot interruption frames."""
- self.logger.debug("Bot interruption")
- if self.interruptions_allowed:
- await self._start_interruption()
- await self.push_frame(StartInterruptionFrame())
-
  async def _handle_user_interruption(self, vad_state: VADState, emulated: bool = False):
  """Handle user interruption events based on speaking state."""
  if vad_state == VADState.SPEAKING:
@@ -353,7 +337,7 @@
  await self.push_frame(downstream_frame)
  await self.push_frame(upstream_frame, FrameDirection.UPSTREAM)

- # Only push StartInterruptionFrame if:
+ # Only push InterruptionFrame if:
  # 1. No interruption config is set, OR
  # 2. Interruption config is set but bot is not speaking
  should_push_immediate_interruption = (
@@ -362,13 +346,9 @@

  # Make sure we notify about interruptions quickly out-of-band.
  if should_push_immediate_interruption and self.interruptions_allowed:
- await self._start_interruption()
- # Push an out-of-band frame (i.e. not using the ordered push
- # frame task) to stop everything, specially at the output
- # transport.
- await self.push_frame(StartInterruptionFrame())
+ await self.push_interruption_task_frame_and_wait()
  elif self.interruption_strategies and self._bot_speaking:
- logger.debug(
+ self.logger.debug(
  "User started speaking while bot is speaking with interruption config - "
  "deferring interruption to aggregator"
  )
@@ -381,9 +361,6 @@
  await self.push_frame(downstream_frame)
  await self.push_frame(upstream_frame, FrameDirection.UPSTREAM)

- if self.interruptions_allowed:
- await self._stop_interruption()
-
  #
  # Handle bot speaking state
  #
@@ -416,9 +393,7 @@
  """Analyze audio frame for voice activity."""
  state = VADState.QUIET
  if self.vad_analyzer:
- state = await self.get_event_loop().run_in_executor(
- self._executor, self.vad_analyzer.analyze_audio, audio_frame.audio
- )
+ state = await self.vad_analyzer.analyze_audio(audio_frame.audio)
  return state

  async def _handle_vad(self, audio_frame: InputAudioRawFrame, vad_state: VADState) -> VADState:
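The ThreadPoolExecutor indirection is gone and analyze_audio is awaited directly, which implies the analyzer API is now a coroutine (consistent with the vad_analyzer.py change in the file list above). A hedged sketch of an analyzer that wraps blocking inference under that assumption; it duck-types the call rather than claiming the real base-class contract:

```python
import asyncio

from pipecat.audio.vad.vad_analyzer import VADState


class BlockingModelVAD:
    def _run_model(self, audio: bytes) -> VADState:
        # Hypothetical stand-in for CPU-bound inference (e.g. an ONNX model).
        return VADState.SPEAKING if audio and max(audio) > 127 else VADState.QUIET

    async def analyze_audio(self, audio: bytes) -> VADState:
        # Same shape as the awaited call in the hunk above; blocking work is
        # offloaded here, since the transport's executor no longer exists.
        return await asyncio.to_thread(self._run_model, audio)
```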
@@ -511,7 +486,7 @@
  self._audio_in_queue.task_done()
  except asyncio.TimeoutError:
  if self._user_speaking:
- logger.warning(
+ self.logger.warning(
  "Forcing user stopped speaking due to timeout receiving audio frame!"
  )
  vad_state = VADState.QUIET
pipecat/transports/base_output.py CHANGED
@@ -29,20 +29,19 @@ from pipecat.frames.frames import (
  CancelFrame,
  EndFrame,
  Frame,
- InputTransportMessageUrgentFrame,
+ InterruptionFrame,
  MixerControlFrame,
  OutputAudioRawFrame,
  OutputDTMFFrame,
  OutputDTMFUrgentFrame,
  OutputImageRawFrame,
+ OutputTransportMessageFrame,
+ OutputTransportMessageUrgentFrame,
  OutputTransportReadyFrame,
  SpeechOutputAudioRawFrame,
  SpriteFrame,
  StartFrame,
- StartInterruptionFrame,
  SystemFrame,
- TransportMessageFrame,
- TransportMessageUrgentFrame,
  TTSAudioRawFrame,
  )
  from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -179,7 +178,9 @@
  # Sending a frame indicating that the output transport is ready and able to receive frames.
  await self.push_frame(OutputTransportReadyFrame(), FrameDirection.UPSTREAM)

- async def send_message(self, frame: TransportMessageFrame | TransportMessageUrgentFrame):
+ async def send_message(
+ self, frame: OutputTransportMessageFrame | OutputTransportMessageUrgentFrame
+ ):
  """Send a transport message.

  Args:
@@ -203,21 +204,27 @@
  """
  pass

- async def write_video_frame(self, frame: OutputImageRawFrame):
+ async def write_video_frame(self, frame: OutputImageRawFrame) -> bool:
  """Write a video frame to the transport.

  Args:
  frame: The output video frame to write.
+
+ Returns:
+ True if the video frame was written successfully, False otherwise.
  """
- pass
+ return False

- async def write_audio_frame(self, frame: OutputAudioRawFrame):
+ async def write_audio_frame(self, frame: OutputAudioRawFrame) -> bool:
  """Write an audio frame to the transport.

  Args:
  frame: The output audio frame to write.
+
+ Returns:
+ True if the audio frame was written successfully, False otherwise.
  """
- pass
+ return False

  async def write_dtmf(self, frame: OutputDTMFFrame | OutputDTMFUrgentFrame):
  """Write a DTMF tone using the transport's preferred method.
@@ -288,9 +295,8 @@
  await super().process_frame(frame, direction)

  #
- # System frames (like StartInterruptionFrame) are pushed
- # immediately. Other frames require order so they are put in the sink
- # queue.
+ # System frames (like InterruptionFrame) are pushed immediately. Other
+ # frames require order so they are put in the sink queue.
  #
  if isinstance(frame, StartFrame):
  # Push StartFrame before start(), because we want StartFrame to be
@@ -300,12 +306,10 @@
  elif isinstance(frame, CancelFrame):
  await self.cancel(frame)
  await self.push_frame(frame, direction)
- elif isinstance(frame, StartInterruptionFrame):
+ elif isinstance(frame, InterruptionFrame):
  await self.push_frame(frame, direction)
  await self._handle_frame(frame)
- elif isinstance(frame, TransportMessageUrgentFrame) and not isinstance(
- frame, InputTransportMessageUrgentFrame
- ):
+ elif isinstance(frame, OutputTransportMessageUrgentFrame):
  await self.send_message(frame)
  elif isinstance(frame, OutputDTMFUrgentFrame):
  await self.write_dtmf(frame)
@@ -341,7 +345,7 @@

  sender = self._media_senders[frame.transport_destination]

- if isinstance(frame, StartInterruptionFrame):
+ if isinstance(frame, InterruptionFrame):
  await sender.handle_interruptions(frame)
  elif isinstance(frame, OutputAudioRawFrame):
  await sender.handle_audio_frame(frame)
@@ -492,7 +496,7 @@
  await self._cancel_clock_task()
  await self._cancel_video_task()

- async def handle_interruptions(self, _: StartInterruptionFrame):
+ async def handle_interruptions(self, _: InterruptionFrame):
  """Handle interruption events by restarting tasks and clearing buffers.

  Args:
@@ -642,7 +646,7 @@
  await self._set_video_image(frame)
  elif isinstance(frame, SpriteFrame):
  await self._set_video_images(frame.images)
- elif isinstance(frame, TransportMessageFrame):
+ elif isinstance(frame, OutputTransportMessageFrame):
  await self._transport.send_message(frame)
  elif isinstance(frame, OutputDTMFFrame):
  await self._transport.write_dtmf(frame)
@@ -661,6 +665,7 @@
  self._audio_queue.get(), timeout=vad_stop_secs
  )
  yield frame
+ self._audio_queue.task_done()
  except asyncio.TimeoutError:
  # Notify the bot stopped speaking upstream if necessary.
  await self._bot_stopped_speaking()
@@ -673,8 +678,9 @@
  frame = self._audio_queue.get_nowait()
  if isinstance(frame, OutputAudioRawFrame):
  frame.audio = await self._mixer.mix(frame.audio)
- last_frame_time = time.time()
+ last_frame_time = time.time()
  yield frame
+ self._audio_queue.task_done()
  except asyncio.QueueEmpty:
  # Notify the bot stopped speaking upstream if necessary.
  diff_time = time.time() - last_frame_time
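The two task_done() additions pair every successful get() with a completion mark, which is what allows anything awaiting the queue's join() to unblock once all dequeued audio has been handled. The standard asyncio pattern, for reference (queue contents are illustrative):

```python
import asyncio


async def main():
    queue: asyncio.Queue = asyncio.Queue()
    for chunk in (b"a", b"b", b"c"):
        queue.put_nowait(chunk)

    async def consume():
        while True:
            item = await queue.get()
            # ... handle the item ...
            queue.task_done()  # must match every get(), or join() hangs

    consumer = asyncio.create_task(consume())
    await queue.join()  # returns once task_done() was called for every item
    consumer.cancel()


asyncio.run(main())
```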
@@ -740,12 +746,22 @@
  # Handle frame.
  await self._handle_frame(frame)

- # Also, push frame downstream in case anyone else needs it.
- await self._transport.push_frame(frame)
-
- # Send audio.
- if isinstance(frame, OutputAudioRawFrame):
- await self._transport.write_audio_frame(frame)
+ # If we are not able to write to the transport we shouldn't
+ # push downstream.
+ push_downstream = True
+
+ # Try to send audio to the transport.
+ try:
+ if isinstance(frame, OutputAudioRawFrame):
+ push_downstream = await self._transport.write_audio_frame(frame)
+ except Exception as e:
+ logger.error(f"{self} Error writing {frame} to transport: {e}")
+ push_downstream = False
+
+ # If we were able to send to the transport, push the frame
+ # downstream in case anyone else needs it.
+ if push_downstream:
+ await self._transport.push_frame(frame)

  #
  # Video handling