dv-pipecat-ai 0.0.85.dev699__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of dv-pipecat-ai has been flagged as potentially problematic.

Files changed (43)
  1. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
  2. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +43 -43
  3. pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
  4. pipecat/pipeline/runner.py +6 -2
  5. pipecat/pipeline/task.py +40 -55
  6. pipecat/processors/aggregators/llm_context.py +40 -2
  7. pipecat/processors/frameworks/rtvi.py +1 -0
  8. pipecat/runner/daily.py +59 -20
  9. pipecat/runner/run.py +149 -67
  10. pipecat/runner/types.py +5 -5
  11. pipecat/services/assemblyai/models.py +6 -0
  12. pipecat/services/assemblyai/stt.py +13 -5
  13. pipecat/services/asyncai/tts.py +3 -0
  14. pipecat/services/aws/llm.py +33 -16
  15. pipecat/services/aws/nova_sonic/context.py +69 -0
  16. pipecat/services/aws/nova_sonic/llm.py +199 -89
  17. pipecat/services/aws/stt.py +2 -0
  18. pipecat/services/aws_nova_sonic/context.py +8 -12
  19. pipecat/services/cartesia/stt.py +77 -70
  20. pipecat/services/cartesia/tts.py +3 -1
  21. pipecat/services/deepgram/flux/stt.py +4 -0
  22. pipecat/services/elevenlabs/tts.py +82 -41
  23. pipecat/services/fish/tts.py +3 -0
  24. pipecat/services/google/stt.py +4 -0
  25. pipecat/services/lmnt/tts.py +2 -0
  26. pipecat/services/neuphonic/tts.py +3 -0
  27. pipecat/services/openai/tts.py +37 -6
  28. pipecat/services/piper/tts.py +7 -9
  29. pipecat/services/playht/tts.py +3 -0
  30. pipecat/services/rime/tts.py +9 -8
  31. pipecat/services/riva/stt.py +3 -1
  32. pipecat/services/sarvam/tts.py +87 -10
  33. pipecat/services/speechmatics/stt.py +3 -1
  34. pipecat/services/stt_service.py +23 -10
  35. pipecat/services/tts_service.py +64 -13
  36. pipecat/transports/base_input.py +3 -0
  37. pipecat/transports/base_output.py +71 -77
  38. pipecat/transports/smallwebrtc/connection.py +5 -0
  39. pipecat/transports/smallwebrtc/request_handler.py +42 -0
  40. pipecat/utils/string.py +1 -0
  41. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
  42. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
  43. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
pipecat/transports/base_output.py CHANGED
@@ -294,15 +294,15 @@ class BaseOutputTransport(FrameProcessor):
         """
         await super().process_frame(frame, direction)

-        #
-        # System frames (like InterruptionFrame) are pushed immediately. Other
-        # frames require order so they are put in the sink queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
@@ -315,21 +315,6 @@ class BaseOutputTransport(FrameProcessor):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
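
Taken together, these two hunks move EndFrame out of the generic control-frame branch and ahead of the sink queue, so stop() drains this transport before the frame is forwarded downstream. A minimal sketch of that dispatch order (hypothetical names, not the pipecat API):

    import asyncio
    from dataclasses import dataclass

    @dataclass
    class Frame:
        kind: str  # "start", "end", or anything else

    class SketchOutputTransport:
        """Hypothetical skeleton; only the dispatch ordering mirrors the diff."""

        def __init__(self):
            self._sink_queue: asyncio.Queue = asyncio.Queue()

        async def process_frame(self, frame: Frame, push):
            if frame.kind == "start":
                await push(frame)        # StartFrame is forwarded before start() ...
                await self.start(frame)  # ... so every processor sees it first
            elif frame.kind == "end":
                await self.stop(frame)   # stop/flush this transport first ...
                await push(frame)        # ... then forward so the rest of the pipeline stops
            else:
                await self._sink_queue.put(frame)  # ordered frames go through the sink queue

        async def start(self, frame: Frame): ...
        async def stop(self, frame: Frame): ...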
@@ -411,6 +396,13 @@ class BaseOutputTransport(FrameProcessor):

         # Indicates if the bot is currently speaking.
         self._bot_speaking = False
+        # Last time a BotSpeakingFrame was pushed.
+        self._bot_speaking_frame_time = 0
+        # How often a BotSpeakingFrame should be pushed (value should be
+        # lower than the audio chunks).
+        self._bot_speaking_frame_period = 0.2
+        # Last time the bot actually spoke.
+        self._bot_speech_last_time = 0

         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
@@ -602,39 +594,71 @@ class BaseOutputTransport(FrameProcessor):

     async def _bot_started_speaking(self):
         """Handle bot started speaking event."""
-        if not self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-            )
+        if self._bot_speaking:
+            return

-            downstream_frame = BotStartedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStartedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+        )
+
+        downstream_frame = BotStartedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStartedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)

-            self._bot_speaking = True
+        self._bot_speaking = True

     async def _bot_stopped_speaking(self):
         """Handle bot stopped speaking event."""
-        if self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-            )
+        if not self._bot_speaking:
+            return

-            downstream_frame = BotStoppedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStoppedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+        )
+
+        downstream_frame = BotStoppedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStoppedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)

-            self._bot_speaking = False
+        self._bot_speaking = False
+
+        # Clean audio buffer (there could be tiny left overs if not multiple
+        # to our output chunk size).
+        self._audio_buffer = bytearray()

-            # Clean audio buffer (there could be tiny left overs if not multiple
-            # to our output chunk size).
-            self._audio_buffer = bytearray()
+    async def _bot_currently_speaking(self):
+        """Handle bot speaking event."""
+        await self._bot_started_speaking()
+
+        diff_time = time.time() - self._bot_speaking_frame_time
+        if diff_time >= self._bot_speaking_frame_period:
+            await self._transport.push_frame(BotSpeakingFrame())
+            await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+            self._bot_speaking_frame_time = time.time()
+
+        self._bot_speech_last_time = time.time()
+
+    async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+        if not is_silence(frame.audio):
+            await self._bot_currently_speaking()
+        else:
+            silence_duration = time.time() - self._bot_speech_last_time
+            if silence_duration > BOT_VAD_STOP_SECS:
+                await self._bot_stopped_speaking()
+
+    async def _handle_bot_speech(self, frame: Frame):
+        # TTS case.
+        if isinstance(frame, TTSAudioRawFrame):
+            await self._bot_currently_speaking()
+        # Speech stream case.
+        elif isinstance(frame, SpeechOutputAudioRawFrame):
+            await self._maybe_bot_currently_speaking(frame)

     async def _handle_frame(self, frame: Frame):
         """Handle various frame types with appropriate processing.
@@ -642,7 +666,9 @@ class BaseOutputTransport(FrameProcessor):
         Args:
             frame: The frame to handle.
         """
-        if isinstance(frame, OutputImageRawFrame):
+        if isinstance(frame, OutputAudioRawFrame):
+            await self._handle_bot_speech(frame)
+        elif isinstance(frame, OutputImageRawFrame):
             await self._set_video_image(frame)
         elif isinstance(frame, SpriteFrame):
             await self._set_video_images(frame.images)
@@ -706,39 +732,7 @@ class BaseOutputTransport(FrameProcessor):

     async def _audio_task_handler(self):
         """Main audio processing task handler."""
-        # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-        # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-        # every audio chunk.
-        TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-        BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-        bot_speaking_counter = 0
-        speech_last_speaking_time = 0
-
         async for frame in self._next_frame():
-            # Notify the bot started speaking upstream if necessary and that
-            # it's actually speaking.
-            is_speaking = False
-            if isinstance(frame, TTSAudioRawFrame):
-                is_speaking = True
-            elif isinstance(frame, SpeechOutputAudioRawFrame):
-                if not is_silence(frame.audio):
-                    is_speaking = True
-                    speech_last_speaking_time = time.time()
-                else:
-                    silence_duration = time.time() - speech_last_speaking_time
-                    if silence_duration > BOT_VAD_STOP_SECS:
-                        await self._bot_stopped_speaking()
-
-            if is_speaking:
-                await self._bot_started_speaking()
-                if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                    await self._transport.push_frame(BotSpeakingFrame())
-                    await self._transport.push_frame(
-                        BotSpeakingFrame(), FrameDirection.UPSTREAM
-                    )
-                    bot_speaking_counter = 0
-                bot_speaking_counter += 1
-
             # No need to push EndFrame, it's pushed from process_frame().
             if isinstance(frame, EndFrame):
                 break
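
The base_output.py hunks above replace the old per-chunk counter in _audio_task_handler with per-frame state: _bot_currently_speaking() rate-limits BotSpeakingFrame pushes to one per _bot_speaking_frame_period (0.2 s), and _maybe_bot_currently_speaking() ends speech after BOT_VAD_STOP_SECS of silence. A self-contained sketch of that timing logic, with illustrative constants (BOT_VAD_STOP_SECS is defined elsewhere in pipecat; the value here is an assumption):

    import time

    BOT_SPEAKING_FRAME_PERIOD = 0.2  # matches _bot_speaking_frame_period above
    BOT_VAD_STOP_SECS = 0.35         # illustrative; the real value lives in pipecat

    class SpeakingTracker:
        """Standalone model of the time-based logic that replaced the chunk counter."""

        def __init__(self):
            self.speaking = False
            self.last_frame_time = 0.0   # last BotSpeakingFrame push
            self.last_speech_time = 0.0  # last non-silent audio

        def on_audio(self, silent: bool) -> list[str]:
            """Return the notifications this audio chunk would trigger."""
            now = time.time()
            events = []
            if not silent:
                if not self.speaking:
                    self.speaking = True
                    events.append("BotStartedSpeakingFrame")
                if now - self.last_frame_time >= BOT_SPEAKING_FRAME_PERIOD:
                    self.last_frame_time = now
                    events.append("BotSpeakingFrame")
                self.last_speech_time = now
            elif self.speaking and now - self.last_speech_time > BOT_VAD_STOP_SECS:
                self.speaking = False
                events.append("BotStoppedSpeakingFrame")
            return events

Throttling by wall-clock time makes the BotSpeakingFrame cadence independent of the output chunk size, which the old counter (BOT_SPEAKING_CHUNK_PERIOD) had to account for explicitly.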
pipecat/transports/smallwebrtc/connection.py CHANGED
@@ -689,3 +689,8 @@ class SmallWebRTCConnection(BaseObject):
         )()
         if track:
             track.set_enabled(signalling_message.enabled)
+
+    async def add_ice_candidate(self, candidate):
+        """Handle incoming ICE candidates."""
+        logger.debug(f"Adding remote candidate: {candidate}")
+        await self.pc.addIceCandidate(candidate)
pipecat/transports/smallwebrtc/request_handler.py CHANGED
@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Awaitable, Callable, Dict, List, Optional

+from aiortc.sdp import candidate_from_sdp
 from fastapi import HTTPException
 from loguru import logger

@@ -39,6 +40,34 @@ class SmallWebRTCRequest:
     request_data: Optional[Any] = None


+@dataclass
+class IceCandidate:
+    """The remote ice candidate object received from the peer connection.
+
+    Parameters:
+        candidate: The ice candidate patch SDP string (Session Description Protocol).
+        sdp_mid: The SDP mid for the candidate patch.
+        sdp_mline_index: The SDP mline index for the candidate patch.
+    """
+
+    candidate: str
+    sdp_mid: str
+    sdp_mline_index: int
+
+
+@dataclass
+class SmallWebRTCPatchRequest:
+    """Small WebRTC transport session arguments for the runner.
+
+    Parameters:
+        pc_id: Identifier for the peer connection.
+        candidates: A list of ICE candidate patches.
+    """
+
+    pc_id: str
+    candidates: List[IceCandidate]
+
+
 class ConnectionMode(Enum):
     """Enum defining the connection handling modes."""

@@ -197,6 +226,19 @@ class SmallWebRTCRequestHandler:
             logger.debug(f"SmallWebRTC request details: {request}")
             raise

+    async def handle_patch_request(self, request: SmallWebRTCPatchRequest):
+        """Handle a SmallWebRTC patch candidate request."""
+        peer_connection = self._pcs_map.get(request.pc_id)
+
+        if not peer_connection:
+            raise HTTPException(status_code=404, detail="Peer connection not found")
+
+        for c in request.candidates:
+            candidate = candidate_from_sdp(c.candidate)
+            candidate.sdpMid = c.sdp_mid
+            candidate.sdpMLineIndex = c.sdp_mline_index
+            await peer_connection.add_ice_candidate(candidate)
+
     async def close(self):
         """Clear the connection map."""
         coros = [pc.disconnect() for pc in self._pcs_map.values()]
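
Together with add_ice_candidate() on SmallWebRTCConnection, this gives the runner a trickle-ICE path: a client can PATCH additional candidates after the initial offer/answer exchange. A hedged sketch of how a server might wire it up (the /api/offer route and the handler's constructor arguments are assumptions; only handle_patch_request() and the request dataclasses come from this diff):

    from fastapi import FastAPI

    from pipecat.transports.smallwebrtc.request_handler import (
        SmallWebRTCPatchRequest,
        SmallWebRTCRequestHandler,
    )

    app = FastAPI()
    handler = SmallWebRTCRequestHandler()  # assumed default-constructible here

    @app.patch("/api/offer")
    async def patch_ice_candidates(request: SmallWebRTCPatchRequest):
        # Looks up the peer connection by pc_id and feeds each candidate through
        # candidate_from_sdp() -> add_ice_candidate(); raises HTTP 404 if unknown.
        await handler.handle_patch_request(request)
        return {"pc_id": request.pc_id}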
pipecat/utils/string.py CHANGED
@@ -47,6 +47,7 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
     "!",
     "?",
     ";",
+    "…",
     # East Asian punctuation (Chinese (Traditional & Simplified), Japanese, Korean)
     "。",  # Ideographic full stop
     "?",  # Full-width question mark