dv-pipecat-ai 0.0.85.dev699__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +43 -43
- pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
- pipecat/pipeline/runner.py +6 -2
- pipecat/pipeline/task.py +40 -55
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/frameworks/rtvi.py +1 -0
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +149 -67
- pipecat/runner/types.py +5 -5
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +3 -0
- pipecat/services/aws/llm.py +33 -16
- pipecat/services/aws/nova_sonic/context.py +69 -0
- pipecat/services/aws/nova_sonic/llm.py +199 -89
- pipecat/services/aws/stt.py +2 -0
- pipecat/services/aws_nova_sonic/context.py +8 -12
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +3 -1
- pipecat/services/deepgram/flux/stt.py +4 -0
- pipecat/services/elevenlabs/tts.py +82 -41
- pipecat/services/fish/tts.py +3 -0
- pipecat/services/google/stt.py +4 -0
- pipecat/services/lmnt/tts.py +2 -0
- pipecat/services/neuphonic/tts.py +3 -0
- pipecat/services/openai/tts.py +37 -6
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +3 -0
- pipecat/services/rime/tts.py +9 -8
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/sarvam/tts.py +87 -10
- pipecat/services/speechmatics/stt.py +3 -1
- pipecat/services/stt_service.py +23 -10
- pipecat/services/tts_service.py +64 -13
- pipecat/transports/base_input.py +3 -0
- pipecat/transports/base_output.py +71 -77
- pipecat/transports/smallwebrtc/connection.py +5 -0
- pipecat/transports/smallwebrtc/request_handler.py +42 -0
- pipecat/utils/string.py +1 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
pipecat/transports/base_output.py
CHANGED

@@ -294,15 +294,15 @@ class BaseOutputTransport(FrameProcessor):
         """
         await super().process_frame(frame, direction)

-        #
-        # System frames (like InterruptionFrame) are pushed immediately. Other
-        # frames require order so they are put in the sink queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
@@ -315,21 +315,6 @@ class BaseOutputTransport(FrameProcessor):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
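Note: taken together, the two hunks above move EndFrame handling out of the control-frames branch and into the top-level dispatch, so stop() runs as part of process_frame() itself. A condensed sketch of the resulting order (not the full method; the removed audio/image/mixer branches now flow through the sink path shown in later hunks):

async def process_frame(self, frame, direction):
    await super().process_frame(frame, direction)
    if isinstance(frame, StartFrame):
        # Pushed first so every processor sees StartFrame before anything else.
        await self.push_frame(frame, direction)
        await self.start(frame)
    elif isinstance(frame, EndFrame):
        # stop() first, then keep propagating so the pipeline shuts down cleanly.
        await self.stop(frame)
        await self.push_frame(frame, direction)
    elif isinstance(frame, CancelFrame):
        await self.cancel(frame)
        await self.push_frame(frame, direction)
    elif isinstance(frame, SystemFrame):
        await self.push_frame(frame, direction)
    # Remaining output frames are queued so ordering is preserved (not shown).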
@@ -411,6 +396,13 @@ class BaseOutputTransport(FrameProcessor):

         # Indicates if the bot is currently speaking.
         self._bot_speaking = False
+        # Last time a BotSpeakingFrame was pushed.
+        self._bot_speaking_frame_time = 0
+        # How often a BotSpeakingFrame should be pushed (value should be
+        # lower than the audio chunks).
+        self._bot_speaking_frame_period = 0.2
+        # Last time the bot actually spoke.
+        self._bot_speech_last_time = 0

         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
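Note: these fields replace the old per-chunk counter (removed in a later hunk) with wall-clock throttling. A minimal sketch of the arithmetic, assuming the default of two 10 ms chunks per write (20 ms):

import time

BOT_SPEAKING_FRAME_PERIOD = 0.2  # seconds, mirrors _bot_speaking_frame_period

_last_push = 0.0

def should_push_bot_speaking(now=None):
    # Fire at most once per period; with 20 ms chunks that is roughly 1 in 10
    # chunks, matching the old counter max(int(200 / 20), 1) == 10.
    global _last_push
    now = time.time() if now is None else now
    if now - _last_push >= BOT_SPEAKING_FRAME_PERIOD:
        _last_push = now
        return True
    return False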
@@ -602,39 +594,71 @@ class BaseOutputTransport(FrameProcessor):

     async def _bot_started_speaking(self):
         """Handle bot started speaking event."""
-        if not self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-            )
+        if self._bot_speaking:
+            return

-            downstream_frame = BotStartedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStartedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+        )
+
+        downstream_frame = BotStartedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStartedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)

-            self._bot_speaking = True
+        self._bot_speaking = True

     async def _bot_stopped_speaking(self):
         """Handle bot stopped speaking event."""
-        if self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-            )
+        if not self._bot_speaking:
+            return

-            downstream_frame = BotStoppedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStoppedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+        )
+
+        downstream_frame = BotStoppedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStoppedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)

-            self._bot_speaking = False
+        self._bot_speaking = False
+
+        # Clean audio buffer (there could be tiny left overs if not multiple
+        # to our output chunk size).
+        self._audio_buffer = bytearray()

-            # Clean audio buffer (there could be tiny left overs if not multiple
-            # to our output chunk size).
-            self._audio_buffer = bytearray()
+    async def _bot_currently_speaking(self):
+        """Handle bot speaking event."""
+        await self._bot_started_speaking()
+
+        diff_time = time.time() - self._bot_speaking_frame_time
+        if diff_time >= self._bot_speaking_frame_period:
+            await self._transport.push_frame(BotSpeakingFrame())
+            await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+            self._bot_speaking_frame_time = time.time()
+
+        self._bot_speech_last_time = time.time()
+
+    async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+        if not is_silence(frame.audio):
+            await self._bot_currently_speaking()
+        else:
+            silence_duration = time.time() - self._bot_speech_last_time
+            if silence_duration > BOT_VAD_STOP_SECS:
+                await self._bot_stopped_speaking()
+
+    async def _handle_bot_speech(self, frame: Frame):
+        # TTS case.
+        if isinstance(frame, TTSAudioRawFrame):
+            await self._bot_currently_speaking()
+        # Speech stream case.
+        elif isinstance(frame, SpeechOutputAudioRawFrame):
+            await self._maybe_bot_currently_speaking(frame)

     async def _handle_frame(self, frame: Frame):
         """Handle various frame types with appropriate processing.
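Note: the started/stopped handlers are now idempotent early-return methods, and speech detection lives in the three new helpers. A self-contained sketch of the silence gate they implement, with a naive is_silence() stand-in and an assumed BOT_VAD_STOP_SECS value (both are pipecat internals not shown in this diff):

import array
import time

BOT_VAD_STOP_SECS = 0.35  # assumed value; pipecat defines the real constant

def is_silence(audio: bytes, threshold: int = 200) -> bool:
    # Naive stand-in: treat 16-bit PCM whose peak amplitude stays below a
    # threshold as silence.
    samples = array.array("h", audio)
    return not samples or max(abs(s) for s in samples) < threshold

class SpeechGate:
    def __init__(self):
        self.speaking = False
        self.last_speech = 0.0

    def feed(self, audio: bytes):
        if not is_silence(audio):
            if not self.speaking:
                self.speaking = True  # analogous to _bot_started_speaking()
            self.last_speech = time.time()
        elif self.speaking and time.time() - self.last_speech > BOT_VAD_STOP_SECS:
            self.speaking = False  # analogous to _bot_stopped_speaking()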
@@ -642,7 +666,9 @@ class BaseOutputTransport(FrameProcessor):
         Args:
             frame: The frame to handle.
         """
-        if isinstance(frame, OutputImageRawFrame):
+        if isinstance(frame, OutputAudioRawFrame):
+            await self._handle_bot_speech(frame)
+        elif isinstance(frame, OutputImageRawFrame):
             await self._set_video_image(frame)
         elif isinstance(frame, SpriteFrame):
             await self._set_video_images(frame.images)
@@ -706,39 +732,7 @@ class BaseOutputTransport(FrameProcessor):

     async def _audio_task_handler(self):
         """Main audio processing task handler."""
-        # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-        # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-        # every audio chunk.
-        TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-        BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-        bot_speaking_counter = 0
-        speech_last_speaking_time = 0
-
         async for frame in self._next_frame():
-            # Notify the bot started speaking upstream if necessary and that
-            # it's actually speaking.
-            is_speaking = False
-            if isinstance(frame, TTSAudioRawFrame):
-                is_speaking = True
-            elif isinstance(frame, SpeechOutputAudioRawFrame):
-                if not is_silence(frame.audio):
-                    is_speaking = True
-                    speech_last_speaking_time = time.time()
-                else:
-                    silence_duration = time.time() - speech_last_speaking_time
-                    if silence_duration > BOT_VAD_STOP_SECS:
-                        await self._bot_stopped_speaking()
-
-            if is_speaking:
-                await self._bot_started_speaking()
-                if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                    await self._transport.push_frame(BotSpeakingFrame())
-                    await self._transport.push_frame(
-                        BotSpeakingFrame(), FrameDirection.UPSTREAM
-                    )
-                    bot_speaking_counter = 0
-                bot_speaking_counter += 1
-
             # No need to push EndFrame, it's pushed from process_frame().
             if isinstance(frame, EndFrame):
                 break
pipecat/transports/smallwebrtc/connection.py
CHANGED

@@ -689,3 +689,8 @@ class SmallWebRTCConnection(BaseObject):
         )()
         if track:
             track.set_enabled(signalling_message.enabled)
+
+    async def add_ice_candidate(self, candidate):
+        """Handle incoming ICE candidates."""
+        logger.debug(f"Adding remote candidate: {candidate}")
+        await self.pc.addIceCandidate(candidate)
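Note: the new method forwards to aiortc's RTCPeerConnection.addIceCandidate(), which expects an RTCIceCandidate object. A hedged usage sketch (the payload shape is an assumption; aiortc's own examples strip the "candidate:" prefix before calling candidate_from_sdp()):

from aiortc.sdp import candidate_from_sdp

payload = {  # hypothetical browser trickle-ICE message
    "candidate": "candidate:1 1 UDP 2130706431 192.0.2.1 54321 typ host",
    "sdpMid": "0",
    "sdpMLineIndex": 0,
}

raw = payload["candidate"]
if raw.startswith("candidate:"):
    raw = raw.split(":", 1)[1]

ice = candidate_from_sdp(raw)
ice.sdpMid = payload["sdpMid"]
ice.sdpMLineIndex = payload["sdpMLineIndex"]
# await connection.add_ice_candidate(ice)  # inside an async context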
pipecat/transports/smallwebrtc/request_handler.py
CHANGED

@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Awaitable, Callable, Dict, List, Optional

+from aiortc.sdp import candidate_from_sdp
 from fastapi import HTTPException
 from loguru import logger

@@ -39,6 +40,34 @@ class SmallWebRTCRequest:
     request_data: Optional[Any] = None


+@dataclass
+class IceCandidate:
+    """The remote ice candidate object received from the peer connection.
+
+    Parameters:
+        candidate: The ice candidate patch SDP string (Session Description Protocol).
+        sdp_mid: The SDP mid for the candidate patch.
+        sdp_mline_index: The SDP mline index for the candidate patch.
+    """
+
+    candidate: str
+    sdp_mid: str
+    sdp_mline_index: int
+
+
+@dataclass
+class SmallWebRTCPatchRequest:
+    """Small WebRTC transport session arguments for the runner.
+
+    Parameters:
+        pc_id: Identifier for the peer connection.
+        candidates: A list of ICE candidate patches.
+    """
+
+    pc_id: str
+    candidates: List[IceCandidate]
+
+
 class ConnectionMode(Enum):
     """Enum defining the connection handling modes."""

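Note: for illustration, a hypothetical JSON body mapped onto the two new dataclasses (field names follow the definitions above):

body = {
    "pc_id": "abc123",
    "candidates": [
        {
            "candidate": "candidate:1 1 UDP 2130706431 192.0.2.1 54321 typ host",
            "sdp_mid": "0",
            "sdp_mline_index": 0,
        }
    ],
}

request = SmallWebRTCPatchRequest(
    pc_id=body["pc_id"],
    candidates=[IceCandidate(**c) for c in body["candidates"]],
)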
@@ -197,6 +226,19 @@ class SmallWebRTCRequestHandler:
             logger.debug(f"SmallWebRTC request details: {request}")
             raise

+    async def handle_patch_request(self, request: SmallWebRTCPatchRequest):
+        """Handle a SmallWebRTC patch candidate request."""
+        peer_connection = self._pcs_map.get(request.pc_id)
+
+        if not peer_connection:
+            raise HTTPException(status_code=404, detail="Peer connection not found")
+
+        for c in request.candidates:
+            candidate = candidate_from_sdp(c.candidate)
+            candidate.sdpMid = c.sdp_mid
+            candidate.sdpMLineIndex = c.sdp_mline_index
+            await peer_connection.add_ice_candidate(candidate)
+
     async def close(self):
         """Clear the connection map."""
         coros = [pc.disconnect() for pc in self._pcs_map.values()]
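Note: a minimal sketch of how a runner might expose the new handler over HTTP. The route path, app wiring, and handler construction are assumptions, not part of this diff:

from fastapi import FastAPI

app = FastAPI()
handler = SmallWebRTCRequestHandler()  # constructor arguments omitted here

@app.patch("/api/offer")
async def patch_offer(request: SmallWebRTCPatchRequest):
    # Forwards trickle-ICE candidates to the matching peer connection;
    # handle_patch_request() raises HTTPException(404) for an unknown pc_id.
    await handler.handle_patch_request(request)
    return {"status": "ok"}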
pipecat/utils/string.py
CHANGED
File without changes

{dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE
RENAMED
File without changes

{dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt
RENAMED
File without changes