dv-pipecat-ai 0.0.85.dev699__py3-none-any.whl → 0.0.85.dev814__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of dv-pipecat-ai has been flagged as potentially problematic.

Files changed (43)
  1. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/METADATA +23 -18
  2. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/RECORD +43 -43
  3. pipecat/adapters/services/aws_nova_sonic_adapter.py +116 -6
  4. pipecat/pipeline/runner.py +6 -2
  5. pipecat/pipeline/task.py +40 -55
  6. pipecat/processors/aggregators/llm_context.py +40 -2
  7. pipecat/processors/frameworks/rtvi.py +1 -0
  8. pipecat/runner/daily.py +59 -20
  9. pipecat/runner/run.py +149 -67
  10. pipecat/runner/types.py +5 -5
  11. pipecat/services/assemblyai/models.py +6 -0
  12. pipecat/services/assemblyai/stt.py +13 -5
  13. pipecat/services/asyncai/tts.py +3 -0
  14. pipecat/services/aws/llm.py +33 -16
  15. pipecat/services/aws/nova_sonic/context.py +69 -0
  16. pipecat/services/aws/nova_sonic/llm.py +199 -89
  17. pipecat/services/aws/stt.py +2 -0
  18. pipecat/services/aws_nova_sonic/context.py +8 -12
  19. pipecat/services/cartesia/stt.py +77 -70
  20. pipecat/services/cartesia/tts.py +3 -1
  21. pipecat/services/deepgram/flux/stt.py +4 -0
  22. pipecat/services/elevenlabs/tts.py +82 -41
  23. pipecat/services/fish/tts.py +3 -0
  24. pipecat/services/google/stt.py +4 -0
  25. pipecat/services/lmnt/tts.py +2 -0
  26. pipecat/services/neuphonic/tts.py +3 -0
  27. pipecat/services/openai/tts.py +37 -6
  28. pipecat/services/piper/tts.py +7 -9
  29. pipecat/services/playht/tts.py +3 -0
  30. pipecat/services/rime/tts.py +9 -8
  31. pipecat/services/riva/stt.py +3 -1
  32. pipecat/services/sarvam/tts.py +87 -10
  33. pipecat/services/speechmatics/stt.py +3 -1
  34. pipecat/services/stt_service.py +23 -10
  35. pipecat/services/tts_service.py +64 -13
  36. pipecat/transports/base_input.py +3 -0
  37. pipecat/transports/base_output.py +71 -77
  38. pipecat/transports/smallwebrtc/connection.py +5 -0
  39. pipecat/transports/smallwebrtc/request_handler.py +42 -0
  40. pipecat/utils/string.py +1 -0
  41. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/WHEEL +0 -0
  42. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/licenses/LICENSE +0 -0
  43. {dv_pipecat_ai-0.0.85.dev699.dist-info → dv_pipecat_ai-0.0.85.dev814.dist-info}/top_level.txt +0 -0
pipecat/transports/base_output.py CHANGED
@@ -294,15 +294,15 @@ class BaseOutputTransport(FrameProcessor):
         """
         await super().process_frame(frame, direction)

-        #
-        # System frames (like InterruptionFrame) are pushed immediately. Other
-        # frames require order so they are put in the sink queue.
-        #
         if isinstance(frame, StartFrame):
             # Push StartFrame before start(), because we want StartFrame to be
             # processed by every processor before any other frame is processed.
             await self.push_frame(frame, direction)
             await self.start(frame)
+        elif isinstance(frame, EndFrame):
+            await self.stop(frame)
+            # Keep pushing EndFrame down so all the pipeline stops nicely.
+            await self.push_frame(frame, direction)
         elif isinstance(frame, CancelFrame):
             await self.cancel(frame)
             await self.push_frame(frame, direction)
@@ -315,21 +315,6 @@ class BaseOutputTransport(FrameProcessor):
             await self.write_dtmf(frame)
         elif isinstance(frame, SystemFrame):
             await self.push_frame(frame, direction)
-        # Control frames.
-        elif isinstance(frame, EndFrame):
-            await self.stop(frame)
-            # Keep pushing EndFrame down so all the pipeline stops nicely.
-            await self.push_frame(frame, direction)
-        elif isinstance(frame, MixerControlFrame):
-            await self._handle_frame(frame)
-        # Other frames.
-        elif isinstance(frame, OutputAudioRawFrame):
-            await self._handle_frame(frame)
-        elif isinstance(frame, (OutputImageRawFrame, SpriteFrame)):
-            await self._handle_frame(frame)
-        # TODO(aleix): Images and audio should support presentation timestamps.
-        elif frame.pts:
-            await self._handle_frame(frame)
         elif direction == FrameDirection.UPSTREAM:
             await self.push_frame(frame, direction)
         else:
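
Taken together, these two hunks move EndFrame out of the generic control-frame branch and ahead of the sink queue, so stop() drains this transport before the frame is forwarded downstream. A minimal sketch of that dispatch order (hypothetical names, not the pipecat API):

    import asyncio
    from dataclasses import dataclass

    @dataclass
    class Frame:
        kind: str  # "start", "end", or anything else

    class SketchOutputTransport:
        """Hypothetical skeleton; only the dispatch ordering mirrors the diff."""

        def __init__(self):
            self._sink_queue: asyncio.Queue = asyncio.Queue()

        async def process_frame(self, frame: Frame, push):
            if frame.kind == "start":
                await push(frame)        # StartFrame is forwarded before start() ...
                await self.start(frame)  # ... so every processor sees it first
            elif frame.kind == "end":
                await self.stop(frame)   # stop/flush this transport first ...
                await push(frame)        # ... then forward so the rest of the pipeline stops
            else:
                await self._sink_queue.put(frame)  # ordered frames go through the sink queue

        async def start(self, frame: Frame): ...
        async def stop(self, frame: Frame): ...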
@@ -411,6 +396,13 @@ class BaseOutputTransport(FrameProcessor):

         # Indicates if the bot is currently speaking.
         self._bot_speaking = False
+        # Last time a BotSpeakingFrame was pushed.
+        self._bot_speaking_frame_time = 0
+        # How often a BotSpeakingFrame should be pushed (value should be
+        # lower than the audio chunks).
+        self._bot_speaking_frame_period = 0.2
+        # Last time the bot actually spoke.
+        self._bot_speech_last_time = 0

         self._audio_task: Optional[asyncio.Task] = None
         self._video_task: Optional[asyncio.Task] = None
@@ -602,39 +594,71 @@ class BaseOutputTransport(FrameProcessor):

     async def _bot_started_speaking(self):
         """Handle bot started speaking event."""
-        if not self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
-            )
+        if self._bot_speaking:
+            return

-            downstream_frame = BotStartedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStartedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} started speaking"
+        )
+
+        downstream_frame = BotStartedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStartedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)

-            self._bot_speaking = True
+        self._bot_speaking = True

     async def _bot_stopped_speaking(self):
         """Handle bot stopped speaking event."""
-        if self._bot_speaking:
-            self._transport.logger.debug(
-                f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
-            )
+        if not self._bot_speaking:
+            return

-            downstream_frame = BotStoppedSpeakingFrame()
-            downstream_frame.transport_destination = self._destination
-            upstream_frame = BotStoppedSpeakingFrame()
-            upstream_frame.transport_destination = self._destination
-            await self._transport.push_frame(downstream_frame)
-            await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)
+        self._transport.logger.debug(
+            f"Bot{f' [{self._destination}]' if self._destination else ''} stopped speaking"
+        )
+
+        downstream_frame = BotStoppedSpeakingFrame()
+        downstream_frame.transport_destination = self._destination
+        upstream_frame = BotStoppedSpeakingFrame()
+        upstream_frame.transport_destination = self._destination
+        await self._transport.push_frame(downstream_frame)
+        await self._transport.push_frame(upstream_frame, FrameDirection.UPSTREAM)

-            self._bot_speaking = False
+        self._bot_speaking = False
+
+        # Clean audio buffer (there could be tiny left overs if not multiple
+        # to our output chunk size).
+        self._audio_buffer = bytearray()

-            # Clean audio buffer (there could be tiny left overs if not multiple
-            # to our output chunk size).
-            self._audio_buffer = bytearray()
+    async def _bot_currently_speaking(self):
+        """Handle bot speaking event."""
+        await self._bot_started_speaking()
+
+        diff_time = time.time() - self._bot_speaking_frame_time
+        if diff_time >= self._bot_speaking_frame_period:
+            await self._transport.push_frame(BotSpeakingFrame())
+            await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
+            self._bot_speaking_frame_time = time.time()
+
+        self._bot_speech_last_time = time.time()
+
+    async def _maybe_bot_currently_speaking(self, frame: SpeechOutputAudioRawFrame):
+        if not is_silence(frame.audio):
+            await self._bot_currently_speaking()
+        else:
+            silence_duration = time.time() - self._bot_speech_last_time
+            if silence_duration > BOT_VAD_STOP_SECS:
+                await self._bot_stopped_speaking()
+
+    async def _handle_bot_speech(self, frame: Frame):
+        # TTS case.
+        if isinstance(frame, TTSAudioRawFrame):
+            await self._bot_currently_speaking()
+        # Speech stream case.
+        elif isinstance(frame, SpeechOutputAudioRawFrame):
+            await self._maybe_bot_currently_speaking(frame)

     async def _handle_frame(self, frame: Frame):
         """Handle various frame types with appropriate processing.
@@ -642,7 +666,9 @@ class BaseOutputTransport(FrameProcessor):
         Args:
             frame: The frame to handle.
         """
-        if isinstance(frame, OutputImageRawFrame):
+        if isinstance(frame, OutputAudioRawFrame):
+            await self._handle_bot_speech(frame)
+        elif isinstance(frame, OutputImageRawFrame):
             await self._set_video_image(frame)
         elif isinstance(frame, SpriteFrame):
             await self._set_video_images(frame.images)
@@ -706,39 +732,7 @@ class BaseOutputTransport(FrameProcessor):

     async def _audio_task_handler(self):
         """Main audio processing task handler."""
-        # Push a BotSpeakingFrame every 200ms, we don't really need to push it
-        # at every audio chunk. If the audio chunk is bigger than 200ms, push at
-        # every audio chunk.
-        TOTAL_CHUNK_MS = self._params.audio_out_10ms_chunks * 10
-        BOT_SPEAKING_CHUNK_PERIOD = max(int(200 / TOTAL_CHUNK_MS), 1)
-        bot_speaking_counter = 0
-        speech_last_speaking_time = 0
-
         async for frame in self._next_frame():
-            # Notify the bot started speaking upstream if necessary and that
-            # it's actually speaking.
-            is_speaking = False
-            if isinstance(frame, TTSAudioRawFrame):
-                is_speaking = True
-            elif isinstance(frame, SpeechOutputAudioRawFrame):
-                if not is_silence(frame.audio):
-                    is_speaking = True
-                    speech_last_speaking_time = time.time()
-                else:
-                    silence_duration = time.time() - speech_last_speaking_time
-                    if silence_duration > BOT_VAD_STOP_SECS:
-                        await self._bot_stopped_speaking()
-
-            if is_speaking:
-                await self._bot_started_speaking()
-                if bot_speaking_counter % BOT_SPEAKING_CHUNK_PERIOD == 0:
-                    await self._transport.push_frame(BotSpeakingFrame())
-                    await self._transport.push_frame(
-                        BotSpeakingFrame(), FrameDirection.UPSTREAM
-                    )
-                    bot_speaking_counter = 0
-                bot_speaking_counter += 1
-
             # No need to push EndFrame, it's pushed from process_frame().
             if isinstance(frame, EndFrame):
                 break
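
The base_output.py hunks above replace the old per-chunk counter in _audio_task_handler with per-frame state: _bot_currently_speaking() rate-limits BotSpeakingFrame pushes to one per _bot_speaking_frame_period (0.2 s), and _maybe_bot_currently_speaking() ends speech after BOT_VAD_STOP_SECS of silence. A self-contained sketch of that timing logic, with illustrative constants (BOT_VAD_STOP_SECS is defined elsewhere in pipecat; the value here is an assumption):

    import time

    BOT_SPEAKING_FRAME_PERIOD = 0.2  # matches _bot_speaking_frame_period above
    BOT_VAD_STOP_SECS = 0.35         # illustrative; the real value lives in pipecat

    class SpeakingTracker:
        """Standalone model of the time-based logic that replaced the chunk counter."""

        def __init__(self):
            self.speaking = False
            self.last_frame_time = 0.0   # last BotSpeakingFrame push
            self.last_speech_time = 0.0  # last non-silent audio

        def on_audio(self, silent: bool) -> list[str]:
            """Return the notifications this audio chunk would trigger."""
            now = time.time()
            events = []
            if not silent:
                if not self.speaking:
                    self.speaking = True
                    events.append("BotStartedSpeakingFrame")
                if now - self.last_frame_time >= BOT_SPEAKING_FRAME_PERIOD:
                    self.last_frame_time = now
                    events.append("BotSpeakingFrame")
                self.last_speech_time = now
            elif self.speaking and now - self.last_speech_time > BOT_VAD_STOP_SECS:
                self.speaking = False
                events.append("BotStoppedSpeakingFrame")
            return events

Throttling by wall-clock time makes the BotSpeakingFrame cadence independent of the output chunk size, which the old counter (BOT_SPEAKING_CHUNK_PERIOD) had to account for explicitly.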
pipecat/transports/smallwebrtc/connection.py CHANGED
@@ -689,3 +689,8 @@ class SmallWebRTCConnection(BaseObject):
         )()
         if track:
             track.set_enabled(signalling_message.enabled)
+
+    async def add_ice_candidate(self, candidate):
+        """Handle incoming ICE candidates."""
+        logger.debug(f"Adding remote candidate: {candidate}")
+        await self.pc.addIceCandidate(candidate)
pipecat/transports/smallwebrtc/request_handler.py CHANGED
@@ -14,6 +14,7 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Any, Awaitable, Callable, Dict, List, Optional

+from aiortc.sdp import candidate_from_sdp
 from fastapi import HTTPException
 from loguru import logger

@@ -39,6 +40,34 @@ class SmallWebRTCRequest:
     request_data: Optional[Any] = None


+@dataclass
+class IceCandidate:
+    """The remote ice candidate object received from the peer connection.
+
+    Parameters:
+        candidate: The ice candidate patch SDP string (Session Description Protocol).
+        sdp_mid: The SDP mid for the candidate patch.
+        sdp_mline_index: The SDP mline index for the candidate patch.
+    """
+
+    candidate: str
+    sdp_mid: str
+    sdp_mline_index: int
+
+
+@dataclass
+class SmallWebRTCPatchRequest:
+    """Small WebRTC transport session arguments for the runner.
+
+    Parameters:
+        pc_id: Identifier for the peer connection.
+        candidates: A list of ICE candidate patches.
+    """
+
+    pc_id: str
+    candidates: List[IceCandidate]
+
+
 class ConnectionMode(Enum):
     """Enum defining the connection handling modes."""

@@ -197,6 +226,19 @@ class SmallWebRTCRequestHandler:
             logger.debug(f"SmallWebRTC request details: {request}")
             raise

+    async def handle_patch_request(self, request: SmallWebRTCPatchRequest):
+        """Handle a SmallWebRTC patch candidate request."""
+        peer_connection = self._pcs_map.get(request.pc_id)
+
+        if not peer_connection:
+            raise HTTPException(status_code=404, detail="Peer connection not found")
+
+        for c in request.candidates:
+            candidate = candidate_from_sdp(c.candidate)
+            candidate.sdpMid = c.sdp_mid
+            candidate.sdpMLineIndex = c.sdp_mline_index
+            await peer_connection.add_ice_candidate(candidate)
+
     async def close(self):
         """Clear the connection map."""
         coros = [pc.disconnect() for pc in self._pcs_map.values()]
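
Together with add_ice_candidate() on SmallWebRTCConnection, this gives the runner a trickle-ICE path: a client can PATCH additional candidates after the initial offer/answer exchange. A hedged sketch of how a server might wire it up (the /api/offer route and the handler's constructor arguments are assumptions; only handle_patch_request() and the request dataclasses come from this diff):

    from fastapi import FastAPI

    from pipecat.transports.smallwebrtc.request_handler import (
        SmallWebRTCPatchRequest,
        SmallWebRTCRequestHandler,
    )

    app = FastAPI()
    handler = SmallWebRTCRequestHandler()  # assumed default-constructible here

    @app.patch("/api/offer")
    async def patch_ice_candidates(request: SmallWebRTCPatchRequest):
        # Looks up the peer connection by pc_id and feeds each candidate through
        # candidate_from_sdp() -> add_ice_candidate(); raises HTTP 404 if unknown.
        await handler.handle_patch_request(request)
        return {"pc_id": request.pc_id}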
pipecat/utils/string.py CHANGED
@@ -47,6 +47,7 @@ SENTENCE_ENDING_PUNCTUATION: FrozenSet[str] = frozenset(
     "!",
     "?",
     ";",
+    "…",
     # East Asian punctuation (Chinese (Traditional & Simplified), Japanese, Korean)
     "。",  # Ideographic full stop
     "?",  # Full-width question mark