dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -15,9 +15,10 @@ service-specific adapter.
 """
 
 import base64
+import copy
 import io
 from dataclasses import dataclass
-from typing import Any, List, Optional, TypeAlias, Union
+from typing import TYPE_CHECKING, Any, List, Optional, TypeAlias, Union
 
 from loguru import logger
 from openai._types import NOT_GIVEN as OPEN_AI_NOT_GIVEN
@@ -31,6 +32,9 @@ from PIL import Image
 from pipecat.adapters.schemas.tools_schema import ToolsSchema
 from pipecat.frames.frames import AudioRawFrame
 
+if TYPE_CHECKING:
+    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+
 # "Re-export" types from OpenAI that we're using as universal context types.
 # NOTE: if universal message types need to someday diverge from OpenAI's, we
 # should consider managing our own definitions. But we should do so carefully,
@@ -65,6 +69,26 @@ class LLMContext:
     and content formatting.
     """
 
+    @staticmethod
+    def from_openai_context(openai_context: "OpenAILLMContext") -> "LLMContext":
+        """Create a universal LLM context from an OpenAI-specific context.
+
+        NOTE: this should only be used internally, for facilitating migration
+        from OpenAILLMContext to LLMContext. New user code should use
+        LLMContext directly.
+
+        Args:
+            openai_context: The OpenAI LLM context to convert.
+
+        Returns:
+            New LLMContext instance with converted messages and settings.
+        """
+        return LLMContext(
+            messages=openai_context.get_messages(),
+            tools=openai_context.tools,
+            tool_choice=openai_context.tool_choice,
+        )
+
     def __init__(
         self,
         messages: Optional[List[LLMContextMessage]] = None,
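The new from_openai_context() helper is a thin constructor wrapper; a minimal sketch of the migration path it enables (the sample message and the OpenAILLMContext constructor arguments are assumptions, not taken from this diff):

    from pipecat.processors.aggregators.llm_context import LLMContext
    from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext

    # An existing OpenAI-specific context, e.g. from older user code.
    openai_context = OpenAILLMContext(messages=[{"role": "user", "content": "Hello!"}])

    # Convert to the universal context; messages, tools, and tool_choice carry over.
    universal = LLMContext.from_openai_context(openai_context)
    print(universal.get_messages())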
@@ -82,6 +106,19 @@ class LLMContext:
         self._tools: ToolsSchema | NotGiven = LLMContext._normalize_and_validate_tools(tools)
         self._tool_choice: LLMContextToolChoice | NotGiven = tool_choice
 
+    @property
+    def messages(self) -> List[LLMContextMessage]:
+        """Get the current messages list.
+
+        NOTE: This is equivalent to calling `get_messages()` with no filter. If
+        you want to filter out LLM-specific messages that don't pertain to your
+        LLM, use `get_messages()` directly.
+
+        Returns:
+            List of conversation messages.
+        """
+        return self.get_messages()
+
     def get_messages(self, llm_specific_filter: Optional[str] = None) -> List[LLMContextMessage]:
         """Get the current messages list.
 
@@ -89,7 +126,8 @@ class LLMContext:
             llm_specific_filter: Optional filter to return LLM-specific
                 messages for the given LLM, in addition to the standard
                 messages. If messages end up being filtered, an error will be
-                logged.
+                logged; this is intended to catch accidental use of
+                incompatible LLM-specific messages.
 
         Returns:
             List of conversation messages.
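The messages property is documented as equivalent to get_messages() with no filter; a short sketch of the difference (the "anthropic" filter value is purely illustrative):

    from pipecat.processors.aggregators.llm_context import LLMContext

    context = LLMContext(messages=[{"role": "user", "content": "Hi"}])

    # The property applies no LLM-specific filter, so these are the same.
    assert context.messages == context.get_messages()

    # With a filter, LLM-specific messages for that LLM are also returned;
    # incompatible LLM-specific messages are filtered out with an error logged.
    filtered = context.get_messages(llm_specific_filter="anthropic")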
@@ -23,7 +23,6 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -37,6 +36,7 @@ from pipecat.frames.frames import (
     FunctionCallsStartedFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMFullResponseEndFrame,
     LLMFullResponseStartFrame,
     LLMMessagesAppendFrame,
@@ -49,8 +49,8 @@ from pipecat.frames.frames import (
     OpenAILLMContextAssistantTimestampFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
+    TranscriptDropFrame,
     TranscriptionFrame,
     UserImageRawFrame,
     UserStartedSpeakingFrame,
@@ -139,7 +139,7 @@ class LLMFullResponseAggregator(FrameProcessor):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await self._call_event_handler("on_completion", self._aggregation, False)
             self._aggregation = ""
             self._started = False
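Processors that keyed off StartInterruptionFrame now key off InterruptionFrame; a hedged sketch of updating a custom frame processor (the processor and its state are hypothetical, not part of the package):

    from pipecat.frames.frames import Frame, InterruptionFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class MyResetOnInterruption(FrameProcessor):
        """Illustrative only: clears per-turn state when an interruption arrives."""

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            # Previously this checked isinstance(frame, StartInterruptionFrame).
            if isinstance(frame, InterruptionFrame):
                self._partial_text = ""  # reset whatever per-turn state you keep
            await self.push_frame(frame, direction)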
@@ -446,6 +446,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         self._latest_final_transcript = ""
         self._last_user_speaking_time = 0
         self._last_aggregation_push_time = 0
+        self._pending_transcription_ids: List[int] = []
 
     async def reset(self):
         """Reset the aggregation state and interruption strategies."""
@@ -453,6 +454,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         self._was_bot_speaking = False
         self._seen_interim_results = False
         self._waiting_for_aggregation = False
+        self._pending_transcription_ids.clear()
         [await s.reset() for s in self._interruption_strategies]
 
     async def handle_aggregation(self, aggregation: str):
@@ -470,8 +472,8 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             frame: The frame to process.
             direction: The direction of frame flow in the pipeline.
         """
-        if isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame")
+        if isinstance(frame, InterruptionFrame):
+            self.logger.debug("Received InterruptionFrame")
         await super().process_frame(frame, direction)
 
         if isinstance(frame, StartFrame):
@@ -516,9 +518,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             self.set_tools(frame.tools)
         elif isinstance(frame, LLMSetToolChoiceFrame):
             self.set_tool_choice(frame.tool_choice)
-        elif isinstance(frame, LLMFullResponseStartFrame):
-            self._last_llm_response_start_time = time.time()
-            self._latest_final_transcript = ""
         elif isinstance(frame, SpeechControlParamsFrame):
             self._vad_params = frame.vad_params
             self._turn_params = frame.turn_params
@@ -545,13 +544,14 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
 
                 if should_interrupt:
                     self.logger.debug(
-                        "Interruption conditions met - pushing BotInterruptionFrame and aggregation"
+                        "Interruption conditions met - pushing interruption and aggregation"
                     )
-                    await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
+                    await self.push_interruption_task_frame_and_wait()
                     await self._process_aggregation()
                 else:
                     self.logger.debug("Interruption conditions not met - not pushing aggregation")
-                    # Don't process aggregation, just reset it
+                    # Don't process aggregation, discard pending transcriptions and reset
+                    await self._discard_pending_transcriptions("interruption_conditions_not_met")
                     await self.reset()
             else:
                 if trigger_interruption:
@@ -559,7 +559,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
                         "Triggering interruption - pushing BotInterruptionFrame and aggregation"
                     )
                     # await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
-                    await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+                    await self.push_frame(InterruptionFrame(), FrameDirection.DOWNSTREAM)
                     self.logger.debug("Pushed BotInterruptionFrame")
                 # No interruption config - normal behavior (always push aggregation)
                 await self._process_aggregation()
@@ -591,6 +591,13 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
 
         return any([await should_interrupt(s) for s in self._interruption_strategies])
 
+    async def _discard_pending_transcriptions(self, reason: str):
+        """Notify upstream processors that pending transcripts should be dropped."""
+        if self._pending_transcription_ids:
+            drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
+            await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
+            self._pending_transcription_ids.clear()
+
     async def _start(self, frame: StartFrame):
         self._create_aggregation_task()
 
@@ -617,10 +624,19 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         for s in self.interruption_strategies:
             await s.append_audio(frame.audio, frame.sample_rate)
 
+    async def _discard_pending_transcriptions(self, reason: str):
+        """Notify upstream processors that pending transcripts should be dropped."""
+        if self._pending_transcription_ids:
+            drop_frame = TranscriptDropFrame(transcript_ids=list(self._pending_transcription_ids))
+            await self.push_frame(drop_frame, FrameDirection.UPSTREAM)
+            self._pending_transcription_ids.clear()
+
     async def _handle_user_started_speaking(self, frame: UserStartedSpeakingFrame):
         if len(self._aggregation) > 0:
             self.logger.debug(f"Dropping {self._aggregation}")
             self._aggregation = ""
+        await self._discard_pending_transcriptions("user_started_speaking")
+        self._latest_final_transcript = ""
         self._last_user_speaking_time = time.time()
         self._user_speaking = True
         self._waiting_for_aggregation = True
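TranscriptDropFrame is pushed upstream with the ids of transcriptions that were aggregated but never used; a hypothetical upstream processor could use it for bookkeeping (this handler is illustrative and not an API shipped by the package):

    from pipecat.frames.frames import Frame, TranscriptDropFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor


    class TranscriptBookkeeper(FrameProcessor):
        """Illustrative only: records which transcription frames were discarded downstream."""

        def __init__(self, **kwargs):
            super().__init__(**kwargs)
            self._discarded_ids = set()

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, TranscriptDropFrame):
                # transcript_ids carries the ids of the dropped TranscriptionFrames.
                self._discarded_ids.update(frame.transcript_ids)
            await self.push_frame(frame, direction)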
@@ -664,6 +680,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
             return
 
         self._aggregation += f" {text}" if self._aggregation else text
+        self._pending_transcription_ids.append(frame.id)
         # We just got a final result, so let's reset interim results.
         self._seen_interim_results = False
 
@@ -686,7 +703,6 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         elif (
             not self._bot_speaking
             and time_since_stopped < 3.0
-            and time.time() - self._last_llm_response_start_time > 3.0
             and self._latest_final_transcript != text
         ):
             self.logger.debug(
@@ -794,6 +810,7 @@ class LLMUserContextAggregator(LLMContextResponseAggregator):
         if self._bot_speaking and not self._params.enable_emulated_vad_interruptions:
             # If emulated VAD interruptions are disabled and bot is speaking, ignore
             logger.debug("Ignoring user speaking emulation, bot is speaking.")
+            await self._discard_pending_transcriptions("emulated_vad_ignored")
            await self.reset()
        else:
            # Either bot is not speaking, or emulated VAD interruptions are enabled
@@ -908,7 +925,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, LLMFullResponseStartFrame):
@@ -974,7 +991,7 @@ class LLMAssistantContextAggregator(LLMContextResponseAggregator):
         if frame.run_llm:
             await self.push_context_frame(FrameDirection.UPSTREAM)
 
-    async def _handle_interruptions(self, frame: StartInterruptionFrame):
+    async def _handle_interruptions(self, frame: InterruptionFrame):
         await self.push_aggregation()
         self._started = 0
         await self.reset()
@@ -13,7 +13,7 @@ LLM processing, and text-to-speech components in conversational AI pipelines.
 
 import asyncio
 import json
-from dataclasses import dataclass
+from abc import abstractmethod
 from typing import Any, Dict, List, Literal, Optional, Set
 
 from loguru import logger
@@ -23,7 +23,6 @@ from pipecat.audio.interruptions.base_interruption_strategy import BaseInterrupt
 from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
 from pipecat.audio.vad.vad_analyzer import VADParams
 from pipecat.frames.frames import (
-    BotInterruptionFrame,
     BotStartedSpeakingFrame,
     BotStoppedSpeakingFrame,
     CancelFrame,
@@ -37,6 +36,7 @@ from pipecat.frames.frames import (
     FunctionCallsStartedFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     LLMContextAssistantTimestampFrame,
     LLMContextFrame,
     LLMFullResponseEndFrame,
@@ -48,7 +48,6 @@ from pipecat.frames.frames import (
     LLMSetToolsFrame,
     SpeechControlParamsFrame,
     StartFrame,
-    StartInterruptionFrame,
     TextFrame,
     TranscriptionFrame,
     UserImageRawFrame,
@@ -171,6 +170,11 @@ class LLMContextAggregator(FrameProcessor):
         """Reset the aggregation state."""
         self._aggregation = ""
 
+    @abstractmethod
+    async def push_aggregation(self):
+        """Push the current aggregation downstream."""
+        pass
+
 
 class LLMUserAggregator(LLMContextAggregator):
     """User LLM aggregator that processes speech-to-text transcriptions.
@@ -303,7 +307,7 @@ class LLMUserAggregator(LLMContextAggregator):
         frame = LLMContextFrame(self._context)
         await self.push_frame(frame)
 
-    async def _push_aggregation(self):
+    async def push_aggregation(self):
         """Push the current aggregation based on interruption strategies and conditions."""
         if len(self._aggregation) > 0:
             if self.interruption_strategies and self._bot_speaking:
@@ -311,9 +315,9 @@ class LLMUserAggregator(LLMContextAggregator):
 
                 if should_interrupt:
                     logger.debug(
-                        "Interruption conditions met - pushing BotInterruptionFrame and aggregation"
+                        "Interruption conditions met - pushing interruption and aggregation"
                     )
-                    await self.push_frame(BotInterruptionFrame(), FrameDirection.UPSTREAM)
+                    await self.push_interruption_task_frame_and_wait()
                     await self._process_aggregation()
                 else:
                     logger.debug("Interruption conditions not met - not pushing aggregation")
@@ -394,7 +398,7 @@ class LLMUserAggregator(LLMContextAggregator):
         # pushing the aggregation as we will probably get a final transcription.
         if len(self._aggregation) > 0:
             if not self._seen_interim_results:
-                await self._push_aggregation()
+                await self.push_aggregation()
         # Handles the case where both the user and the bot are not speaking,
         # and the bot was previously speaking before the user interruption.
         # So in this case we are resetting the aggregation timer
@@ -473,7 +477,7 @@ class LLMUserAggregator(LLMContextAggregator):
             await self._maybe_emulate_user_speaking()
         except asyncio.TimeoutError:
             if not self._user_speaking:
-                await self._push_aggregation()
+                await self.push_aggregation()
 
             # If we are emulating VAD we still need to send the user stopped
             # speaking frame.
@@ -579,7 +583,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
         """
         await super().process_frame(frame, direction)
 
-        if isinstance(frame, StartInterruptionFrame):
+        if isinstance(frame, InterruptionFrame):
             await self._handle_interruptions(frame)
             await self.push_frame(frame, direction)
         elif isinstance(frame, LLMFullResponseStartFrame):
@@ -609,12 +613,12 @@ class LLMAssistantAggregator(LLMContextAggregator):
         elif isinstance(frame, UserImageRawFrame) and frame.request and frame.request.tool_call_id:
             await self._handle_user_image_frame(frame)
         elif isinstance(frame, BotStoppedSpeakingFrame):
-            await self._push_aggregation()
+            await self.push_aggregation()
             await self.push_frame(frame, direction)
         else:
             await self.push_frame(frame, direction)
 
-    async def _push_aggregation(self):
+    async def push_aggregation(self):
         """Push the current assistant aggregation with timestamp."""
         if not self._aggregation:
             return
@@ -645,8 +649,8 @@ class LLMAssistantAggregator(LLMContextAggregator):
         if frame.run_llm:
             await self.push_context_frame(FrameDirection.UPSTREAM)
 
-    async def _handle_interruptions(self, frame: StartInterruptionFrame):
-        await self._push_aggregation()
+    async def _handle_interruptions(self, frame: InterruptionFrame):
+        await self.push_aggregation()
         self._started = 0
         await self.reset()
 
@@ -780,7 +784,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
             text=frame.request.context,
         )
 
-        await self._push_aggregation()
+        await self.push_aggregation()
         await self.push_context_frame(FrameDirection.UPSTREAM)
 
     async def _handle_llm_start(self, _: LLMFullResponseStartFrame):
@@ -788,7 +792,7 @@ class LLMAssistantAggregator(LLMContextAggregator):
 
     async def _handle_llm_end(self, _: LLMFullResponseEndFrame):
         self._started -= 1
-        await self._push_aggregation()
+        await self.push_aggregation()
 
     async def _handle_text(self, frame: TextFrame):
         if not self._started:
@@ -12,14 +12,14 @@ in conversational pipelines.
 """
 
 from pipecat.frames.frames import TextFrame
-from pipecat.processors.aggregators.llm_response import LLMUserContextAggregator
-from pipecat.processors.aggregators.openai_llm_context import OpenAILLMContext
+from pipecat.processors.aggregators.llm_context import LLMContext
+from pipecat.processors.aggregators.llm_response_universal import LLMUserAggregator
 
 
-class UserResponseAggregator(LLMUserContextAggregator):
+class UserResponseAggregator(LLMUserAggregator):
     """Aggregates user responses into TextFrame objects.
 
-    This aggregator extends LLMUserContextAggregator to specifically handle
+    This aggregator extends LLMUserAggregator to specifically handle
     user input by collecting text responses and outputting them as TextFrame
     objects when the aggregation is complete.
     """
@@ -28,9 +28,9 @@ class UserResponseAggregator(LLMUserContextAggregator):
         """Initialize the user response aggregator.
 
        Args:
-            **kwargs: Additional arguments passed to parent LLMUserContextAggregator.
+            **kwargs: Additional arguments passed to parent LLMUserAggregator.
         """
-        super().__init__(context=OpenAILLMContext(), **kwargs)
+        super().__init__(context=LLMContext(), **kwargs)
 
     async def push_aggregation(self):
         """Push the aggregated user response as a TextFrame.
@@ -10,13 +10,22 @@ This module provides frame aggregation functionality to combine text and image
 frames into vision frames for multimodal processing.
 """
 
-from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame, VisionImageRawFrame
+from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
 
 class VisionImageFrameAggregator(FrameProcessor):
     """Aggregates consecutive text and image frames into vision frames.
 
+    .. deprecated:: 0.0.85
+        VisionImageRawFrame has been removed in favor of context frames
+        (LLMContextFrame or OpenAILLMContextFrame), so this aggregator is not
+        needed anymore. See the 12* examples for the new recommended pattern.
+
     This aggregator waits for a consecutive TextFrame and an InputImageRawFrame.
     After the InputImageRawFrame arrives it will output a VisionImageRawFrame
     combining both the text and image data for multimodal processing.
@@ -28,6 +37,17 @@ class VisionImageFrameAggregator(FrameProcessor):
         The aggregator starts with no cached text, waiting for the first
         TextFrame to arrive before it can create vision frames.
         """
+        import warnings
+
+        warnings.warn(
+            "VisionImageFrameAggregator is deprecated. "
+            "VisionImageRawFrame has been removed in favor of context frames "
+            "(LLMContextFrame or OpenAILLMContextFrame), so this aggregator is "
+            "not needed anymore. See the 12* examples for the new recommended "
+            "pattern.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__()
         self._describe_text = None
@@ -47,12 +67,14 @@ class VisionImageFrameAggregator(FrameProcessor):
             self._describe_text = frame.text
         elif isinstance(frame, InputImageRawFrame):
             if self._describe_text:
-                frame = VisionImageRawFrame(
+                context = OpenAILLMContext()
+                context.add_image_frame_message(
                     text=self._describe_text,
                     image=frame.image,
                     size=frame.size,
                     format=frame.format,
                 )
+                frame = OpenAILLMContextFrame(context)
                 await self.push_frame(frame)
                 self._describe_text = None
             else:
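With VisionImageRawFrame removed, the recommended pattern is to build the context frame directly, which is also what the deprecated aggregator now does internally; a sketch using the add_image_frame_message() signature shown above (the image bytes, size, and format values are placeholders):

    from pipecat.processors.aggregators.openai_llm_context import (
        OpenAILLMContext,
        OpenAILLMContextFrame,
    )

    image_bytes = b"\x00" * (4 * 4 * 3)  # placeholder 4x4 RGB image data

    context = OpenAILLMContext()
    context.add_image_frame_message(
        text="What is in this image?",
        image=image_bytes,
        size=(4, 4),
        format="RGB",
    )
    frame = OpenAILLMContextFrame(context)
    # Push `frame` downstream to the LLM service from your own processor.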
@@ -137,12 +137,12 @@ class AudioBufferProcessor(FrameProcessor):
         return self._num_channels
 
     def has_audio(self) -> bool:
-        """Check if both user and bot audio buffers contain data.
+        """Check if either user or bot audio buffers contain data.
 
         Returns:
-            True if both buffers contain audio data.
+            True if either buffer contains audio data.
         """
-        return self._buffer_has_audio(self._user_audio_buffer) and self._buffer_has_audio(
+        return self._buffer_has_audio(self._user_audio_buffer) or self._buffer_has_audio(
             self._bot_audio_buffer
         )
 
@@ -229,9 +229,12 @@ class AudioBufferProcessor(FrameProcessor):
             # Save time of frame so we can compute silence.
             self._last_bot_frame_at = time.time()
 
-        if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
+        if self._buffer_size > 0 and (
+            len(self._user_audio_buffer) >= self._buffer_size
+            or len(self._bot_audio_buffer) >= self._buffer_size
+        ):
             await self._call_on_audio_data_handler()
-            self._reset_recording()
+            self._reset_primary_audio_buffers()
 
         # Process turn recording with preprocessed data.
         if self._enable_turn_audio:
@@ -272,9 +275,15 @@ class AudioBufferProcessor(FrameProcessor):
 
     async def _call_on_audio_data_handler(self):
         """Call the audio data event handlers with buffered audio."""
-        if not self.has_audio() or not self._recording:
+        if not self._recording:
            return
 
+        if len(self._user_audio_buffer) == 0 and len(self._bot_audio_buffer) == 0:
+            return
+
+        self._align_track_buffers()
+        flush_time = time.time()
+
         # Call original handler with merged audio
         merged_audio = self.merge_audio_buffers()
         await self._call_event_handler(
@@ -290,23 +299,49 @@ class AudioBufferProcessor(FrameProcessor):
             self._num_channels,
         )
 
+        self._last_user_frame_at = flush_time
+        self._last_bot_frame_at = flush_time
+
     def _buffer_has_audio(self, buffer: bytearray) -> bool:
         """Check if a buffer contains audio data."""
         return buffer is not None and len(buffer) > 0
 
     def _reset_recording(self):
         """Reset recording state and buffers."""
-        self._reset_audio_buffers()
+        self._reset_all_audio_buffers()
         self._last_user_frame_at = time.time()
         self._last_bot_frame_at = time.time()
 
-    def _reset_audio_buffers(self):
+    def _reset_all_audio_buffers(self):
         """Reset all audio buffers to empty state."""
+        self._reset_primary_audio_buffers()
+        self._reset_turn_audio_buffers()
+
+    def _reset_primary_audio_buffers(self):
+        """Clear user and bot buffers while preserving turn buffers and timestamps."""
         self._user_audio_buffer = bytearray()
         self._bot_audio_buffer = bytearray()
+
+    def _reset_turn_audio_buffers(self):
+        """Clear user and bot turn buffers while preserving primary buffers and timestamps."""
         self._user_turn_audio_buffer = bytearray()
         self._bot_turn_audio_buffer = bytearray()
 
+    def _align_track_buffers(self):
+        """Pad the shorter track with silence so both tracks stay in sync."""
+        user_len = len(self._user_audio_buffer)
+        bot_len = len(self._bot_audio_buffer)
+        if user_len == bot_len:
+            return
+
+        target_len = max(user_len, bot_len)
+        if user_len < target_len:
+            self._user_audio_buffer.extend(b"\x00" * (target_len - user_len))
+            self._last_user_frame_at = max(self._last_user_frame_at, self._last_bot_frame_at)
+        if bot_len < target_len:
+            self._bot_audio_buffer.extend(b"\x00" * (target_len - bot_len))
+            self._last_bot_frame_at = max(self._last_bot_frame_at, self._last_user_frame_at)
+
     async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
         """Resample audio frame to the target sample rate."""
         return await self._input_resampler.resample(
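Before merging, the processor now pads the shorter PCM byte buffer with zero bytes (digital silence) so the user and bot tracks stay the same length, and the audio-data handler now fires when either track reaches the configured buffer size; a standalone sketch of the padding idea:

    # Illustrative only: pad the shorter of two PCM byte buffers with silence.
    user_track = bytearray(b"\x01\x02\x03\x04")  # 4 bytes of user audio
    bot_track = bytearray(b"\x05\x06")           # 2 bytes of bot audio

    target_len = max(len(user_track), len(bot_track))
    user_track.extend(b"\x00" * (target_len - len(user_track)))
    bot_track.extend(b"\x00" * (target_len - len(bot_track)))

    assert len(user_track) == len(bot_track) == target_len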