dv-pipecat-ai 0.0.82.dev69__py3-none-any.whl → 0.0.82.dev759__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (157)
  1. {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/METADATA +78 -117
  2. {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/RECORD +157 -123
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  11. pipecat/audio/filters/noisereduce_filter.py +15 -0
  12. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  13. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  14. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  15. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  16. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  17. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  18. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  19. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  20. pipecat/audio/vad/data/README.md +10 -0
  21. pipecat/audio/vad/vad_analyzer.py +13 -1
  22. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  23. pipecat/frames/frames.py +232 -88
  24. pipecat/observers/loggers/debug_log_observer.py +3 -3
  25. pipecat/observers/loggers/llm_log_observer.py +7 -3
  26. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  27. pipecat/pipeline/runner.py +12 -4
  28. pipecat/pipeline/service_switcher.py +64 -36
  29. pipecat/pipeline/task.py +85 -24
  30. pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
  31. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  32. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  33. pipecat/processors/aggregators/llm_response.py +6 -7
  34. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  35. pipecat/processors/aggregators/user_response.py +6 -6
  36. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  37. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  38. pipecat/processors/dtmf_aggregator.py +128 -87
  39. pipecat/processors/filters/stt_mute_filter.py +17 -0
  40. pipecat/processors/frame_processor.py +103 -17
  41. pipecat/processors/frameworks/langchain.py +8 -2
  42. pipecat/processors/frameworks/rtvi.py +209 -68
  43. pipecat/processors/frameworks/strands_agents.py +170 -0
  44. pipecat/processors/logger.py +2 -2
  45. pipecat/processors/transcript_processor.py +4 -4
  46. pipecat/processors/user_idle_processor.py +18 -10
  47. pipecat/runner/run.py +270 -50
  48. pipecat/runner/types.py +2 -0
  49. pipecat/runner/utils.py +51 -10
  50. pipecat/serializers/exotel.py +5 -5
  51. pipecat/serializers/livekit.py +20 -0
  52. pipecat/serializers/plivo.py +6 -9
  53. pipecat/serializers/protobuf.py +6 -5
  54. pipecat/serializers/telnyx.py +2 -2
  55. pipecat/serializers/twilio.py +43 -23
  56. pipecat/services/ai_service.py +2 -6
  57. pipecat/services/anthropic/llm.py +2 -25
  58. pipecat/services/asyncai/tts.py +2 -3
  59. pipecat/services/aws/__init__.py +1 -0
  60. pipecat/services/aws/llm.py +122 -97
  61. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  62. pipecat/services/aws/nova_sonic/context.py +367 -0
  63. pipecat/services/aws/nova_sonic/frames.py +25 -0
  64. pipecat/services/aws/nova_sonic/llm.py +1155 -0
  65. pipecat/services/aws/stt.py +1 -3
  66. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  67. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  68. pipecat/services/aws_nova_sonic/context.py +13 -355
  69. pipecat/services/aws_nova_sonic/frames.py +13 -17
  70. pipecat/services/azure/realtime/__init__.py +0 -0
  71. pipecat/services/azure/realtime/llm.py +65 -0
  72. pipecat/services/azure/stt.py +15 -0
  73. pipecat/services/cartesia/tts.py +2 -2
  74. pipecat/services/deepgram/__init__.py +1 -0
  75. pipecat/services/deepgram/flux/__init__.py +0 -0
  76. pipecat/services/deepgram/flux/stt.py +636 -0
  77. pipecat/services/elevenlabs/__init__.py +2 -1
  78. pipecat/services/elevenlabs/stt.py +254 -276
  79. pipecat/services/elevenlabs/tts.py +5 -5
  80. pipecat/services/fish/tts.py +2 -2
  81. pipecat/services/gemini_multimodal_live/events.py +38 -524
  82. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  83. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  84. pipecat/services/gladia/stt.py +56 -72
  85. pipecat/services/google/__init__.py +1 -0
  86. pipecat/services/google/gemini_live/__init__.py +3 -0
  87. pipecat/services/google/gemini_live/file_api.py +189 -0
  88. pipecat/services/google/gemini_live/llm.py +1582 -0
  89. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  90. pipecat/services/google/llm.py +15 -11
  91. pipecat/services/google/llm_openai.py +3 -3
  92. pipecat/services/google/llm_vertex.py +86 -16
  93. pipecat/services/google/tts.py +7 -3
  94. pipecat/services/heygen/api.py +2 -0
  95. pipecat/services/heygen/client.py +8 -4
  96. pipecat/services/heygen/video.py +2 -0
  97. pipecat/services/hume/__init__.py +5 -0
  98. pipecat/services/hume/tts.py +220 -0
  99. pipecat/services/inworld/tts.py +6 -6
  100. pipecat/services/llm_service.py +15 -5
  101. pipecat/services/lmnt/tts.py +2 -2
  102. pipecat/services/mcp_service.py +4 -2
  103. pipecat/services/mem0/memory.py +6 -5
  104. pipecat/services/mistral/llm.py +29 -8
  105. pipecat/services/moondream/vision.py +42 -16
  106. pipecat/services/neuphonic/tts.py +2 -2
  107. pipecat/services/openai/__init__.py +1 -0
  108. pipecat/services/openai/base_llm.py +27 -20
  109. pipecat/services/openai/realtime/__init__.py +0 -0
  110. pipecat/services/openai/realtime/context.py +272 -0
  111. pipecat/services/openai/realtime/events.py +1106 -0
  112. pipecat/services/openai/realtime/frames.py +37 -0
  113. pipecat/services/openai/realtime/llm.py +829 -0
  114. pipecat/services/openai/tts.py +16 -8
  115. pipecat/services/openai_realtime/__init__.py +27 -0
  116. pipecat/services/openai_realtime/azure.py +21 -0
  117. pipecat/services/openai_realtime/context.py +21 -0
  118. pipecat/services/openai_realtime/events.py +21 -0
  119. pipecat/services/openai_realtime/frames.py +21 -0
  120. pipecat/services/openai_realtime_beta/azure.py +16 -0
  121. pipecat/services/openai_realtime_beta/openai.py +17 -5
  122. pipecat/services/playht/tts.py +31 -4
  123. pipecat/services/rime/tts.py +3 -4
  124. pipecat/services/sarvam/tts.py +2 -6
  125. pipecat/services/simli/video.py +2 -2
  126. pipecat/services/speechmatics/stt.py +1 -7
  127. pipecat/services/stt_service.py +34 -0
  128. pipecat/services/tavus/video.py +2 -2
  129. pipecat/services/tts_service.py +9 -9
  130. pipecat/services/vision_service.py +7 -6
  131. pipecat/tests/utils.py +4 -4
  132. pipecat/transcriptions/language.py +41 -1
  133. pipecat/transports/base_input.py +17 -42
  134. pipecat/transports/base_output.py +42 -26
  135. pipecat/transports/daily/transport.py +199 -26
  136. pipecat/transports/heygen/__init__.py +0 -0
  137. pipecat/transports/heygen/transport.py +381 -0
  138. pipecat/transports/livekit/transport.py +228 -63
  139. pipecat/transports/local/audio.py +6 -1
  140. pipecat/transports/local/tk.py +11 -2
  141. pipecat/transports/network/fastapi_websocket.py +1 -1
  142. pipecat/transports/smallwebrtc/connection.py +98 -19
  143. pipecat/transports/smallwebrtc/request_handler.py +204 -0
  144. pipecat/transports/smallwebrtc/transport.py +65 -23
  145. pipecat/transports/tavus/transport.py +23 -12
  146. pipecat/transports/websocket/client.py +41 -5
  147. pipecat/transports/websocket/fastapi.py +21 -11
  148. pipecat/transports/websocket/server.py +14 -7
  149. pipecat/transports/whatsapp/api.py +8 -0
  150. pipecat/transports/whatsapp/client.py +47 -0
  151. pipecat/utils/base_object.py +54 -22
  152. pipecat/utils/string.py +12 -1
  153. pipecat/utils/tracing/service_decorators.py +21 -21
  154. {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/WHEEL +0 -0
  155. {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/licenses/LICENSE +0 -0
  156. {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/top_level.txt +0 -0
  157. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
--- a/pipecat/processors/aggregators/vision_image_frame.py
+++ b/pipecat/processors/aggregators/vision_image_frame.py
@@ -10,13 +10,22 @@ This module provides frame aggregation functionality to combine text and image
 frames into vision frames for multimodal processing.
 """
 
-from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame, VisionImageRawFrame
+from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
 
 class VisionImageFrameAggregator(FrameProcessor):
     """Aggregates consecutive text and image frames into vision frames.
 
+    .. deprecated:: 0.0.85
+        VisionImageRawFrame has been removed in favor of context frames
+        (LLMContextFrame or OpenAILLMContextFrame), so this aggregator is not
+        needed anymore. See the 12* examples for the new recommended pattern.
+
     This aggregator waits for a consecutive TextFrame and an InputImageRawFrame.
     After the InputImageRawFrame arrives it will output a VisionImageRawFrame
     combining both the text and image data for multimodal processing.
@@ -28,6 +37,17 @@ class VisionImageFrameAggregator(FrameProcessor):
         The aggregator starts with no cached text, waiting for the first
         TextFrame to arrive before it can create vision frames.
         """
+        import warnings
+
+        warnings.warn(
+            "VisionImageFrameAggregator is deprecated. "
+            "VisionImageRawFrame has been removed in favor of context frames "
+            "(LLMContextFrame or OpenAILLMContextFrame), so this aggregator is "
+            "not needed anymore. See the 12* examples for the new recommended "
+            "pattern.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__()
         self._describe_text = None
 
@@ -47,12 +67,14 @@ class VisionImageFrameAggregator(FrameProcessor):
             self._describe_text = frame.text
         elif isinstance(frame, InputImageRawFrame):
             if self._describe_text:
-                frame = VisionImageRawFrame(
+                context = OpenAILLMContext()
+                context.add_image_frame_message(
                     text=self._describe_text,
                     image=frame.image,
                     size=frame.size,
                     format=frame.format,
                 )
+                frame = OpenAILLMContextFrame(context)
                 await self.push_frame(frame)
                 self._describe_text = None
         else:
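
For reference, here is a minimal sketch of the replacement pattern the deprecation note points to: instead of emitting a VisionImageRawFrame, the prompt text and image are wrapped in an OpenAILLMContext and pushed as an OpenAILLMContextFrame. The calls match the hunk above; the image bytes, size, and format values are illustrative placeholders standing in for the fields of an InputImageRawFrame.

    from pipecat.processors.aggregators.openai_llm_context import (
        OpenAILLMContext,
        OpenAILLMContextFrame,
    )

    # Build a context carrying both the prompt text and the image data.
    context = OpenAILLMContext()
    context.add_image_frame_message(
        text="What is shown in this image?",  # prompt previously sent as a TextFrame
        image=b"\x00" * (64 * 64 * 3),        # placeholder raw image bytes (frame.image)
        size=(64, 64),                        # placeholder image size (frame.size)
        format="RGB",                         # placeholder pixel format (frame.format)
    )
    # Push this downstream instead of a VisionImageRawFrame.
    frame = OpenAILLMContextFrame(context)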
--- a/pipecat/processors/audio/audio_buffer_processor.py
+++ b/pipecat/processors/audio/audio_buffer_processor.py
@@ -137,12 +137,12 @@ class AudioBufferProcessor(FrameProcessor):
         return self._num_channels
 
     def has_audio(self) -> bool:
-        """Check if both user and bot audio buffers contain data.
+        """Check if either user or bot audio buffers contain data.
 
         Returns:
-            True if both buffers contain audio data.
+            True if either buffer contains audio data.
         """
-        return self._buffer_has_audio(self._user_audio_buffer) and self._buffer_has_audio(
+        return self._buffer_has_audio(self._user_audio_buffer) or self._buffer_has_audio(
             self._bot_audio_buffer
         )
 
@@ -229,9 +229,12 @@ class AudioBufferProcessor(FrameProcessor):
         # Save time of frame so we can compute silence.
         self._last_bot_frame_at = time.time()
 
-        if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
+        if self._buffer_size > 0 and (
+            len(self._user_audio_buffer) >= self._buffer_size
+            or len(self._bot_audio_buffer) >= self._buffer_size
+        ):
             await self._call_on_audio_data_handler()
-            self._reset_recording()
+            self._reset_primary_audio_buffers()
 
         # Process turn recording with preprocessed data.
         if self._enable_turn_audio:
@@ -272,9 +275,15 @@ class AudioBufferProcessor(FrameProcessor):
 
     async def _call_on_audio_data_handler(self):
         """Call the audio data event handlers with buffered audio."""
-        if not self.has_audio() or not self._recording:
+        if not self._recording:
             return
 
+        if len(self._user_audio_buffer) == 0 and len(self._bot_audio_buffer) == 0:
+            return
+
+        self._align_track_buffers()
+        flush_time = time.time()
+
         # Call original handler with merged audio
         merged_audio = self.merge_audio_buffers()
         await self._call_event_handler(
@@ -290,23 +299,49 @@ class AudioBufferProcessor(FrameProcessor):
             self._num_channels,
         )
 
+        self._last_user_frame_at = flush_time
+        self._last_bot_frame_at = flush_time
+
     def _buffer_has_audio(self, buffer: bytearray) -> bool:
         """Check if a buffer contains audio data."""
         return buffer is not None and len(buffer) > 0
 
     def _reset_recording(self):
         """Reset recording state and buffers."""
-        self._reset_audio_buffers()
+        self._reset_all_audio_buffers()
         self._last_user_frame_at = time.time()
         self._last_bot_frame_at = time.time()
 
-    def _reset_audio_buffers(self):
+    def _reset_all_audio_buffers(self):
         """Reset all audio buffers to empty state."""
+        self._reset_primary_audio_buffers()
+        self._reset_turn_audio_buffers()
+
+    def _reset_primary_audio_buffers(self):
+        """Clear user and bot buffers while preserving turn buffers and timestamps."""
         self._user_audio_buffer = bytearray()
         self._bot_audio_buffer = bytearray()
+
+    def _reset_turn_audio_buffers(self):
+        """Clear user and bot turn buffers while preserving primary buffers and timestamps."""
         self._user_turn_audio_buffer = bytearray()
         self._bot_turn_audio_buffer = bytearray()
 
+    def _align_track_buffers(self):
+        """Pad the shorter track with silence so both tracks stay in sync."""
+        user_len = len(self._user_audio_buffer)
+        bot_len = len(self._bot_audio_buffer)
+        if user_len == bot_len:
+            return
+
+        target_len = max(user_len, bot_len)
+        if user_len < target_len:
+            self._user_audio_buffer.extend(b"\x00" * (target_len - user_len))
+            self._last_user_frame_at = max(self._last_user_frame_at, self._last_bot_frame_at)
+        if bot_len < target_len:
+            self._bot_audio_buffer.extend(b"\x00" * (target_len - bot_len))
+            self._last_bot_frame_at = max(self._last_bot_frame_at, self._last_user_frame_at)
+
     async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
         """Resample audio frame to the target sample rate."""
         return await self._input_resampler.resample(
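
The new _align_track_buffers helper keeps the two mono tracks merge-safe by padding the shorter one with zero bytes, which is digital silence for linear PCM. A standalone sketch of the same idea, using illustrative buffer contents:

    # Two unequal PCM tracks: pad the shorter with zero bytes so both
    # have the same length before they are merged/interleaved.
    user_track = bytearray(b"\x01\x02" * 100)  # 100 16-bit samples of user audio
    bot_track = bytearray(b"\x03\x04" * 60)    # 60 16-bit samples of bot audio

    target_len = max(len(user_track), len(bot_track))
    user_track.extend(b"\x00" * (target_len - len(user_track)))
    bot_track.extend(b"\x00" * (target_len - len(bot_track)))

    assert len(user_track) == len(bot_track) == target_len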
--- a/pipecat/processors/dtmf_aggregator.py
+++ b/pipecat/processors/dtmf_aggregator.py
@@ -4,15 +4,13 @@ from pipecat.frames.frames import (
     BotSpeakingFrame,
     CancelFrame,
     DTMFUpdateSettingsFrame,
+    EndDTMFCaptureFrame,
     EndFrame,
     Frame,
     InputDTMFFrame,
+    StartDTMFCaptureFrame,
     StartInterruptionFrame,
-    StartUserIdleProcessorFrame,
-    StopUserIdleProcessorFrame,
     TranscriptionFrame,
-    UserStartedSpeakingFrame,
-    UserStoppedSpeakingFrame,
     WaitForDTMFFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -44,95 +42,78 @@ class DTMFAggregator(FrameProcessor):
         self._idle_timeout = timeout
         self._digits = digits
         self._digit_event = asyncio.Event()
-        self._digit_aggregate_task = None
+        self._aggregation_task = None
         self._end_on = end_on if end_on else set()
         self._reset_on = reset_on if reset_on else set()
-        self._stopped_idle_processor = False
-
-    async def _start_idle_processor(self):
-        await self.push_frame(StartUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = False
-
-    async def _stop_idle_processor(self):
-        await self.push_frame(StopUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = True
+        self._dtmf_capture_active = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
         # Handle DTMF frames.
         await super().process_frame(frame, direction)
-        await self.push_frame(frame, direction)
-        if isinstance(frame, InputDTMFFrame):
-            # Start the digit aggregation task if it's not running yet.
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(self._digit_agg_handler(direction))
-
-            # Append the incoming digit.
-            if frame.button.value in self._reset_on:
-                self._aggregation = ""
-            elif frame.button.value in self._end_on:
-                await self.flush_aggregation(direction)
-                self._aggregation = ""
-            else:
-                self._digit_event.set()
-                self._aggregation += frame.button.value
-
-                # Flush if the aggregated digits reach the specified length.
-                if self._digits and len(self._aggregation) == self._digits:
-                    await self.flush_aggregation(direction)
-                    self._aggregation = ""
-                    if self._stopped_idle_processor:
-                        await self._start_idle_processor()
 
+        if isinstance(frame, InputDTMFFrame):
+            # Push the DTMF frame downstream first
+            await self.push_frame(frame, direction)
+            # Then handle it for proper frame ordering
+            await self._handle_dtmf_frame(frame)
         elif isinstance(frame, (EndFrame, CancelFrame)):
             # For EndFrame, flush any pending aggregation and stop the digit aggregation task.
             if self._aggregation:
-                await self.flush_aggregation(direction)
-            if self._digit_aggregate_task:
-                await self._stop_digit_aggregate_task()
+                await self.flush_aggregation()
+            if self._aggregation_task:
+                await self._stop_aggregation_task()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, WaitForDTMFFrame):
             self.logger.debug("Received WaitForDTMFFrame: Waiting for DTMF input")
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(
-                    self._digit_agg_handler(direction, raise_timeout=True)
-                )
-            self._digit_event.set()
-            await self._stop_idle_processor()
+            self._create_aggregation_task(raise_timeout=True)
+            self._digit_event.set()  # Trigger the timeout handler
+            await self._start_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame: Starting idle processor")
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            self.logger.debug("Received StartInterruptionFrame")
             if self._aggregation:
-                await self.flush_aggregation(direction)
+                await self.flush_aggregation()
+            await self._end_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, BotSpeakingFrame):
-            if self._digit_aggregate_task is not None:
+            # Signal the aggregation task to continue when bot speaks
+            if self._aggregation_task is not None:
                 self._digit_event.set()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, DTMFUpdateSettingsFrame):
             await self._update_settings(frame.settings)
             # Don't pass the settings frame downstream
+        else:
+            # Pass all other frames through
+            await self.push_frame(frame, direction)
 
     async def _update_settings(self, settings: dict) -> None:
         """Update DTMF aggregator settings dynamically.
-
+
         Args:
             settings: Dictionary containing new DTMF settings
                 Supported keys: timeout, digits, end, reset
         """
         settings_changed = False
-
+
         if "timeout" in settings and settings["timeout"] is not None:
             new_timeout = float(settings["timeout"])
             if new_timeout != self._idle_timeout:
-                self.logger.debug(f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}")
+                self.logger.debug(
+                    f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}"
+                )
                 self._idle_timeout = new_timeout
                 settings_changed = True
-
+
         if "digits" in settings:
             new_digits = settings["digits"]
             if new_digits != self._digits:
-                self.logger.debug(f"Updating DTMF digits from {self._digits} to {new_digits}")
+                self.logger.debug(
+                    f"Updating DTMF digits from {self._digits} to {new_digits}"
+                )
                 self._digits = new_digits
                 settings_changed = True
-
+
         if "end" in settings:
             # Convert single string to set if needed
             end_value = settings["end"]
@@ -142,12 +123,14 @@ class DTMFAggregator(FrameProcessor):
             new_end_on = {end_value} if end_value else set()
         else:
             new_end_on = set(end_value)
-
+
         if new_end_on != self._end_on:
-            self.logger.debug(f"Updating DTMF end_on from {self._end_on} to {new_end_on}")
+            self.logger.debug(
+                f"Updating DTMF end_on from {self._end_on} to {new_end_on}"
+            )
             self._end_on = new_end_on
             settings_changed = True
-
+
         if "reset" in settings:
             # Convert single string to set if needed
             reset_value = settings["reset"]
@@ -157,58 +140,116 @@ class DTMFAggregator(FrameProcessor):
             new_reset_on = {reset_value} if reset_value else set()
         else:
             new_reset_on = set(reset_value)
-
+
         if new_reset_on != self._reset_on:
-            self.logger.debug(f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}")
+            self.logger.debug(
+                f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}"
+            )
            self._reset_on = new_reset_on
            settings_changed = True
-
+
         if settings_changed:
             self.logger.info(f"DTMF settings updated successfully")
 
-    async def _digit_agg_handler(self, direction: FrameDirection, raise_timeout=False):
-        """Idle task that waits for new DTMF activity. If no new digit is received within
-        the timeout period, the current aggregation is flushed.
-        """
+    async def _handle_dtmf_frame(self, frame: InputDTMFFrame):
+        """Handle DTMF input frame processing."""
+        # Create aggregation task if needed
+        if self._aggregation_task is None:
+            self._create_aggregation_task()
+
+        digit_value = frame.button.value
+
+        # Handle reset digits
+        if digit_value in self._reset_on:
+            self._aggregation = ""
+            return
+
+        # Handle end digits
+        if digit_value in self._end_on:
+            if self._aggregation:  # Only flush if we have aggregation
+                await self.flush_aggregation()
+            return
+
+        # Add digit to aggregation
+        self._aggregation += digit_value
+
+        # Signal the aggregation task that a digit was received
+        self._digit_event.set()
+
+        # Check if we reached the digit limit
+        if self._digits and len(self._aggregation) == self._digits:
+            await self.flush_aggregation()
+
+    def _create_aggregation_task(self, raise_timeout: bool = False) -> None:
+        """Creates the aggregation task if it hasn't been created yet."""
+        if not self._aggregation_task:
+            self._aggregation_task = self.create_task(
+                self._aggregation_task_handler(raise_timeout)
+            )
+
+    async def _stop_aggregation_task(self) -> None:
+        """Stops the aggregation task."""
+        if self._aggregation_task:
+            await self.cancel_task(self._aggregation_task)
+            self._aggregation_task = None
+
+    async def _aggregation_task_handler(self, raise_timeout=False):
+        """Background task that handles timeout-based flushing."""
         while True:
             try:
                 # Wait for a new digit signal with a timeout.
-                await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
-            except asyncio.TimeoutError:
-                # No new digit arrived within the timeout period; flush aggregation if non-empty.
-                await self.flush_aggregation(direction, raise_timeout)
-            finally:
-                # Clear the event for the next cycle.
+                await asyncio.wait_for(
+                    self._digit_event.wait(), timeout=self._idle_timeout
+                )
                 self._digit_event.clear()
+            except asyncio.TimeoutError:
+                # No new digit arrived within the timeout period; flush if needed
+                await self.flush_aggregation(raise_timeout=raise_timeout)
 
-    async def flush_aggregation(self, direction: FrameDirection, raise_timeout=False):
+    async def flush_aggregation(self, *, raise_timeout: bool = False):
         """Flush the aggregated digits by emitting a TranscriptionFrame downstream."""
         if self._aggregation:
-            # Todo: Change to different frame type if we decide to handle it in llm processor separately.
+            # Create transcription frame
             aggregated_frame = TranscriptionFrame(
                 f"User inputted: {self._aggregation}.", "", time_now_iso8601()
             )
             aggregated_frame.metadata["push_aggregation"] = True
-            await self.push_frame(StartInterruptionFrame())
-            await self.push_frame(aggregated_frame, direction)
+
+            # Send interruption frame (as per original design)
+            await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+
+            # Push the transcription frame
+            await self.push_frame(aggregated_frame, FrameDirection.DOWNSTREAM)
+
+            # Reset state
             self._aggregation = ""
-        elif raise_timeout and self._stopped_idle_processor:
+            await self._end_dtmf_capture()
+
+        elif raise_timeout and not self._aggregation:
+            # Timeout with no aggregation (WaitForDTMFFrame case)
             transcript_frame = TranscriptionFrame(
                 "User didn't press any digits on the keyboard.", "", time_now_iso8601()
             )
             transcript_frame.metadata["push_aggregation"] = True
-            await self.push_frame(transcript_frame)
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            await self.push_frame(transcript_frame, FrameDirection.DOWNSTREAM)
+            await self._end_dtmf_capture()
+
+    async def _start_dtmf_capture(self):
+        """Signal the start of DTMF capture upstream."""
+        if self._dtmf_capture_active:
+            return
+        await self.push_frame(StartDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = True
 
-    async def _stop_digit_aggregate_task(self):
-        """Cancels the digit aggregation task if it exists."""
-        if self._digit_aggregate_task:
-            await self.cancel_task(self._digit_aggregate_task)
-            self._digit_aggregate_task = None
+    async def _end_dtmf_capture(self):
+        """Signal the end of DTMF capture upstream."""
+        if not self._dtmf_capture_active:
+            return
+        await self.push_frame(EndDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = False
 
     async def cleanup(self) -> None:
         """Cleans up resources, ensuring that the digit aggregation task is cancelled."""
         await super().cleanup()
-        if self._digit_aggregate_task:
-            await self._stop_digit_aggregate_task()
+        if self._aggregation_task:
+            await self._stop_aggregation_task()
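
Despite the rewrite, the aggregator's external behavior stays the same: digits accumulate until a timeout, a digit-count limit, or an end digit flushes them as a TranscriptionFrame. A hypothetical configuration sketch based on the attributes visible in the hunks above (timeout, digits, end_on, reset_on); the exact constructor signature is not shown in this diff:

    from pipecat.processors.dtmf_aggregator import DTMFAggregator

    dtmf = DTMFAggregator(
        timeout=3.0,     # flush after 3 s with no new digit (self._idle_timeout)
        digits=4,        # flush as soon as 4 digits are aggregated
        end_on={"#"},    # "#" flushes the current aggregation immediately
        reset_on={"*"},  # "*" clears the current aggregation
    )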
--- a/pipecat/processors/filters/stt_mute_filter.py
+++ b/pipecat/processors/filters/stt_mute_filter.py
@@ -25,14 +25,17 @@ from pipecat.frames.frames import (
     FunctionCallResultFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     StartFrame,
     StartInterruptionFrame,
+    StartDTMFCaptureFrame,
     STTMuteFrame,
     TranscriptionFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
     VADUserStartedSpeakingFrame,
     VADUserStoppedSpeakingFrame,
+    EndDTMFCaptureFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
@@ -58,6 +61,7 @@ class STTMuteStrategy(Enum):
     FUNCTION_CALL = "function_call"
     ALWAYS = "always"
     CUSTOM = "custom"
+    DTMF_CAPTURE = "dtmf_capture"
 
 
 @dataclass
@@ -120,6 +124,7 @@ class STTMuteFilter(FrameProcessor):
         self._function_call_in_progress = False
         self._is_muted = False  # Initialize as unmuted, will set state on StartFrame if needed
         self._voicemail_detection_enabled = False  # Default to False
+        self._dtmf_capture_active = False
 
     @property
     def is_muted(self) -> bool:
@@ -165,6 +170,10 @@ class STTMuteFilter(FrameProcessor):
                 if should_mute:
                     return True
 
+            case STTMuteStrategy.DTMF_CAPTURE:
+                if self._dtmf_capture_active:
+                    return True
+
         return False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -205,12 +214,20 @@ class STTMuteFilter(FrameProcessor):
             self._first_speech_handled = True
             should_mute = await self._should_mute()
             self.logger.debug(f"BotStoppedSpeaking: should mute={should_mute}")
+        elif isinstance(frame, StartDTMFCaptureFrame):
+            self._dtmf_capture_active = True
+            should_mute = await self._should_mute()
+        elif isinstance(frame, EndDTMFCaptureFrame):
+            self._dtmf_capture_active = False
+            should_mute = await self._should_mute()
         elif isinstance(frame, STTMuteFrame):
+            # TODO: Frame duplication is happening here: we receive this frame from downstream and push it downstream again, and we also push it upstream and then push it upstream again in _handle_mute_state.
             should_mute = frame.mute
 
         # Then push the original frame
         # Conditionally include InputAudioRawFrame in suppression tuple based on voicemail_detection_enabled
         suppression_types = (
+            InterruptionFrame,
             StartInterruptionFrame,
             VADUserStartedSpeakingFrame,
             VADUserStoppedSpeakingFrame,
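
For context, a sketch of opting into the new strategy, assuming the STTMuteConfig dataclass that follows the enum in this file accepts a strategies set like the other STTMuteStrategy values; with DTMF_CAPTURE enabled, STT stays muted between StartDTMFCaptureFrame and EndDTMFCaptureFrame:

    from pipecat.processors.filters.stt_mute_filter import (
        STTMuteConfig,
        STTMuteFilter,
        STTMuteStrategy,
    )

    # Mute STT while DTMF capture is active (and during function calls).
    stt_mute = STTMuteFilter(
        config=STTMuteConfig(
            strategies={
                STTMuteStrategy.DTMF_CAPTURE,
                STTMuteStrategy.FUNCTION_CALL,
            }
        )
    )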