dv-pipecat-ai 0.0.82.dev69__py3-none-any.whl → 0.0.82.dev759__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/METADATA +78 -117
- {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/RECORD +157 -123
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +5 -0
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +232 -88
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +12 -4
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +85 -24
- pipecat/processors/aggregators/dtmf_aggregator.py +28 -22
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_response.py +6 -7
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +128 -87
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +103 -17
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +209 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +4 -4
- pipecat/processors/user_idle_processor.py +18 -10
- pipecat/runner/run.py +270 -50
- pipecat/runner/types.py +2 -0
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +6 -9
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/asyncai/tts.py +2 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +122 -97
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +367 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1155 -0
- pipecat/services/aws/stt.py +1 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +13 -355
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/tts.py +2 -2
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +636 -0
- pipecat/services/elevenlabs/__init__.py +2 -1
- pipecat/services/elevenlabs/stt.py +254 -276
- pipecat/services/elevenlabs/tts.py +5 -5
- pipecat/services/fish/tts.py +2 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +2 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +2 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +16 -8
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/playht/tts.py +31 -4
- pipecat/services/rime/tts.py +3 -4
- pipecat/services/sarvam/tts.py +2 -6
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +1 -7
- pipecat/services/stt_service.py +34 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +9 -9
- pipecat/services/vision_service.py +7 -6
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +17 -42
- pipecat/transports/base_output.py +42 -26
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +98 -19
- pipecat/transports/smallwebrtc/request_handler.py +204 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/string.py +12 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev69.dist-info → dv_pipecat_ai-0.0.82.dev759.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/processors/aggregators/vision_image_frame.py:

```diff
@@ -10,13 +10,22 @@ This module provides frame aggregation functionality to combine text and image
 frames into vision frames for multimodal processing.
 """
 
-from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame, VisionImageRawFrame
+from pipecat.frames.frames import Frame, InputImageRawFrame, TextFrame
+from pipecat.processors.aggregators.openai_llm_context import (
+    OpenAILLMContext,
+    OpenAILLMContextFrame,
+)
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
 
 class VisionImageFrameAggregator(FrameProcessor):
     """Aggregates consecutive text and image frames into vision frames.
 
+    .. deprecated:: 0.0.85
+        VisionImageRawFrame has been removed in favor of context frames
+        (LLMContextFrame or OpenAILLMContextFrame), so this aggregator is not
+        needed anymore. See the 12* examples for the new recommended pattern.
+
     This aggregator waits for a consecutive TextFrame and an InputImageRawFrame.
     After the InputImageRawFrame arrives it will output a VisionImageRawFrame
     combining both the text and image data for multimodal processing.
@@ -28,6 +37,17 @@ class VisionImageFrameAggregator(FrameProcessor):
         The aggregator starts with no cached text, waiting for the first
         TextFrame to arrive before it can create vision frames.
         """
+        import warnings
+
+        warnings.warn(
+            "VisionImageFrameAggregator is deprecated. "
+            "VisionImageRawFrame has been removed in favor of context frames "
+            "(LLMContextFrame or OpenAILLMContextFrame), so this aggregator is "
+            "not needed anymore. See the 12* examples for the new recommended "
+            "pattern.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
         super().__init__()
         self._describe_text = None
 
@@ -47,12 +67,14 @@ class VisionImageFrameAggregator(FrameProcessor):
             self._describe_text = frame.text
         elif isinstance(frame, InputImageRawFrame):
             if self._describe_text:
-                frame = VisionImageRawFrame(
+                context = OpenAILLMContext()
+                context.add_image_frame_message(
                     text=self._describe_text,
                     image=frame.image,
                     size=frame.size,
                     format=frame.format,
                 )
+                frame = OpenAILLMContextFrame(context)
                 await self.push_frame(frame)
                 self._describe_text = None
             else:
```
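For readers migrating off the deprecated aggregator, the hunk above implies building the context frame directly. A minimal sketch of that pattern, using only the calls visible in the diff; the helper name `make_vision_context_frame` is hypothetical, not part of the package:

```python
from pipecat.frames.frames import InputImageRawFrame
from pipecat.processors.aggregators.openai_llm_context import (
    OpenAILLMContext,
    OpenAILLMContextFrame,
)


def make_vision_context_frame(
    text: str, frame: InputImageRawFrame
) -> OpenAILLMContextFrame:
    """Pack a text prompt and a raw image into a single context frame."""
    context = OpenAILLMContext()
    context.add_image_frame_message(
        text=text,
        image=frame.image,
        size=frame.size,
        format=frame.format,
    )
    return OpenAILLMContextFrame(context)
```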
pipecat/processors/audio/audio_buffer_processor.py:

```diff
@@ -137,12 +137,12 @@ class AudioBufferProcessor(FrameProcessor):
         return self._num_channels
 
     def has_audio(self) -> bool:
-        """Check if both user and bot audio buffers contain data.
+        """Check if either user or bot audio buffers contain data.
 
         Returns:
-            True if both buffers contain audio data.
+            True if either buffer contains audio data.
         """
-        return self._buffer_has_audio(self._user_audio_buffer) and self._buffer_has_audio(
+        return self._buffer_has_audio(self._user_audio_buffer) or self._buffer_has_audio(
             self._bot_audio_buffer
         )
 
@@ -229,9 +229,12 @@ class AudioBufferProcessor(FrameProcessor):
         # Save time of frame so we can compute silence.
         self._last_bot_frame_at = time.time()
 
-        if self._buffer_size > 0 and len(self._user_audio_buffer) > self._buffer_size:
+        if self._buffer_size > 0 and (
+            len(self._user_audio_buffer) >= self._buffer_size
+            or len(self._bot_audio_buffer) >= self._buffer_size
+        ):
             await self._call_on_audio_data_handler()
-            self._reset_audio_buffers()
+            self._reset_primary_audio_buffers()
 
         # Process turn recording with preprocessed data.
         if self._enable_turn_audio:
@@ -272,9 +275,15 @@ class AudioBufferProcessor(FrameProcessor):
 
     async def _call_on_audio_data_handler(self):
         """Call the audio data event handlers with buffered audio."""
-        if not self.has_audio() or not self._recording:
+        if not self._recording:
             return
 
+        if len(self._user_audio_buffer) == 0 and len(self._bot_audio_buffer) == 0:
+            return
+
+        self._align_track_buffers()
+        flush_time = time.time()
+
         # Call original handler with merged audio
         merged_audio = self.merge_audio_buffers()
         await self._call_event_handler(
@@ -290,23 +299,49 @@ class AudioBufferProcessor(FrameProcessor):
             self._num_channels,
         )
 
+        self._last_user_frame_at = flush_time
+        self._last_bot_frame_at = flush_time
+
     def _buffer_has_audio(self, buffer: bytearray) -> bool:
         """Check if a buffer contains audio data."""
         return buffer is not None and len(buffer) > 0
 
     def _reset_recording(self):
         """Reset recording state and buffers."""
-        self._reset_audio_buffers()
+        self._reset_all_audio_buffers()
         self._last_user_frame_at = time.time()
         self._last_bot_frame_at = time.time()
 
-    def _reset_audio_buffers(self):
+    def _reset_all_audio_buffers(self):
         """Reset all audio buffers to empty state."""
+        self._reset_primary_audio_buffers()
+        self._reset_turn_audio_buffers()
+
+    def _reset_primary_audio_buffers(self):
+        """Clear user and bot buffers while preserving turn buffers and timestamps."""
         self._user_audio_buffer = bytearray()
         self._bot_audio_buffer = bytearray()
+
+    def _reset_turn_audio_buffers(self):
+        """Clear user and bot turn buffers while preserving primary buffers and timestamps."""
         self._user_turn_audio_buffer = bytearray()
         self._bot_turn_audio_buffer = bytearray()
 
+    def _align_track_buffers(self):
+        """Pad the shorter track with silence so both tracks stay in sync."""
+        user_len = len(self._user_audio_buffer)
+        bot_len = len(self._bot_audio_buffer)
+        if user_len == bot_len:
+            return
+
+        target_len = max(user_len, bot_len)
+        if user_len < target_len:
+            self._user_audio_buffer.extend(b"\x00" * (target_len - user_len))
+            self._last_user_frame_at = max(self._last_user_frame_at, self._last_bot_frame_at)
+        if bot_len < target_len:
+            self._bot_audio_buffer.extend(b"\x00" * (target_len - bot_len))
+            self._last_bot_frame_at = max(self._last_bot_frame_at, self._last_user_frame_at)
+
     async def _resample_input_audio(self, frame: InputAudioRawFrame) -> bytes:
         """Resample audio frame to the target sample rate."""
         return await self._input_resampler.resample(
```
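The most subtle change here is `_align_track_buffers`: because a flush can now fire when either track reaches the buffer size, the shorter track is padded with zero bytes (silence for signed PCM) before merging, so both tracks cover the same wall-clock span. A standalone sketch of that rule; `align_tracks` is an illustrative free function, not package API:

```python
def align_tracks(user: bytearray, bot: bytearray) -> None:
    """Pad the shorter buffer in place so both have equal length."""
    target = max(len(user), len(bot))
    user.extend(b"\x00" * (target - len(user)))  # zero bytes decode as silence in PCM
    bot.extend(b"\x00" * (target - len(bot)))


user = bytearray(b"\x01\x02\x03\x04")  # two 16-bit samples
bot = bytearray(b"\x05\x06")           # one 16-bit sample
align_tracks(user, bot)
assert len(user) == len(bot) == 4
```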
pipecat/processors/dtmf_aggregator.py:

```diff
@@ -4,15 +4,13 @@ from pipecat.frames.frames import (
     BotSpeakingFrame,
     CancelFrame,
     DTMFUpdateSettingsFrame,
+    EndDTMFCaptureFrame,
     EndFrame,
     Frame,
     InputDTMFFrame,
+    StartDTMFCaptureFrame,
     StartInterruptionFrame,
-    StartUserIdleProcessorFrame,
-    StopUserIdleProcessorFrame,
     TranscriptionFrame,
-    UserStartedSpeakingFrame,
-    UserStoppedSpeakingFrame,
     WaitForDTMFFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -44,95 +42,78 @@ class DTMFAggregator(FrameProcessor):
         self._idle_timeout = timeout
         self._digits = digits
         self._digit_event = asyncio.Event()
-        self._digit_aggregate_task = None
+        self._aggregation_task = None
         self._end_on = end_on if end_on else set()
         self._reset_on = reset_on if reset_on else set()
-        self._stopped_idle_processor = False
-
-    async def _start_idle_processor(self):
-        await self.push_frame(StartUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = False
-
-    async def _stop_idle_processor(self):
-        await self.push_frame(StopUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = True
+        self._dtmf_capture_active = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
         # Handle DTMF frames.
         await super().process_frame(frame, direction)
-        await self.push_frame(frame, direction)
-        if isinstance(frame, InputDTMFFrame):
-            # Start the digit aggregation task if it's not running yet.
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(self._digit_agg_handler(direction))
-
-            # Append the incoming digit.
-            if frame.button.value in self._reset_on:
-                self._aggregation = ""
-            elif frame.button.value in self._end_on:
-                await self.flush_aggregation(direction)
-                self._aggregation = ""
-            else:
-                self._digit_event.set()
-                self._aggregation += frame.button.value
-
-                # Flush if the aggregated digits reach the specified length.
-                if self._digits and len(self._aggregation) == self._digits:
-                    await self.flush_aggregation(direction)
-                    self._aggregation = ""
-                    if self._stopped_idle_processor:
-                        await self._start_idle_processor()
 
+        if isinstance(frame, InputDTMFFrame):
+            # Push the DTMF frame downstream first
+            await self.push_frame(frame, direction)
+            # Then handle it for proper frame ordering
+            await self._handle_dtmf_frame(frame)
         elif isinstance(frame, (EndFrame, CancelFrame)):
             # For EndFrame, flush any pending aggregation and stop the digit aggregation task.
             if self._aggregation:
-                await self.flush_aggregation(direction)
-            if self._digit_aggregate_task:
-                await self.…
+                await self.flush_aggregation()
+            if self._aggregation_task:
+                await self._stop_aggregation_task()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, WaitForDTMFFrame):
             self.logger.debug("Received WaitForDTMFFrame: Waiting for DTMF input")
-            …
-            …
-            …
-            …
-            self._digit_event.set()
-            await self._stop_idle_processor()
+            self._create_aggregation_task(raise_timeout=True)
+            self._digit_event.set()  # Trigger the timeout handler
+            await self._start_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame…
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            self.logger.debug("Received StartInterruptionFrame")
             if self._aggregation:
-                await self.flush_aggregation(direction)
+                await self.flush_aggregation()
+            await self._end_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, BotSpeakingFrame):
-            if self._digit_aggregate_task is not None:
+            # Signal the aggregation task to continue when bot speaks
+            if self._aggregation_task is not None:
                 self._digit_event.set()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, DTMFUpdateSettingsFrame):
             await self._update_settings(frame.settings)
             # Don't pass the settings frame downstream
+        else:
+            # Pass all other frames through
+            await self.push_frame(frame, direction)
 
     async def _update_settings(self, settings: dict) -> None:
         """Update DTMF aggregator settings dynamically.
-
+
         Args:
             settings: Dictionary containing new DTMF settings
                 Supported keys: timeout, digits, end, reset
         """
         settings_changed = False
-
+
         if "timeout" in settings and settings["timeout"] is not None:
             new_timeout = float(settings["timeout"])
             if new_timeout != self._idle_timeout:
-                self.logger.debug(f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}")
+                self.logger.debug(
+                    f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}"
+                )
                 self._idle_timeout = new_timeout
                 settings_changed = True
-
+
         if "digits" in settings:
             new_digits = settings["digits"]
             if new_digits != self._digits:
-                self.logger.debug(f"Updating DTMF digits from {self._digits} to {new_digits}")
+                self.logger.debug(
+                    f"Updating DTMF digits from {self._digits} to {new_digits}"
+                )
                 self._digits = new_digits
                 settings_changed = True
-
+
         if "end" in settings:
             # Convert single string to set if needed
             end_value = settings["end"]
@@ -142,12 +123,14 @@ class DTMFAggregator(FrameProcessor):
                 new_end_on = {end_value} if end_value else set()
             else:
                 new_end_on = set(end_value)
-
+
             if new_end_on != self._end_on:
-                self.logger.debug(f"Updating DTMF end_on from {self._end_on} to {new_end_on}")
+                self.logger.debug(
+                    f"Updating DTMF end_on from {self._end_on} to {new_end_on}"
+                )
                 self._end_on = new_end_on
                 settings_changed = True
-
+
         if "reset" in settings:
             # Convert single string to set if needed
             reset_value = settings["reset"]
```
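The settings branch above is driven by `DTMFUpdateSettingsFrame`, whose `settings` dict is read for the keys `timeout`, `digits`, `end`, and `reset`. A hedged sketch of sending one, assuming the frame accepts the dict as a `settings` field (its constructor is not shown in this diff):

```python
from pipecat.frames.frames import DTMFUpdateSettingsFrame

frame = DTMFUpdateSettingsFrame(
    settings={
        "timeout": 5.0,  # idle seconds before the aggregation is flushed
        "digits": 4,     # flush automatically once 4 digits are collected
        "end": "#",      # pressing '#' flushes immediately
        "reset": "*",    # pressing '*' clears the collected digits
    }
)
# Inside a processor one would then do: await self.push_frame(frame)
```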
pipecat/processors/dtmf_aggregator.py (continued):

```diff
@@ -157,58 +140,116 @@ class DTMFAggregator(FrameProcessor):
                 new_reset_on = {reset_value} if reset_value else set()
             else:
                 new_reset_on = set(reset_value)
-
+
             if new_reset_on != self._reset_on:
-                self.logger.debug(f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}")
+                self.logger.debug(
+                    f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}"
+                )
                 self._reset_on = new_reset_on
                 settings_changed = True
-
+
         if settings_changed:
             self.logger.info(f"DTMF settings updated successfully")
 
-    async def …
-        """…
-        …
-        …
+    async def _handle_dtmf_frame(self, frame: InputDTMFFrame):
+        """Handle DTMF input frame processing."""
+        # Create aggregation task if needed
+        if self._aggregation_task is None:
+            self._create_aggregation_task()
+
+        digit_value = frame.button.value
+
+        # Handle reset digits
+        if digit_value in self._reset_on:
+            self._aggregation = ""
+            return
+
+        # Handle end digits
+        if digit_value in self._end_on:
+            if self._aggregation:  # Only flush if we have aggregation
+                await self.flush_aggregation()
+            return
+
+        # Add digit to aggregation
+        self._aggregation += digit_value
+
+        # Signal the aggregation task that a digit was received
+        self._digit_event.set()
+
+        # Check if we reached the digit limit
+        if self._digits and len(self._aggregation) == self._digits:
+            await self.flush_aggregation()
+
+    def _create_aggregation_task(self, raise_timeout: bool = False) -> None:
+        """Creates the aggregation task if it hasn't been created yet."""
+        if not self._aggregation_task:
+            self._aggregation_task = self.create_task(
+                self._aggregation_task_handler(raise_timeout)
+            )
+
+    async def _stop_aggregation_task(self) -> None:
+        """Stops the aggregation task."""
+        if self._aggregation_task:
+            await self.cancel_task(self._aggregation_task)
+            self._aggregation_task = None
+
+    async def _aggregation_task_handler(self, raise_timeout=False):
+        """Background task that handles timeout-based flushing."""
         while True:
             try:
                 # Wait for a new digit signal with a timeout.
-                await asyncio.wait_for(…
-                …
-                …
-                await self.flush_aggregation(direction, raise_timeout)
-            finally:
-                # Clear the event for the next cycle.
+                await asyncio.wait_for(
+                    self._digit_event.wait(), timeout=self._idle_timeout
+                )
                 self._digit_event.clear()
+            except asyncio.TimeoutError:
+                # No new digit arrived within the timeout period; flush if needed
+                await self.flush_aggregation(raise_timeout=raise_timeout)
 
-    async def flush_aggregation(self, direction: FrameDirection, raise_timeout: bool = False):
+    async def flush_aggregation(self, *, raise_timeout: bool = False):
         """Flush the aggregated digits by emitting a TranscriptionFrame downstream."""
         if self._aggregation:
-            # …
+            # Create transcription frame
             aggregated_frame = TranscriptionFrame(
                 f"User inputted: {self._aggregation}.", "", time_now_iso8601()
             )
             aggregated_frame.metadata["push_aggregation"] = True
-            …
-            …
+
+            # Send interruption frame (as per original design)
+            await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+
+            # Push the transcription frame
+            await self.push_frame(aggregated_frame, FrameDirection.DOWNSTREAM)
+
+            # Reset state
             self._aggregation = ""
-        …
+            await self._end_dtmf_capture()
+
+        elif raise_timeout and not self._aggregation:
+            # Timeout with no aggregation (WaitForDTMFFrame case)
             transcript_frame = TranscriptionFrame(
                 "User didn't press any digits on the keyboard.", "", time_now_iso8601()
             )
             transcript_frame.metadata["push_aggregation"] = True
-            await self.push_frame(transcript_frame)
-            …
-            …
+            await self.push_frame(transcript_frame, FrameDirection.DOWNSTREAM)
+            await self._end_dtmf_capture()
+
+    async def _start_dtmf_capture(self):
+        """Signal the start of DTMF capture upstream."""
+        if self._dtmf_capture_active:
+            return
+        await self.push_frame(StartDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = True
 
-    async def …
-        """…
-        if self.…
-        …
-        …
+    async def _end_dtmf_capture(self):
+        """Signal the end of DTMF capture upstream."""
+        if not self._dtmf_capture_active:
+            return
+        await self.push_frame(EndDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = False
 
     async def cleanup(self) -> None:
         """Cleans up resources, ensuring that the digit aggregation task is cancelled."""
         await super().cleanup()
-        if self._digit_aggregate_task:
-            await self.…
+        if self._aggregation_task:
+            await self._stop_aggregation_task()
```
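The rewritten handler is a common asyncio idiom: the task waits on an `asyncio.Event` with a timeout, each incoming digit sets the event (postponing the flush), and a timeout flushes whatever was collected. A self-contained toy version of just that loop, with all names illustrative:

```python
import asyncio


async def demo():
    digit_event = asyncio.Event()
    digits: list[str] = []

    async def aggregation_handler():
        while True:
            try:
                # A digit arrived in time: clear the event and keep waiting.
                await asyncio.wait_for(digit_event.wait(), timeout=0.5)
                digit_event.clear()
            except asyncio.TimeoutError:
                # Idle timeout: flush the aggregation, if any.
                if digits:
                    print("flush:", "".join(digits))
                    digits.clear()

    task = asyncio.create_task(aggregation_handler())
    for digit in "123":
        digits.append(digit)
        digit_event.set()
        await asyncio.sleep(0.1)
    await asyncio.sleep(1.0)  # stay idle long enough to trigger the flush
    task.cancel()


asyncio.run(demo())  # prints: flush: 123
```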
pipecat/processors/filters/stt_mute_filter.py:

```diff
@@ -25,14 +25,17 @@ from pipecat.frames.frames import (
     FunctionCallResultFrame,
     InputAudioRawFrame,
     InterimTranscriptionFrame,
+    InterruptionFrame,
     StartFrame,
     StartInterruptionFrame,
+    StartDTMFCaptureFrame,
     STTMuteFrame,
     TranscriptionFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
     VADUserStartedSpeakingFrame,
     VADUserStoppedSpeakingFrame,
+    EndDTMFCaptureFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
@@ -58,6 +61,7 @@ class STTMuteStrategy(Enum):
     FUNCTION_CALL = "function_call"
     ALWAYS = "always"
     CUSTOM = "custom"
+    DTMF_CAPTURE = "dtmf_capture"
 
 
 @dataclass
@@ -120,6 +124,7 @@ class STTMuteFilter(FrameProcessor):
         self._function_call_in_progress = False
         self._is_muted = False  # Initialize as unmuted, will set state on StartFrame if needed
         self._voicemail_detection_enabled = False  # Default to False
+        self._dtmf_capture_active = False
 
     @property
     def is_muted(self) -> bool:
@@ -165,6 +170,10 @@ class STTMuteFilter(FrameProcessor):
                 if should_mute:
                     return True
 
+            case STTMuteStrategy.DTMF_CAPTURE:
+                if self._dtmf_capture_active:
+                    return True
+
         return False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -205,12 +214,20 @@ class STTMuteFilter(FrameProcessor):
                 self._first_speech_handled = True
             should_mute = await self._should_mute()
             self.logger.debug(f"BotStoppedSpeaking: should mute={should_mute}")
+        elif isinstance(frame, StartDTMFCaptureFrame):
+            self._dtmf_capture_active = True
+            should_mute = await self._should_mute()
+        elif isinstance(frame, EndDTMFCaptureFrame):
+            self._dtmf_capture_active = False
+            should_mute = await self._should_mute()
         elif isinstance(frame, STTMuteFrame):
+            # TODO: Frame duplication is happening here: we receive this frame from downstream and push it downstream again, and we also push it upstream here and then upstream again in _handle_mute_state.
             should_mute = frame.mute
 
         # Then push the original frame
         # Conditionally include InputAudioRawFrame in suppression tuple based on voicemail_detection_enabled
         suppression_types = (
+            InterruptionFrame,
             StartInterruptionFrame,
             VADUserStartedSpeakingFrame,
             VADUserStoppedSpeakingFrame,
```