dv-pipecat-ai 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.82.dev68
+Version: 0.0.82.dev70
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.82.dev68.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.82.dev70.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -74,7 +74,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=g3L1m3cPJzsadeB5a8WRC9klH0D8m7xfPgB2YEaL6Do,29983
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=ASeOObRvTRwbFBCXOHVEiKyLZZjZLhfouXIBhccEsa0,45163
+pipecat/frames/frames.py,sha256=oqoo7p-uJOqak50mxhCGq7S0TusM0I4qp3QAftKHQnw,45428
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -102,7 +102,7 @@ pipecat/pipeline/to_be_updated/merge_pipeline.py,sha256=jLEWdufIW3z1xZhdoLowdJ_S
 pipecat/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/processors/async_generator.py,sha256=qPOZxk5eOad_NrF_Z06vWZ6deXIxb9AKZKYO2e5pkJs,2385
 pipecat/processors/consumer_processor.py,sha256=DrWCKnfblknZJ0bLmR_unIeJ1axQw4IPUn2IB3KLGGA,3228
-pipecat/processors/dtmf_aggregator.py,sha256=Qucrbq66Oj3cFZV_uDLcVmLk44xJ2_9h4lDDo1an3dE,9406
+pipecat/processors/dtmf_aggregator.py,sha256=mo_IXUlsnVl-_Xn8sbTGnRF4Lkts0h6E3uauGbeFyWs,10204
 pipecat/processors/frame_processor.py,sha256=VlU1h01FUilQ9UGzn7uuXELtNaASwbiMQxPChySJ7_g,29727
 pipecat/processors/idle_frame_processor.py,sha256=z8AuhGap61lA5K35P6XCaOpn4kkmK_9NZNppbpQxheU,3124
 pipecat/processors/logger.py,sha256=VGNwxQSc_F0rS3KBmfqas7f5aFyRQKfeljozOxfGXk4,2393
@@ -110,7 +110,7 @@ pipecat/processors/producer_processor.py,sha256=iIIOHZd77APvUGP7JqFbznAHUnCULcq_
 pipecat/processors/text_transformer.py,sha256=LnfWJYzntJhZhrQ1lgSSY4D4VbHtrQJgrC227M69ZYU,1718
 pipecat/processors/transcript_processor.py,sha256=CG9yej6WOiy_HhagNXjxkISHkHii0JDfK_V6opseC2E,11740
 pipecat/processors/two_stage_user_idle_processor.py,sha256=uf2aZh_lfW-eMxmFogP3R4taAJ1yXOSqjKsR7oXtD0Y,2938
-pipecat/processors/user_idle_processor.py,sha256=mGYv6UYxU7Qbgg4pTuGxDmZxnlyEtwMWaXtrQ9_fvaY,7969
+pipecat/processors/user_idle_processor.py,sha256=qRBDzbXBQp07qV7Uh_p4-349BE2Un6hg2iqIAmNGcT0,8562
 pipecat/processors/aggregators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/processors/aggregators/dtmf_aggregator.py,sha256=nngjLiaOtcZtuCNpYPyfUVLtUBUC6USuvS0tSdA9_zo,5054
 pipecat/processors/aggregators/gated.py,sha256=tii0sRrBkRW6y9Xq5iTWPnqlOEejU4VqPIPtdOa61pc,3073
@@ -129,7 +129,7 @@ pipecat/processors/filters/frame_filter.py,sha256=ZPtHToASfbbtwAdrnQH8POKIvT8hF0
 pipecat/processors/filters/function_filter.py,sha256=QNQZBIe1gzSPNI_4Zg2fgyeUhX-AmbIMp7r_XWNhwjU,2400
 pipecat/processors/filters/identity_filter.py,sha256=YNQWNNnuHivNwJa71Gc7A6ZHHq5Zw_kvuNrq9LUKK44,1418
 pipecat/processors/filters/null_filter.py,sha256=CourFfNXyhaesksiBuXxv5-mFSDpy6e9bOJ04p3iK40,1467
-pipecat/processors/filters/stt_mute_filter.py,sha256=BP1PX2Ka80ZZV1Mpp4OH9xA3V6cntsseQ7VUnXREWnw,9356
+pipecat/processors/filters/stt_mute_filter.py,sha256=a9Pgp-z1pNQtDIKBtzdP4yFLf-3EhAoQAd0XSXWLpsQ,10147
 pipecat/processors/filters/wake_check_filter.py,sha256=EKOuw_DCK4EWJ794xS8Xza-QQImD-pjgWYp0wdyvHjI,5099
 pipecat/processors/filters/wake_notifier_filter.py,sha256=1yV3Tw8OROCS97nuZNs4igcNvRQyYu1RG2gNvYMWxKc,2077
 pipecat/processors/frameworks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -290,7 +290,7 @@ pipecat/services/sambanova/__init__.py,sha256=oTXExLic-qTcsfsiWmssf3Elclf3IIWoN4
 pipecat/services/sambanova/llm.py,sha256=5XVfPLEk__W8ykFqLdV95ZUhlGGkAaJwmbciLdZYtTc,8976
 pipecat/services/sambanova/stt.py,sha256=ZZgEZ7WQjLFHbCko-3LNTtVajjtfUvbtVLtFcaNadVQ,2536
 pipecat/services/sarvam/__init__.py,sha256=B4TN_tTHV9fWg0aSoPvfQlXISA0nJaQ9-u08I9UWvH4,280
-pipecat/services/sarvam/stt.py,sha256=cSrQaDpixNQh4tl8r2xRNREHjKKcyLmrFDLa-Lp4Hl4,15465
+pipecat/services/sarvam/stt.py,sha256=p9Iq4loMwnftNZ_S0WoFSoX7iBbRKyja6RsVWbpj508,19314
 pipecat/services/sarvam/tts.py,sha256=K-AtWE1Q0ZZwshLP-7sCDmOSIWhuKOj91BCCE4N9XAk,25010
 pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
 pipecat/services/simli/video.py,sha256=fVMYsCE5epH9rTdhN_tyPPJw7W6TCMHCOe2akKHWduw,8330
@@ -378,7 +378,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.82.dev68.dist-info/METADATA,sha256=tRV7JwvNl-emWJwrua577U-gfTxxMtB2RY_ZeI4Qpro,32692
-dv_pipecat_ai-0.0.82.dev68.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.82.dev68.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.82.dev68.dist-info/RECORD,,
+dv_pipecat_ai-0.0.82.dev70.dist-info/METADATA,sha256=YeEWgQg0UE5-naruvtBkTnRuW-3TemsWbGjDsSz-zl4,32692
+dv_pipecat_ai-0.0.82.dev70.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.82.dev70.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.82.dev70.dist-info/RECORD,,
pipecat/frames/frames.py CHANGED
@@ -1306,6 +1306,20 @@ class SpeechControlParamsFrame(SystemFrame):
     turn_params: Optional[SmartTurnParams] = None
 
 
+@dataclass
+class StartDTMFCaptureFrame(SystemFrame):
+    """System frame indicating the bot is actively collecting DTMF input."""
+
+    pass
+
+
+@dataclass
+class EndDTMFCaptureFrame(SystemFrame):
+    """System frame indicating DTMF collection has finished."""
+
+    pass
+
+
 #
 # Control frames
 #
@@ -1476,7 +1490,7 @@ class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
 @dataclass
 class DTMFUpdateSettingsFrame(ServiceUpdateSettingsFrame):
     """Frame for updating DTMF aggregator settings.
-
+
     Updates DTMF configuration dynamically during conversation flow.
     Settings can include: timeout, digits, end, reset parameters.
    """
pipecat/processors/dtmf_aggregator.py CHANGED
@@ -4,15 +4,13 @@ from pipecat.frames.frames import (
     BotSpeakingFrame,
     CancelFrame,
     DTMFUpdateSettingsFrame,
+    EndDTMFCaptureFrame,
     EndFrame,
     Frame,
     InputDTMFFrame,
+    StartDTMFCaptureFrame,
     StartInterruptionFrame,
-    StartUserIdleProcessorFrame,
-    StopUserIdleProcessorFrame,
     TranscriptionFrame,
-    UserStartedSpeakingFrame,
-    UserStoppedSpeakingFrame,
     WaitForDTMFFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -44,95 +42,78 @@ class DTMFAggregator(FrameProcessor):
         self._idle_timeout = timeout
         self._digits = digits
         self._digit_event = asyncio.Event()
-        self._digit_aggregate_task = None
+        self._aggregation_task = None
         self._end_on = end_on if end_on else set()
         self._reset_on = reset_on if reset_on else set()
-        self._stopped_idle_processor = False
-
-    async def _start_idle_processor(self):
-        await self.push_frame(StartUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = False
-
-    async def _stop_idle_processor(self):
-        await self.push_frame(StopUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = True
+        self._dtmf_capture_active = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
         # Handle DTMF frames.
         await super().process_frame(frame, direction)
-        await self.push_frame(frame, direction)
-        if isinstance(frame, InputDTMFFrame):
-            # Start the digit aggregation task if it's not running yet.
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(self._digit_agg_handler(direction))
-
-            # Append the incoming digit.
-            if frame.button.value in self._reset_on:
-                self._aggregation = ""
-            elif frame.button.value in self._end_on:
-                await self.flush_aggregation(direction)
-                self._aggregation = ""
-            else:
-                self._digit_event.set()
-                self._aggregation += frame.button.value
-
-                # Flush if the aggregated digits reach the specified length.
-                if self._digits and len(self._aggregation) == self._digits:
-                    await self.flush_aggregation(direction)
-                    self._aggregation = ""
-                    if self._stopped_idle_processor:
-                        await self._start_idle_processor()
 
+        if isinstance(frame, InputDTMFFrame):
+            # Push the DTMF frame downstream first
+            await self.push_frame(frame, direction)
+            # Then handle it for proper frame ordering
+            await self._handle_dtmf_frame(frame)
         elif isinstance(frame, (EndFrame, CancelFrame)):
             # For EndFrame, flush any pending aggregation and stop the digit aggregation task.
             if self._aggregation:
-                await self.flush_aggregation(direction)
-            if self._digit_aggregate_task:
-                await self._stop_digit_aggregate_task()
+                await self.flush_aggregation()
+            if self._aggregation_task:
+                await self._stop_aggregation_task()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, WaitForDTMFFrame):
             self.logger.debug("Received WaitForDTMFFrame: Waiting for DTMF input")
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(
-                    self._digit_agg_handler(direction, raise_timeout=True)
-                )
-                self._digit_event.set()
-            await self._stop_idle_processor()
+            self._create_aggregation_task(raise_timeout=True)
+            self._digit_event.set()  # Trigger the timeout handler
+            await self._start_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame: Starting idle processor")
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            self.logger.debug("Received StartInterruptionFrame")
             if self._aggregation:
-                await self.flush_aggregation(direction)
+                await self.flush_aggregation()
+            await self._end_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, BotSpeakingFrame):
-            if self._digit_aggregate_task is not None:
+            # Signal the aggregation task to continue when bot speaks
+            if self._aggregation_task is not None:
                 self._digit_event.set()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, DTMFUpdateSettingsFrame):
            await self._update_settings(frame.settings)
             # Don't pass the settings frame downstream
+        else:
+            # Pass all other frames through
+            await self.push_frame(frame, direction)
 
     async def _update_settings(self, settings: dict) -> None:
         """Update DTMF aggregator settings dynamically.
-
+
         Args:
             settings: Dictionary containing new DTMF settings
                 Supported keys: timeout, digits, end, reset
         """
         settings_changed = False
-
+
         if "timeout" in settings and settings["timeout"] is not None:
             new_timeout = float(settings["timeout"])
             if new_timeout != self._idle_timeout:
-                self.logger.debug(f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}")
+                self.logger.debug(
+                    f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}"
+                )
                 self._idle_timeout = new_timeout
                 settings_changed = True
-
+
         if "digits" in settings:
             new_digits = settings["digits"]
             if new_digits != self._digits:
-                self.logger.debug(f"Updating DTMF digits from {self._digits} to {new_digits}")
+                self.logger.debug(
+                    f"Updating DTMF digits from {self._digits} to {new_digits}"
+                )
                 self._digits = new_digits
                 settings_changed = True
-
+
         if "end" in settings:
             # Convert single string to set if needed
             end_value = settings["end"]
@@ -142,12 +123,14 @@
                 new_end_on = {end_value} if end_value else set()
             else:
                 new_end_on = set(end_value)
-
+
             if new_end_on != self._end_on:
-                self.logger.debug(f"Updating DTMF end_on from {self._end_on} to {new_end_on}")
+                self.logger.debug(
+                    f"Updating DTMF end_on from {self._end_on} to {new_end_on}"
+                )
                 self._end_on = new_end_on
                 settings_changed = True
-
+
         if "reset" in settings:
             # Convert single string to set if needed
             reset_value = settings["reset"]
@@ -157,58 +140,116 @@
                 new_reset_on = {reset_value} if reset_value else set()
             else:
                 new_reset_on = set(reset_value)
-
+
             if new_reset_on != self._reset_on:
-                self.logger.debug(f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}")
+                self.logger.debug(
+                    f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}"
+                )
                 self._reset_on = new_reset_on
                 settings_changed = True
-
+
         if settings_changed:
             self.logger.info(f"DTMF settings updated successfully")
 
-    async def _digit_agg_handler(self, direction: FrameDirection, raise_timeout=False):
-        """Idle task that waits for new DTMF activity. If no new digit is received within
-        the timeout period, the current aggregation is flushed.
-        """
+    async def _handle_dtmf_frame(self, frame: InputDTMFFrame):
+        """Handle DTMF input frame processing."""
+        # Create aggregation task if needed
+        if self._aggregation_task is None:
+            self._create_aggregation_task()
+
+        digit_value = frame.button.value
+
+        # Handle reset digits
+        if digit_value in self._reset_on:
+            self._aggregation = ""
+            return
+
+        # Handle end digits
+        if digit_value in self._end_on:
+            if self._aggregation:  # Only flush if we have aggregation
+                await self.flush_aggregation()
+            return
+
+        # Add digit to aggregation
+        self._aggregation += digit_value
+
+        # Signal the aggregation task that a digit was received
+        self._digit_event.set()
+
+        # Check if we reached the digit limit
+        if self._digits and len(self._aggregation) == self._digits:
+            await self.flush_aggregation()
+
+    def _create_aggregation_task(self, raise_timeout: bool = False) -> None:
+        """Creates the aggregation task if it hasn't been created yet."""
+        if not self._aggregation_task:
+            self._aggregation_task = self.create_task(
+                self._aggregation_task_handler(raise_timeout)
+            )
+
+    async def _stop_aggregation_task(self) -> None:
+        """Stops the aggregation task."""
+        if self._aggregation_task:
+            await self.cancel_task(self._aggregation_task)
+            self._aggregation_task = None
+
+    async def _aggregation_task_handler(self, raise_timeout=False):
+        """Background task that handles timeout-based flushing."""
         while True:
             try:
                 # Wait for a new digit signal with a timeout.
-                await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
-            except asyncio.TimeoutError:
-                # No new digit arrived within the timeout period; flush aggregation if non-empty.
-                await self.flush_aggregation(direction, raise_timeout)
-            finally:
-                # Clear the event for the next cycle.
+                await asyncio.wait_for(
+                    self._digit_event.wait(), timeout=self._idle_timeout
+                )
                 self._digit_event.clear()
+            except asyncio.TimeoutError:
+                # No new digit arrived within the timeout period; flush if needed
+                await self.flush_aggregation(raise_timeout=raise_timeout)
 
-    async def flush_aggregation(self, direction: FrameDirection, raise_timeout=False):
+    async def flush_aggregation(self, *, raise_timeout: bool = False):
         """Flush the aggregated digits by emitting a TranscriptionFrame downstream."""
         if self._aggregation:
-            # Todo: Change to different frame type if we decide to handle it in llm processor separately.
+            # Create transcription frame
             aggregated_frame = TranscriptionFrame(
                 f"User inputted: {self._aggregation}.", "", time_now_iso8601()
             )
             aggregated_frame.metadata["push_aggregation"] = True
-            await self.push_frame(StartInterruptionFrame())
-            await self.push_frame(aggregated_frame, direction)
+
+            # Send interruption frame (as per original design)
+            await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+
+            # Push the transcription frame
+            await self.push_frame(aggregated_frame, FrameDirection.DOWNSTREAM)
+
+            # Reset state
             self._aggregation = ""
-        elif raise_timeout and self._stopped_idle_processor:
+            await self._end_dtmf_capture()
+
+        elif raise_timeout and not self._aggregation:
+            # Timeout with no aggregation (WaitForDTMFFrame case)
             transcript_frame = TranscriptionFrame(
                 "User didn't press any digits on the keyboard.", "", time_now_iso8601()
             )
             transcript_frame.metadata["push_aggregation"] = True
-            await self.push_frame(transcript_frame)
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            await self.push_frame(transcript_frame, FrameDirection.DOWNSTREAM)
+            await self._end_dtmf_capture()
+
+    async def _start_dtmf_capture(self):
+        """Signal the start of DTMF capture upstream."""
+        if self._dtmf_capture_active:
+            return
+        await self.push_frame(StartDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = True
 
-    async def _stop_digit_aggregate_task(self):
-        """Cancels the digit aggregation task if it exists."""
-        if self._digit_aggregate_task:
-            await self.cancel_task(self._digit_aggregate_task)
-            self._digit_aggregate_task = None
+    async def _end_dtmf_capture(self):
+        """Signal the end of DTMF capture upstream."""
+        if not self._dtmf_capture_active:
+            return
+        await self.push_frame(EndDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = False
 
     async def cleanup(self) -> None:
         """Cleans up resources, ensuring that the digit aggregation task is cancelled."""
         await super().cleanup()
-        if self._digit_aggregate_task:
-            await self._stop_digit_aggregate_task()
+        if self._aggregation_task:
+            await self._stop_aggregation_task()
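
Note: the rework replaces the Start/StopUserIdleProcessorFrame round-trips with the Start/EndDTMFCaptureFrame pair, but configuration is unchanged. A usage sketch (the constructor keyword names timeout, digits, end_on, and reset_on are inferred from the assignments above, so treat the exact signature as an assumption):

from pipecat.frames.frames import DTMFUpdateSettingsFrame
from pipecat.processors.dtmf_aggregator import DTMFAggregator

# Flush after 3 s of digit silence, or as soon as 4 digits arrive;
# '#' ends the entry early, '*' clears whatever was typed so far.
aggregator = DTMFAggregator(timeout=3.0, digits=4, end_on={"#"}, reset_on={"*"})

# Settings can be changed mid-call; the dict keys match _update_settings:
# timeout, digits, end, reset.
update_frame = DTMFUpdateSettingsFrame(settings={"timeout": 5.0, "digits": 6})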
pipecat/processors/filters/stt_mute_filter.py CHANGED
@@ -27,12 +27,14 @@ from pipecat.frames.frames import (
     InterimTranscriptionFrame,
     StartFrame,
     StartInterruptionFrame,
+    StartDTMFCaptureFrame,
     STTMuteFrame,
     TranscriptionFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
     VADUserStartedSpeakingFrame,
     VADUserStoppedSpeakingFrame,
+    EndDTMFCaptureFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
@@ -58,6 +60,7 @@ class STTMuteStrategy(Enum):
     FUNCTION_CALL = "function_call"
     ALWAYS = "always"
     CUSTOM = "custom"
+    DTMF_CAPTURE = "dtmf_capture"
 
 
 @dataclass
@@ -120,6 +123,7 @@ class STTMuteFilter(FrameProcessor):
         self._function_call_in_progress = False
         self._is_muted = False  # Initialize as unmuted, will set state on StartFrame if needed
         self._voicemail_detection_enabled = False  # Default to False
+        self._dtmf_capture_active = False
 
     @property
     def is_muted(self) -> bool:
@@ -165,6 +169,10 @@
                 if should_mute:
                     return True
 
+            case STTMuteStrategy.DTMF_CAPTURE:
+                if self._dtmf_capture_active:
+                    return True
+
         return False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -205,7 +213,14 @@
             self._first_speech_handled = True
             should_mute = await self._should_mute()
             self.logger.debug(f"BotStoppedSpeaking: should mute={should_mute}")
+        elif isinstance(frame, StartDTMFCaptureFrame):
+            self._dtmf_capture_active = True
+            should_mute = await self._should_mute()
+        elif isinstance(frame, EndDTMFCaptureFrame):
+            self._dtmf_capture_active = False
+            should_mute = await self._should_mute()
         elif isinstance(frame, STTMuteFrame):
+            # TODO: Frame duplication is happening here: we receive this frame from downstream and push it downstream again, and we also push it upstream twice via _handle_mute_state.
             should_mute = frame.mute
 
         # Then push the original frame
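
Note: with the new strategy, muting during digit entry becomes declarative. A configuration sketch (STTMuteConfig and its strategies field follow the upstream pipecat API and are assumed here, since only the enum appears in this hunk):

from pipecat.processors.filters.stt_mute_filter import (
    STTMuteConfig,
    STTMuteFilter,
    STTMuteStrategy,
)

# Mute STT while the DTMFAggregator reports an active capture window,
# and keep the existing function-call muting behavior.
stt_mute = STTMuteFilter(
    config=STTMuteConfig(
        strategies={STTMuteStrategy.DTMF_CAPTURE, STTMuteStrategy.FUNCTION_CALL}
    )
)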
pipecat/processors/user_idle_processor.py CHANGED
@@ -15,17 +15,17 @@ from loguru import logger
 from pipecat.frames.frames import (
     BotSpeakingFrame,
     CancelFrame,
+    EndDTMFCaptureFrame,
     EndFrame,
     Frame,
     FunctionCallCancelFrame,
     FunctionCallInProgressFrame,
     FunctionCallResultFrame,
     InputDTMFFrame,
+    StartDTMFCaptureFrame,
+    StartFrame,
     StartUserIdleProcessorFrame,
     StopUserIdleProcessorFrame,
-    FunctionCallInProgressFrame,
-    FunctionCallResultFrame,
-    StartFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
 )
@@ -83,6 +83,8 @@ class UserIdleProcessor(FrameProcessor):
         self._timeout = timeout
         self._retry_count = 0
         self._interrupted = False
+        self._function_call_active = False
+        self._dtmf_capture_active = False
         self._conversation_started = False
         self._idle_task = None
         self._idle_event = asyncio.Event()
@@ -180,10 +182,20 @@
             self._idle_event.set()
         elif isinstance(frame, FunctionCallInProgressFrame):
             # Function calls can take longer than the timeout, so we want to prevent idle callbacks
+            self._function_call_active = True
             self._interrupted = True
             self._idle_event.set()
         elif isinstance(frame, FunctionCallResultFrame):
-            self._interrupted = False
+            self._function_call_active = False
+            self._interrupted = self._dtmf_capture_active
+            self._idle_event.set()
+        elif isinstance(frame, StartDTMFCaptureFrame):
+            self._dtmf_capture_active = True
+            self._interrupted = True
+            self._idle_event.set()
+        elif isinstance(frame, EndDTMFCaptureFrame):
+            self._dtmf_capture_active = False
+            self._interrupted = self._function_call_active
             self._idle_event.set()
         elif isinstance(frame, StartUserIdleProcessorFrame):
             if not self._idle_task:
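
Note: the net effect is that idle callbacks stay suppressed while either a function call or a DTMF capture is in flight, and resume only once both have ended. Usage is unchanged; a sketch using the retry-style callback from upstream pipecat:

from pipecat.processors.user_idle_processor import UserIdleProcessor

async def handle_user_idle(processor: UserIdleProcessor, retry_count: int) -> bool:
    # Re-prompt twice, then give up (False stops further idle callbacks).
    return retry_count <= 2

# No idle callback fires between StartDTMFCaptureFrame and EndDTMFCaptureFrame,
# however long the caller takes to dial.
user_idle = UserIdleProcessor(callback=handle_user_idle, timeout=8.0)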
pipecat/services/sarvam/stt.py CHANGED
@@ -31,6 +31,9 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 try:
     import websockets
     from sarvamai import AsyncSarvamAI
+    from sarvamai.speech_to_text_streaming.socket_client import (
+        AsyncSpeechToTextStreamingSocketClient,
+    )
     from sarvamai.speech_to_text_translate_streaming.socket_client import (
         AsyncSpeechToTextTranslateStreamingSocketClient,
     )
@@ -41,11 +44,11 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
-def language_to_sarvam_language(language: Language) -> str:
-    """Convert Language enum to Sarvam language code.
+def language_to_sarvam_language(language) -> str:
+    """Convert Language enum or string to Sarvam language code.
 
     Args:
-        language: The Language enum to convert.
+        language: The Language enum or language code string to convert.
 
     Returns:
         The corresponding Sarvam language code string.
@@ -53,6 +56,30 @@ def language_to_sarvam_language(language: Language) -> str:
     Raises:
         ValueError: If the language is not supported by Sarvam.
     """
+    # If already a string in the right format, return it
+    if isinstance(language, str):
+        if "-" in language:  # Already in format like "hi-IN"
+            return language
+        # Convert short codes to full format
+        lang_map = {
+            "hi": "hi-IN",
+            "bn": "bn-IN",
+            "gu": "gu-IN",
+            "kn": "kn-IN",
+            "ml": "ml-IN",
+            "mr": "mr-IN",
+            "ta": "ta-IN",
+            "te": "te-IN",
+            "pa": "pa-IN",
+            "or": "od-IN",
+            "as": "as-IN",
+            "en": "en-IN",
+        }
+        if language.lower() in lang_map:
+            return lang_map[language.lower()]
+        raise ValueError(f"Unsupported language string: {language}")
+
+    # Handle Language enum
     match language:
         case Language.BN_IN:
             return "bn-IN"
@@ -133,6 +160,13 @@ class SarvamSTTService(STTService):
     """Sarvam speech-to-text service.
 
     Provides real-time speech recognition using Sarvam's WebSocket API.
+    Supports both Saarika (transcription) and Saaras (translation) models.
+
+    Models:
+    - Saarika (saarika:v2.5): Transcription in a single language
+    - Saaras (saaras:v2.5): Translation from source language to target language
+
+    The service automatically selects the correct endpoint based on the model name.
     """
 
     def __init__(
@@ -253,6 +287,7 @@
         # Convert audio bytes to base64 for Sarvam API
         audio_base64 = base64.b64encode(audio).decode("utf-8")
 
+        # Sarvam requires 'audio/wav' encoding (even for raw PCM data)
         message = {
             "audio": {
                 "data": audio_base64,
@@ -273,33 +308,47 @@
     async def _connect(self):
         """Connect to Sarvam WebSocket API directly."""
-        logger.debug("Connecting to Sarvam")
+        logger.debug(f"Connecting to Sarvam with model: {self._model}")
 
         try:
-            # Build WebSocket URL and headers manually
-            ws_url = (
-                self._client._client_wrapper.get_environment().production
-                + "/speech-to-text-translate/ws"
-            )
+            base_url = self._client._client_wrapper.get_environment().production
+
+            # Choose endpoint and socket class based on model
+            if self._model.startswith("saarika"):
+                # Saarika = Transcription endpoint
+                path = "/speech-to-text/ws"
+                query_params = {
+                    "language-code": language_to_sarvam_language(self._language),
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextStreamingSocketClient
+                logger.debug(
+                    f"Using Saarika transcription endpoint with language: {self._language}"
+                )
+            else:
+                # Saaras = Translation endpoint
+                path = "/speech-to-text-translate/ws"
+                query_params = {
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextTranslateStreamingSocketClient
+                logger.debug("Using Saaras translation endpoint")
 
-            # Add query parameters
-            query_params = {"model": self._model, "vad_signals": "true"}
-            query_string = urlencode(query_params)
-            ws_url = ws_url + f"?{query_string}"
+            ws_url = f"{base_url}{path}?{urlencode(query_params)}"
 
             # Get headers
             headers = self._client._client_wrapper.get_headers()
             headers["Api-Subscription-Key"] = self._api_key
 
-            # Connect to WebSocket directly
+            # Connect to WebSocket
             self._websocket_connection = await websockets.connect(
                 ws_url, additional_headers=headers
             )
 
             # Create the socket client wrapper
-            self._websocket = AsyncSpeechToTextTranslateStreamingSocketClient(
-                websocket=self._websocket_connection
-            )
+            self._websocket = socket_cls(websocket=self._websocket_connection)
 
             # Start listening for messages
             self._listening_task = asyncio.create_task(self._listen_for_messages())
@@ -309,7 +358,10 @@
         except websockets.exceptions.InvalidStatusCode as e:
             error_msg = f"Failed to connect to Sarvam: HTTP {e.status_code}"
             if e.status_code == 403:
-                error_msg += f" - Access denied. Your API key may not have access to model '{self._model}'. Available models: saaras:v2, saaras:v2.5"
+                if self._model.startswith("saarika"):
+                    error_msg += f" - Access denied. Check: 1) API key has Saarika access, 2) Model '{self._model}' exists (try saarika:v2.5), 3) Using correct endpoint (transcription)"
+                else:
+                    error_msg += f" - Access denied. Check: 1) API key has Saaras access, 2) Model '{self._model}' exists (try saaras:v2.5), 3) Using correct endpoint (translation)"
             elif e.status_code == 401:
                 error_msg += " - Invalid API key"
             logger.error(error_msg)
@@ -370,21 +422,60 @@
     async def _handle_response(self, response):
         """Handle transcription response from Sarvam.
 
+        Handles both Saarika (transcription) and Saaras (translation) message formats.
+
         Args:
             response: The response object from Sarvam WebSocket.
         """
         logger.debug(f"Received response: {response}")
 
         try:
-            if response["type"] == "error":
+            msg_type = response.get("type")
+
+            # Error handling
+            if msg_type == "error":
                 error_msg = response.get("data", {}).get("message", "Unknown error")
                 logger.error(f"Sarvam API error: {error_msg}")
                 await self.push_error(ErrorFrame(f"Sarvam API error: {error_msg}"))
-                # Close connection on error
                 await self._disconnect()
                 return
 
-            if response["type"] == "events":
+            # Modern Saarika/Saaras message format
+            if msg_type == "speech_start":
+                await self.start_metrics()
+                logger.debug("User started speaking")
+                await self._call_event_handler("on_speech_started")
+                return
+
+            if msg_type == "speech_end":
+                logger.debug("User stopped speaking")
+                await self._call_event_handler("on_speech_ended")
+                return
+
+            if msg_type == "transcript":
+                await self.stop_ttfb_metrics()
+                # Handle both Saarika (text) and Saaras (text + text_translated)
+                transcript = response.get("text") or response.get("text_translated") or ""
+                language_code = (
+                    response.get("source_language_code") or response.get("language_code") or "hi-IN"
+                )
+                language = self._map_language_code_to_enum(language_code)
+
+                if transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+
+            # Legacy format (backward compatibility)
+            if msg_type == "events":
                 parsed = EventResponse(**response)
                 signal = parsed.data.signal_type
                 timestamp = parsed.data.occured_at
@@ -397,14 +488,13 @@
                 elif signal == VADSignal.END:
                     logger.debug("User stopped speaking")
                     await self._call_event_handler("on_speech_ended")
+                return
 
-            elif response["type"] == "data":
+            if msg_type == "data":
                 await self.stop_ttfb_metrics()
                 parsed = TranscriptionResponse(**response)
                 transcript = parsed.data.transcript
-                language_code = parsed.data.language_code
-                if language_code is None:
-                    language_code = "hi-IN"
+                language_code = parsed.data.language_code or "hi-IN"
                 language = self._map_language_code_to_enum(language_code)
 
                 if transcript and transcript.strip():
@@ -417,8 +507,8 @@
                             result=response,
                         )
                     )
-
                 await self.stop_processing_metrics()
+                return
 
         except Exception as e:
             logger.error(f"Error handling Sarvam response: {e}")