dv-pipecat-ai 0.0.82.dev68__py3-none-any.whl → 0.0.82.dev70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.82.dev68
+Version: 0.0.82.dev70
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.82.dev68.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.82.dev70.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -74,7 +74,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=g3L1m3cPJzsadeB5a8WRC9klH0D8m7xfPgB2YEaL6Do,29983
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=ASeOObRvTRwbFBCXOHVEiKyLZZjZLhfouXIBhccEsa0,45163
+pipecat/frames/frames.py,sha256=oqoo7p-uJOqak50mxhCGq7S0TusM0I4qp3QAftKHQnw,45428
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -102,7 +102,7 @@ pipecat/pipeline/to_be_updated/merge_pipeline.py,sha256=jLEWdufIW3z1xZhdoLowdJ_S
 pipecat/processors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/processors/async_generator.py,sha256=qPOZxk5eOad_NrF_Z06vWZ6deXIxb9AKZKYO2e5pkJs,2385
 pipecat/processors/consumer_processor.py,sha256=DrWCKnfblknZJ0bLmR_unIeJ1axQw4IPUn2IB3KLGGA,3228
-pipecat/processors/dtmf_aggregator.py,sha256=Qucrbq66Oj3cFZV_uDLcVmLk44xJ2_9h4lDDo1an3dE,9406
+pipecat/processors/dtmf_aggregator.py,sha256=mo_IXUlsnVl-_Xn8sbTGnRF4Lkts0h6E3uauGbeFyWs,10204
 pipecat/processors/frame_processor.py,sha256=VlU1h01FUilQ9UGzn7uuXELtNaASwbiMQxPChySJ7_g,29727
 pipecat/processors/idle_frame_processor.py,sha256=z8AuhGap61lA5K35P6XCaOpn4kkmK_9NZNppbpQxheU,3124
 pipecat/processors/logger.py,sha256=VGNwxQSc_F0rS3KBmfqas7f5aFyRQKfeljozOxfGXk4,2393
@@ -110,7 +110,7 @@ pipecat/processors/producer_processor.py,sha256=iIIOHZd77APvUGP7JqFbznAHUnCULcq_
 pipecat/processors/text_transformer.py,sha256=LnfWJYzntJhZhrQ1lgSSY4D4VbHtrQJgrC227M69ZYU,1718
 pipecat/processors/transcript_processor.py,sha256=CG9yej6WOiy_HhagNXjxkISHkHii0JDfK_V6opseC2E,11740
 pipecat/processors/two_stage_user_idle_processor.py,sha256=uf2aZh_lfW-eMxmFogP3R4taAJ1yXOSqjKsR7oXtD0Y,2938
-pipecat/processors/user_idle_processor.py,sha256=mGYv6UYxU7Qbgg4pTuGxDmZxnlyEtwMWaXtrQ9_fvaY,7969
+pipecat/processors/user_idle_processor.py,sha256=qRBDzbXBQp07qV7Uh_p4-349BE2Un6hg2iqIAmNGcT0,8562
 pipecat/processors/aggregators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/processors/aggregators/dtmf_aggregator.py,sha256=nngjLiaOtcZtuCNpYPyfUVLtUBUC6USuvS0tSdA9_zo,5054
 pipecat/processors/aggregators/gated.py,sha256=tii0sRrBkRW6y9Xq5iTWPnqlOEejU4VqPIPtdOa61pc,3073
@@ -129,7 +129,7 @@ pipecat/processors/filters/frame_filter.py,sha256=ZPtHToASfbbtwAdrnQH8POKIvT8hF0
 pipecat/processors/filters/function_filter.py,sha256=QNQZBIe1gzSPNI_4Zg2fgyeUhX-AmbIMp7r_XWNhwjU,2400
 pipecat/processors/filters/identity_filter.py,sha256=YNQWNNnuHivNwJa71Gc7A6ZHHq5Zw_kvuNrq9LUKK44,1418
 pipecat/processors/filters/null_filter.py,sha256=CourFfNXyhaesksiBuXxv5-mFSDpy6e9bOJ04p3iK40,1467
-pipecat/processors/filters/stt_mute_filter.py,sha256=BP1PX2Ka80ZZV1Mpp4OH9xA3V6cntsseQ7VUnXREWnw,9356
+pipecat/processors/filters/stt_mute_filter.py,sha256=a9Pgp-z1pNQtDIKBtzdP4yFLf-3EhAoQAd0XSXWLpsQ,10147
 pipecat/processors/filters/wake_check_filter.py,sha256=EKOuw_DCK4EWJ794xS8Xza-QQImD-pjgWYp0wdyvHjI,5099
 pipecat/processors/filters/wake_notifier_filter.py,sha256=1yV3Tw8OROCS97nuZNs4igcNvRQyYu1RG2gNvYMWxKc,2077
 pipecat/processors/frameworks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -290,7 +290,7 @@ pipecat/services/sambanova/__init__.py,sha256=oTXExLic-qTcsfsiWmssf3Elclf3IIWoN4
 pipecat/services/sambanova/llm.py,sha256=5XVfPLEk__W8ykFqLdV95ZUhlGGkAaJwmbciLdZYtTc,8976
 pipecat/services/sambanova/stt.py,sha256=ZZgEZ7WQjLFHbCko-3LNTtVajjtfUvbtVLtFcaNadVQ,2536
 pipecat/services/sarvam/__init__.py,sha256=B4TN_tTHV9fWg0aSoPvfQlXISA0nJaQ9-u08I9UWvH4,280
-pipecat/services/sarvam/stt.py,sha256=cSrQaDpixNQh4tl8r2xRNREHjKKcyLmrFDLa-Lp4Hl4,15465
+pipecat/services/sarvam/stt.py,sha256=p9Iq4loMwnftNZ_S0WoFSoX7iBbRKyja6RsVWbpj508,19314
 pipecat/services/sarvam/tts.py,sha256=K-AtWE1Q0ZZwshLP-7sCDmOSIWhuKOj91BCCE4N9XAk,25010
 pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
 pipecat/services/simli/video.py,sha256=fVMYsCE5epH9rTdhN_tyPPJw7W6TCMHCOe2akKHWduw,8330
@@ -378,7 +378,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.82.dev68.dist-info/METADATA,sha256=tRV7JwvNl-emWJwrua577U-gfTxxMtB2RY_ZeI4Qpro,32692
-dv_pipecat_ai-0.0.82.dev68.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.82.dev68.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.82.dev68.dist-info/RECORD,,
+dv_pipecat_ai-0.0.82.dev70.dist-info/METADATA,sha256=YeEWgQg0UE5-naruvtBkTnRuW-3TemsWbGjDsSz-zl4,32692
+dv_pipecat_ai-0.0.82.dev70.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.82.dev70.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.82.dev70.dist-info/RECORD,,
pipecat/frames/frames.py CHANGED
@@ -1306,6 +1306,20 @@ class SpeechControlParamsFrame(SystemFrame):
     turn_params: Optional[SmartTurnParams] = None
 
 
+@dataclass
+class StartDTMFCaptureFrame(SystemFrame):
+    """System frame indicating the bot is actively collecting DTMF input."""
+
+    pass
+
+
+@dataclass
+class EndDTMFCaptureFrame(SystemFrame):
+    """System frame indicating DTMF collection has finished."""
+
+    pass
+
+
 #
 # Control frames
 #
@@ -1476,7 +1490,7 @@ class STTUpdateSettingsFrame(ServiceUpdateSettingsFrame):
 @dataclass
 class DTMFUpdateSettingsFrame(ServiceUpdateSettingsFrame):
     """Frame for updating DTMF aggregator settings.
-
+
     Updates DTMF configuration dynamically during conversation flow.
     Settings can include: timeout, digits, end, reset parameters.
    """
pipecat/processors/dtmf_aggregator.py CHANGED
@@ -4,15 +4,13 @@ from pipecat.frames.frames import (
     BotSpeakingFrame,
     CancelFrame,
     DTMFUpdateSettingsFrame,
+    EndDTMFCaptureFrame,
     EndFrame,
     Frame,
     InputDTMFFrame,
+    StartDTMFCaptureFrame,
     StartInterruptionFrame,
-    StartUserIdleProcessorFrame,
-    StopUserIdleProcessorFrame,
     TranscriptionFrame,
-    UserStartedSpeakingFrame,
-    UserStoppedSpeakingFrame,
     WaitForDTMFFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
@@ -44,95 +42,78 @@ class DTMFAggregator(FrameProcessor):
         self._idle_timeout = timeout
         self._digits = digits
         self._digit_event = asyncio.Event()
-        self._digit_aggregate_task = None
+        self._aggregation_task = None
         self._end_on = end_on if end_on else set()
         self._reset_on = reset_on if reset_on else set()
-        self._stopped_idle_processor = False
-
-    async def _start_idle_processor(self):
-        await self.push_frame(StartUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = False
-
-    async def _stop_idle_processor(self):
-        await self.push_frame(StopUserIdleProcessorFrame(), FrameDirection.UPSTREAM)
-        self._stopped_idle_processor = True
+        self._dtmf_capture_active = False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection) -> None:
         # Handle DTMF frames.
         await super().process_frame(frame, direction)
-        await self.push_frame(frame, direction)
-        if isinstance(frame, InputDTMFFrame):
-            # Start the digit aggregation task if it's not running yet.
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(self._digit_agg_handler(direction))
-
-            # Append the incoming digit.
-            if frame.button.value in self._reset_on:
-                self._aggregation = ""
-            elif frame.button.value in self._end_on:
-                await self.flush_aggregation(direction)
-                self._aggregation = ""
-            else:
-                self._digit_event.set()
-                self._aggregation += frame.button.value
-
-                # Flush if the aggregated digits reach the specified length.
-                if self._digits and len(self._aggregation) == self._digits:
-                    await self.flush_aggregation(direction)
-                    self._aggregation = ""
-                    if self._stopped_idle_processor:
-                        await self._start_idle_processor()
 
+        if isinstance(frame, InputDTMFFrame):
+            # Push the DTMF frame downstream first
+            await self.push_frame(frame, direction)
+            # Then handle it for proper frame ordering
+            await self._handle_dtmf_frame(frame)
         elif isinstance(frame, (EndFrame, CancelFrame)):
             # For EndFrame, flush any pending aggregation and stop the digit aggregation task.
             if self._aggregation:
-                await self.flush_aggregation(direction)
-            if self._digit_aggregate_task:
-                await self._stop_digit_aggregate_task()
+                await self.flush_aggregation()
+            if self._aggregation_task:
+                await self._stop_aggregation_task()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, WaitForDTMFFrame):
             self.logger.debug("Received WaitForDTMFFrame: Waiting for DTMF input")
-            if self._digit_aggregate_task is None:
-                self._digit_aggregate_task = self.create_task(
-                    self._digit_agg_handler(direction, raise_timeout=True)
-                )
-                self._digit_event.set()
-            await self._stop_idle_processor()
+            self._create_aggregation_task(raise_timeout=True)
+            self._digit_event.set()  # Trigger the timeout handler
+            await self._start_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, StartInterruptionFrame):
-            self.logger.debug("Received StartInterruptionFrame: Starting idle processor")
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            self.logger.debug("Received StartInterruptionFrame")
             if self._aggregation:
-                await self.flush_aggregation(direction)
+                await self.flush_aggregation()
+            await self._end_dtmf_capture()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, BotSpeakingFrame):
-            if self._digit_aggregate_task is not None:
+            # Signal the aggregation task to continue when bot speaks
+            if self._aggregation_task is not None:
                 self._digit_event.set()
+            await self.push_frame(frame, direction)
         elif isinstance(frame, DTMFUpdateSettingsFrame):
            await self._update_settings(frame.settings)
             # Don't pass the settings frame downstream
+        else:
+            # Pass all other frames through
+            await self.push_frame(frame, direction)
 
     async def _update_settings(self, settings: dict) -> None:
         """Update DTMF aggregator settings dynamically.
-
+
         Args:
             settings: Dictionary containing new DTMF settings
                 Supported keys: timeout, digits, end, reset
         """
         settings_changed = False
-
+
         if "timeout" in settings and settings["timeout"] is not None:
             new_timeout = float(settings["timeout"])
             if new_timeout != self._idle_timeout:
-                self.logger.debug(f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}")
+                self.logger.debug(
+                    f"Updating DTMF timeout from {self._idle_timeout} to {new_timeout}"
+                )
                 self._idle_timeout = new_timeout
                 settings_changed = True
-
+
         if "digits" in settings:
             new_digits = settings["digits"]
             if new_digits != self._digits:
-                self.logger.debug(f"Updating DTMF digits from {self._digits} to {new_digits}")
+                self.logger.debug(
+                    f"Updating DTMF digits from {self._digits} to {new_digits}"
+                )
                 self._digits = new_digits
                 settings_changed = True
-
+
         if "end" in settings:
             # Convert single string to set if needed
             end_value = settings["end"]
@@ -142,12 +123,14 @@
                 new_end_on = {end_value} if end_value else set()
             else:
                 new_end_on = set(end_value)
-
+
             if new_end_on != self._end_on:
-                self.logger.debug(f"Updating DTMF end_on from {self._end_on} to {new_end_on}")
+                self.logger.debug(
+                    f"Updating DTMF end_on from {self._end_on} to {new_end_on}"
+                )
                 self._end_on = new_end_on
                 settings_changed = True
-
+
         if "reset" in settings:
             # Convert single string to set if needed
             reset_value = settings["reset"]
@@ -157,58 +140,116 @@
                 new_reset_on = {reset_value} if reset_value else set()
             else:
                 new_reset_on = set(reset_value)
-
+
             if new_reset_on != self._reset_on:
-                self.logger.debug(f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}")
+                self.logger.debug(
+                    f"Updating DTMF reset_on from {self._reset_on} to {new_reset_on}"
+                )
                 self._reset_on = new_reset_on
                 settings_changed = True
-
+
         if settings_changed:
             self.logger.info(f"DTMF settings updated successfully")
 
-    async def _digit_agg_handler(self, direction: FrameDirection, raise_timeout=False):
-        """Idle task that waits for new DTMF activity. If no new digit is received within
-        the timeout period, the current aggregation is flushed.
-        """
+    async def _handle_dtmf_frame(self, frame: InputDTMFFrame):
+        """Handle DTMF input frame processing."""
+        # Create aggregation task if needed
+        if self._aggregation_task is None:
+            self._create_aggregation_task()
+
+        digit_value = frame.button.value
+
+        # Handle reset digits
+        if digit_value in self._reset_on:
+            self._aggregation = ""
+            return
+
+        # Handle end digits
+        if digit_value in self._end_on:
+            if self._aggregation:  # Only flush if we have aggregation
+                await self.flush_aggregation()
+            return
+
+        # Add digit to aggregation
+        self._aggregation += digit_value
+
+        # Signal the aggregation task that a digit was received
+        self._digit_event.set()
+
+        # Check if we reached the digit limit
+        if self._digits and len(self._aggregation) == self._digits:
+            await self.flush_aggregation()
+
+    def _create_aggregation_task(self, raise_timeout: bool = False) -> None:
+        """Creates the aggregation task if it hasn't been created yet."""
+        if not self._aggregation_task:
+            self._aggregation_task = self.create_task(
+                self._aggregation_task_handler(raise_timeout)
+            )
+
+    async def _stop_aggregation_task(self) -> None:
+        """Stops the aggregation task."""
+        if self._aggregation_task:
+            await self.cancel_task(self._aggregation_task)
+            self._aggregation_task = None
+
+    async def _aggregation_task_handler(self, raise_timeout=False):
+        """Background task that handles timeout-based flushing."""
         while True:
             try:
                 # Wait for a new digit signal with a timeout.
-                await asyncio.wait_for(self._digit_event.wait(), timeout=self._idle_timeout)
-            except asyncio.TimeoutError:
-                # No new digit arrived within the timeout period; flush aggregation if non-empty.
-                await self.flush_aggregation(direction, raise_timeout)
-            finally:
-                # Clear the event for the next cycle.
+                await asyncio.wait_for(
+                    self._digit_event.wait(), timeout=self._idle_timeout
+                )
                 self._digit_event.clear()
+            except asyncio.TimeoutError:
+                # No new digit arrived within the timeout period; flush if needed
+                await self.flush_aggregation(raise_timeout=raise_timeout)
 
-    async def flush_aggregation(self, direction: FrameDirection, raise_timeout=False):
+    async def flush_aggregation(self, *, raise_timeout: bool = False):
         """Flush the aggregated digits by emitting a TranscriptionFrame downstream."""
         if self._aggregation:
-            # Todo: Change to different frame type if we decide to handle it in llm processor separately.
+            # Create transcription frame
             aggregated_frame = TranscriptionFrame(
                 f"User inputted: {self._aggregation}.", "", time_now_iso8601()
             )
             aggregated_frame.metadata["push_aggregation"] = True
-            await self.push_frame(StartInterruptionFrame())
-            await self.push_frame(aggregated_frame, direction)
+
+            # Send interruption frame (as per original design)
+            await self.push_frame(StartInterruptionFrame(), FrameDirection.DOWNSTREAM)
+
+            # Push the transcription frame
+            await self.push_frame(aggregated_frame, FrameDirection.DOWNSTREAM)
+
+            # Reset state
             self._aggregation = ""
-        elif raise_timeout and self._stopped_idle_processor:
+            await self._end_dtmf_capture()
+
+        elif raise_timeout and not self._aggregation:
+            # Timeout with no aggregation (WaitForDTMFFrame case)
             transcript_frame = TranscriptionFrame(
                 "User didn't press any digits on the keyboard.", "", time_now_iso8601()
             )
             transcript_frame.metadata["push_aggregation"] = True
-            await self.push_frame(transcript_frame)
-            if self._stopped_idle_processor:
-                await self._start_idle_processor()
+            await self.push_frame(transcript_frame, FrameDirection.DOWNSTREAM)
+            await self._end_dtmf_capture()
+
+    async def _start_dtmf_capture(self):
+        """Signal the start of DTMF capture upstream."""
+        if self._dtmf_capture_active:
+            return
+        await self.push_frame(StartDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = True
 
-    async def _stop_digit_aggregate_task(self):
-        """Cancels the digit aggregation task if it exists."""
-        if self._digit_aggregate_task:
-            await self.cancel_task(self._digit_aggregate_task)
-            self._digit_aggregate_task = None
+    async def _end_dtmf_capture(self):
+        """Signal the end of DTMF capture upstream."""
+        if not self._dtmf_capture_active:
+            return
+        await self.push_frame(EndDTMFCaptureFrame(), FrameDirection.UPSTREAM)
+        self._dtmf_capture_active = False
 
     async def cleanup(self) -> None:
         """Cleans up resources, ensuring that the digit aggregation task is cancelled."""
         await super().cleanup()
-        if self._digit_aggregate_task:
-            await self._stop_digit_aggregate_task()
+        if self._aggregation_task:
+            await self._stop_aggregation_task()
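
Note: the rework replaces the Start/StopUserIdleProcessorFrame round-trips with the Start/EndDTMFCaptureFrame pair, but configuration is unchanged. A usage sketch (the constructor keyword names timeout, digits, end_on, and reset_on are inferred from the assignments above, so treat the exact signature as an assumption):

from pipecat.frames.frames import DTMFUpdateSettingsFrame
from pipecat.processors.dtmf_aggregator import DTMFAggregator

# Flush after 3 s of digit silence, or as soon as 4 digits arrive;
# '#' ends the entry early, '*' clears whatever was typed so far.
aggregator = DTMFAggregator(timeout=3.0, digits=4, end_on={"#"}, reset_on={"*"})

# Settings can be changed mid-call; the dict keys match _update_settings:
# timeout, digits, end, reset.
update_frame = DTMFUpdateSettingsFrame(settings={"timeout": 5.0, "digits": 6})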
pipecat/processors/filters/stt_mute_filter.py CHANGED
@@ -27,12 +27,14 @@ from pipecat.frames.frames import (
     InterimTranscriptionFrame,
     StartFrame,
     StartInterruptionFrame,
+    StartDTMFCaptureFrame,
     STTMuteFrame,
     TranscriptionFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
     VADUserStartedSpeakingFrame,
     VADUserStoppedSpeakingFrame,
+    EndDTMFCaptureFrame,
 )
 from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 
@@ -58,6 +60,7 @@ class STTMuteStrategy(Enum):
     FUNCTION_CALL = "function_call"
     ALWAYS = "always"
     CUSTOM = "custom"
+    DTMF_CAPTURE = "dtmf_capture"
 
 
 @dataclass
@@ -120,6 +123,7 @@ class STTMuteFilter(FrameProcessor):
         self._function_call_in_progress = False
         self._is_muted = False  # Initialize as unmuted, will set state on StartFrame if needed
         self._voicemail_detection_enabled = False  # Default to False
+        self._dtmf_capture_active = False
 
     @property
     def is_muted(self) -> bool:
@@ -165,6 +169,10 @@
                 if should_mute:
                     return True
 
+            case STTMuteStrategy.DTMF_CAPTURE:
+                if self._dtmf_capture_active:
+                    return True
+
         return False
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -205,7 +213,14 @@
             self._first_speech_handled = True
             should_mute = await self._should_mute()
             self.logger.debug(f"BotStoppedSpeaking: should mute={should_mute}")
+        elif isinstance(frame, StartDTMFCaptureFrame):
+            self._dtmf_capture_active = True
+            should_mute = await self._should_mute()
+        elif isinstance(frame, EndDTMFCaptureFrame):
+            self._dtmf_capture_active = False
+            should_mute = await self._should_mute()
         elif isinstance(frame, STTMuteFrame):
+            # TODO: Frame duplication is happening here: we receive this frame from downstream and push it downstream again, and we also push it upstream twice via _handle_mute_state.
             should_mute = frame.mute
 
         # Then push the original frame
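
Note: with the new strategy, muting during digit entry becomes declarative. A configuration sketch (STTMuteConfig and its strategies field follow the upstream pipecat API and are assumed here, since only the enum appears in this hunk):

from pipecat.processors.filters.stt_mute_filter import (
    STTMuteConfig,
    STTMuteFilter,
    STTMuteStrategy,
)

# Mute STT while the DTMFAggregator reports an active capture window,
# and keep the existing function-call muting behavior.
stt_mute = STTMuteFilter(
    config=STTMuteConfig(
        strategies={STTMuteStrategy.DTMF_CAPTURE, STTMuteStrategy.FUNCTION_CALL}
    )
)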
pipecat/processors/user_idle_processor.py CHANGED
@@ -15,17 +15,17 @@ from loguru import logger
 from pipecat.frames.frames import (
     BotSpeakingFrame,
     CancelFrame,
+    EndDTMFCaptureFrame,
     EndFrame,
     Frame,
     FunctionCallCancelFrame,
     FunctionCallInProgressFrame,
     FunctionCallResultFrame,
     InputDTMFFrame,
+    StartDTMFCaptureFrame,
+    StartFrame,
     StartUserIdleProcessorFrame,
     StopUserIdleProcessorFrame,
-    FunctionCallInProgressFrame,
-    FunctionCallResultFrame,
-    StartFrame,
     UserStartedSpeakingFrame,
     UserStoppedSpeakingFrame,
 )
@@ -83,6 +83,8 @@ class UserIdleProcessor(FrameProcessor):
         self._timeout = timeout
         self._retry_count = 0
         self._interrupted = False
+        self._function_call_active = False
+        self._dtmf_capture_active = False
         self._conversation_started = False
         self._idle_task = None
         self._idle_event = asyncio.Event()
@@ -180,10 +182,20 @@
             self._idle_event.set()
         elif isinstance(frame, FunctionCallInProgressFrame):
             # Function calls can take longer than the timeout, so we want to prevent idle callbacks
+            self._function_call_active = True
             self._interrupted = True
             self._idle_event.set()
         elif isinstance(frame, FunctionCallResultFrame):
-            self._interrupted = False
+            self._function_call_active = False
+            self._interrupted = self._dtmf_capture_active
+            self._idle_event.set()
+        elif isinstance(frame, StartDTMFCaptureFrame):
+            self._dtmf_capture_active = True
+            self._interrupted = True
+            self._idle_event.set()
+        elif isinstance(frame, EndDTMFCaptureFrame):
+            self._dtmf_capture_active = False
+            self._interrupted = self._function_call_active
             self._idle_event.set()
         elif isinstance(frame, StartUserIdleProcessorFrame):
             if not self._idle_task:
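
Note: the net effect is that idle callbacks stay suppressed while either a function call or a DTMF capture is in flight, and resume only once both have ended. Usage is unchanged; a sketch using the retry-style callback from upstream pipecat:

from pipecat.processors.user_idle_processor import UserIdleProcessor

async def handle_user_idle(processor: UserIdleProcessor, retry_count: int) -> bool:
    # Re-prompt twice, then give up (False stops further idle callbacks).
    return retry_count <= 2

# No idle callback fires between StartDTMFCaptureFrame and EndDTMFCaptureFrame,
# however long the caller takes to dial.
user_idle = UserIdleProcessor(callback=handle_user_idle, timeout=8.0)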
pipecat/services/sarvam/stt.py CHANGED
@@ -31,6 +31,9 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 try:
     import websockets
     from sarvamai import AsyncSarvamAI
+    from sarvamai.speech_to_text_streaming.socket_client import (
+        AsyncSpeechToTextStreamingSocketClient,
+    )
     from sarvamai.speech_to_text_translate_streaming.socket_client import (
         AsyncSpeechToTextTranslateStreamingSocketClient,
     )
@@ -41,11 +44,11 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
-def language_to_sarvam_language(language: Language) -> str:
-    """Convert Language enum to Sarvam language code.
+def language_to_sarvam_language(language) -> str:
+    """Convert Language enum or string to Sarvam language code.
 
     Args:
-        language: The Language enum to convert.
+        language: The Language enum or language code string to convert.
 
     Returns:
         The corresponding Sarvam language code string.
@@ -53,6 +56,30 @@ def language_to_sarvam_language(language: Language) -> str:
     Raises:
         ValueError: If the language is not supported by Sarvam.
     """
+    # If already a string in the right format, return it
+    if isinstance(language, str):
+        if "-" in language:  # Already in format like "hi-IN"
+            return language
+        # Convert short codes to full format
+        lang_map = {
+            "hi": "hi-IN",
+            "bn": "bn-IN",
+            "gu": "gu-IN",
+            "kn": "kn-IN",
+            "ml": "ml-IN",
+            "mr": "mr-IN",
+            "ta": "ta-IN",
+            "te": "te-IN",
+            "pa": "pa-IN",
+            "or": "od-IN",
+            "as": "as-IN",
+            "en": "en-IN",
+        }
+        if language.lower() in lang_map:
+            return lang_map[language.lower()]
+        raise ValueError(f"Unsupported language string: {language}")
+
+    # Handle Language enum
     match language:
         case Language.BN_IN:
             return "bn-IN"
@@ -133,6 +160,13 @@ class SarvamSTTService(STTService):
     """Sarvam speech-to-text service.
 
     Provides real-time speech recognition using Sarvam's WebSocket API.
+    Supports both Saarika (transcription) and Saaras (translation) models.
+
+    Models:
+    - Saarika (saarika:v2.5): Transcription in a single language
+    - Saaras (saaras:v2.5): Translation from source language to target language
+
+    The service automatically selects the correct endpoint based on the model name.
     """
 
     def __init__(
@@ -253,6 +287,7 @@
         # Convert audio bytes to base64 for Sarvam API
         audio_base64 = base64.b64encode(audio).decode("utf-8")
 
+        # Sarvam requires 'audio/wav' encoding (even for raw PCM data)
         message = {
             "audio": {
                 "data": audio_base64,
@@ -273,33 +308,47 @@
     async def _connect(self):
         """Connect to Sarvam WebSocket API directly."""
-        logger.debug("Connecting to Sarvam")
+        logger.debug(f"Connecting to Sarvam with model: {self._model}")
 
         try:
-            # Build WebSocket URL and headers manually
-            ws_url = (
-                self._client._client_wrapper.get_environment().production
-                + "/speech-to-text-translate/ws"
-            )
+            base_url = self._client._client_wrapper.get_environment().production
+
+            # Choose endpoint and socket class based on model
+            if self._model.startswith("saarika"):
+                # Saarika = Transcription endpoint
+                path = "/speech-to-text/ws"
+                query_params = {
+                    "language-code": language_to_sarvam_language(self._language),
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextStreamingSocketClient
+                logger.debug(
+                    f"Using Saarika transcription endpoint with language: {self._language}"
+                )
+            else:
+                # Saaras = Translation endpoint
+                path = "/speech-to-text-translate/ws"
+                query_params = {
+                    "model": self._model,
+                    "vad_signals": "true",
+                }
+                socket_cls = AsyncSpeechToTextTranslateStreamingSocketClient
+                logger.debug("Using Saaras translation endpoint")
 
-            # Add query parameters
-            query_params = {"model": self._model, "vad_signals": "true"}
-            query_string = urlencode(query_params)
-            ws_url = ws_url + f"?{query_string}"
+            ws_url = f"{base_url}{path}?{urlencode(query_params)}"
 
             # Get headers
             headers = self._client._client_wrapper.get_headers()
             headers["Api-Subscription-Key"] = self._api_key
 
-            # Connect to WebSocket directly
+            # Connect to WebSocket
             self._websocket_connection = await websockets.connect(
                 ws_url, additional_headers=headers
             )
 
             # Create the socket client wrapper
-            self._websocket = AsyncSpeechToTextTranslateStreamingSocketClient(
-                websocket=self._websocket_connection
-            )
+            self._websocket = socket_cls(websocket=self._websocket_connection)
 
             # Start listening for messages
             self._listening_task = asyncio.create_task(self._listen_for_messages())
@@ -309,7 +358,10 @@
         except websockets.exceptions.InvalidStatusCode as e:
             error_msg = f"Failed to connect to Sarvam: HTTP {e.status_code}"
             if e.status_code == 403:
-                error_msg += f" - Access denied. Your API key may not have access to model '{self._model}'. Available models: saaras:v2, saaras:v2.5"
+                if self._model.startswith("saarika"):
+                    error_msg += f" - Access denied. Check: 1) API key has Saarika access, 2) Model '{self._model}' exists (try saarika:v2.5), 3) Using correct endpoint (transcription)"
+                else:
+                    error_msg += f" - Access denied. Check: 1) API key has Saaras access, 2) Model '{self._model}' exists (try saaras:v2.5), 3) Using correct endpoint (translation)"
             elif e.status_code == 401:
                 error_msg += " - Invalid API key"
             logger.error(error_msg)
@@ -370,21 +422,60 @@
     async def _handle_response(self, response):
         """Handle transcription response from Sarvam.
 
+        Handles both Saarika (transcription) and Saaras (translation) message formats.
+
         Args:
             response: The response object from Sarvam WebSocket.
         """
         logger.debug(f"Received response: {response}")
 
         try:
-            if response["type"] == "error":
+            msg_type = response.get("type")
+
+            # Error handling
+            if msg_type == "error":
                 error_msg = response.get("data", {}).get("message", "Unknown error")
                 logger.error(f"Sarvam API error: {error_msg}")
                 await self.push_error(ErrorFrame(f"Sarvam API error: {error_msg}"))
-                # Close connection on error
                 await self._disconnect()
                 return
 
-            if response["type"] == "events":
+            # Modern Saarika/Saaras message format
+            if msg_type == "speech_start":
+                await self.start_metrics()
+                logger.debug("User started speaking")
+                await self._call_event_handler("on_speech_started")
+                return
+
+            if msg_type == "speech_end":
+                logger.debug("User stopped speaking")
+                await self._call_event_handler("on_speech_ended")
+                return
+
+            if msg_type == "transcript":
+                await self.stop_ttfb_metrics()
+                # Handle both Saarika (text) and Saaras (text + text_translated)
+                transcript = response.get("text") or response.get("text_translated") or ""
+                language_code = (
+                    response.get("source_language_code") or response.get("language_code") or "hi-IN"
+                )
+                language = self._map_language_code_to_enum(language_code)
+
+                if transcript.strip():
+                    await self.push_frame(
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                            result=response,
+                        )
+                    )
+                await self.stop_processing_metrics()
+                return
+
+            # Legacy format (backward compatibility)
+            if msg_type == "events":
                 parsed = EventResponse(**response)
                 signal = parsed.data.signal_type
                 timestamp = parsed.data.occured_at
@@ -397,14 +488,13 @@
                 elif signal == VADSignal.END:
                     logger.debug("User stopped speaking")
                     await self._call_event_handler("on_speech_ended")
+                return
 
-            elif response["type"] == "data":
+            if msg_type == "data":
                 await self.stop_ttfb_metrics()
                 parsed = TranscriptionResponse(**response)
                 transcript = parsed.data.transcript
-                language_code = parsed.data.language_code
-                if language_code is None:
-                    language_code = "hi-IN"
+                language_code = parsed.data.language_code or "hi-IN"
                 language = self._map_language_code_to_enum(language_code)
 
                 if transcript and transcript.strip():
@@ -417,8 +507,8 @@
                             result=response,
                         )
                     )
-
                 await self.stop_processing_metrics()
+                return
 
         except Exception as e:
             logger.error(f"Error handling Sarvam response: {e}")