dv_pipecat_ai-0.0.85.dev7-py3-none-any.whl → dv_pipecat_ai-0.0.85.dev10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


*.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dv-pipecat-ai
- Version: 0.0.85.dev7
+ Version: 0.0.85.dev10
  Summary: An open source framework for voice (and multimodal) assistants
  License-Expression: BSD-2-Clause
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -143,6 +143,9 @@ Requires-Dist: coremltools>=8.0; extra == "local-smart-turn"
  Requires-Dist: transformers; extra == "local-smart-turn"
  Requires-Dist: torch<3,>=2.5.0; extra == "local-smart-turn"
  Requires-Dist: torchaudio<3,>=2.5.0; extra == "local-smart-turn"
+ Provides-Extra: local-smart-turn-v3
+ Requires-Dist: transformers; extra == "local-smart-turn-v3"
+ Requires-Dist: onnxruntime<2,>=1.20.1; extra == "local-smart-turn-v3"
  Provides-Extra: remote-smart-turn
  Provides-Extra: silero
  Requires-Dist: onnxruntime~=1.20.1; extra == "silero"
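
The metadata hunk above adds a new optional extra, local-smart-turn-v3, which depends on transformers and a pinned onnxruntime rather than the torch/torchaudio stack used by the existing local-smart-turn extra. Assuming standard pip extras syntax, it would be installed with:

    pip install "dv-pipecat-ai[local-smart-turn-v3]"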
*.dist-info/RECORD
@@ -1,4 +1,4 @@
- dv_pipecat_ai-0.0.85.dev7.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+ dv_pipecat_ai-0.0.85.dev10.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,14 +50,17 @@ pipecat/audio/resamplers/resampy_resampler.py,sha256=fEZv6opn_9j50xYEOdwQiZOJQ_J
  pipecat/audio/resamplers/soxr_resampler.py,sha256=CXze7zf_ExlCcgcZp0oArRSbZ9zFpBzsCt2EQ_woKfM,1747
  pipecat/audio/resamplers/soxr_stream_resampler.py,sha256=lHk1__M1HDGf25abpffuWEyqbd0ckNfyADDV_WmTPcY,3665
  pipecat/audio/turn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- pipecat/audio/turn/base_turn_analyzer.py,sha256=hLOcH1WkP9iSk84boQv94RFYKEfEX-IHfO1y9pjkDzs,3213
+ pipecat/audio/turn/base_turn_analyzer.py,sha256=UoZ61yto2wecXU6nXk2yjdcgM7jGyfMR5ZfrunOFpOA,3359
  pipecat/audio/turn/smart_turn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- pipecat/audio/turn/smart_turn/base_smart_turn.py,sha256=HgUoRfo9tbXVMfmRbYBkm4FDY1AjUJ3CRe7t48Ny2WI,9672
+ pipecat/audio/turn/smart_turn/base_smart_turn.py,sha256=gE5jrqrU0gQcgjTOvpUbb6LWAhfk8VKZQ-5pyEIZH4E,10037
  pipecat/audio/turn/smart_turn/fal_smart_turn.py,sha256=neahuTAY9SUQjacRYd19BERiuSHIMSpqzZ9uae_ZlWA,1606
- pipecat/audio/turn/smart_turn/http_smart_turn.py,sha256=s5QP2gd0BqQAlbRJ7hGuCwGqgEENfyRm6aB6jBgDoqE,4642
+ pipecat/audio/turn/smart_turn/http_smart_turn.py,sha256=HlHpdVbk-1g_AU3qAAy7Xob8M2V3FUqtr38UAk1F1Dw,4783
  pipecat/audio/turn/smart_turn/local_coreml_smart_turn.py,sha256=50kiBeZhnq7FZWZnzdSX8KUmhhQtkme0KH2rbiAJbCU,3140
- pipecat/audio/turn/smart_turn/local_smart_turn.py,sha256=KVodqUTu8onfmfeOywgH98vBCNvBb-B3pvsQlTKyP_4,3570
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py,sha256=aYLMDURpmYycQgKsxbNEENtUe5oujeQ9H3Lbi0GYmZA,7160
+ pipecat/audio/turn/smart_turn/local_smart_turn.py,sha256=0z2M_MC9xIcelm4d9XqZwzJMe2FM-zOjgnHDAeoMw0g,3564
+ pipecat/audio/turn/smart_turn/local_smart_turn_v2.py,sha256=hd_nhEdaxwJ2_G6F2RJru9mC8vyzkmku2YqmtULl7NM,7154
+ pipecat/audio/turn/smart_turn/local_smart_turn_v3.py,sha256=x1q437Mp8cEU1S-7W869i1meDtCdjrjPTUCjbSLDVgQ,4649
+ pipecat/audio/turn/smart_turn/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx,sha256=B6Ezq6MeLQtSPxf4wuTmXv5tj2he_RLKT-Iev055iZE,8757193
  pipecat/audio/vad/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/audio/vad/silero.py,sha256=Cz4_hJjaBKbmUwZVbqMzED8orHOCsnF3zpERgBTw1Rw,7906
  pipecat/audio/vad/vad_analyzer.py,sha256=XkZLEe4z7Ja0lGoYZst1HNYqt5qOwG-vjsk_w8chiNA,7430
@@ -378,7 +381,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
- dv_pipecat_ai-0.0.85.dev7.dist-info/METADATA,sha256=VKliyHrf__BVRdCbvDCp5lJRyqMFTxC3-EEuhwPphe0,32691
- dv_pipecat_ai-0.0.85.dev7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dv_pipecat_ai-0.0.85.dev7.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
- dv_pipecat_ai-0.0.85.dev7.dist-info/RECORD,,
+ dv_pipecat_ai-0.0.85.dev10.dist-info/METADATA,sha256=ezbvZ9D9Q9E1aVPhwoNcHu02GKAveWpHvFp0lgahMVc,32858
+ dv_pipecat_ai-0.0.85.dev10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dv_pipecat_ai-0.0.85.dev10.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+ dv_pipecat_ai-0.0.85.dev10.dist-info/RECORD,,
pipecat/audio/turn/base_turn_analyzer.py
@@ -14,6 +14,8 @@ from abc import ABC, abstractmethod
  from enum import Enum
  from typing import Optional, Tuple

+ from pydantic import BaseModel
+
  from pipecat.metrics.metrics import MetricsData


@@ -29,6 +31,12 @@ class EndOfTurnState(Enum):
      INCOMPLETE = 2


+ class BaseTurnParams(BaseModel):
+     """Base class for turn analyzer parameters."""
+
+     pass
+
+
  class BaseTurnAnalyzer(ABC):
      """Abstract base class for analyzing user end of turn.

@@ -78,7 +86,7 @@ class BaseTurnAnalyzer(ABC):

      @property
      @abstractmethod
-     def params(self):
+     def params(self) -> BaseTurnParams:
          """Get the current turn analyzer parameters.

          Returns:
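
The hunk above introduces BaseTurnParams so the abstract params property can declare a concrete pydantic return type. A minimal sketch of a conforming subclass; MyTurnParams, MyTurnAnalyzer, and the stop_secs field are hypothetical illustrations, not names from the package:

    from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams

    class MyTurnParams(BaseTurnParams):
        stop_secs: float = 3.0  # hypothetical tunable

    class MyTurnAnalyzer(BaseTurnAnalyzer):
        # BaseTurnAnalyzer defines further abstract members, omitted here.
        _params = MyTurnParams()

        @property
        def params(self) -> BaseTurnParams:
            return self._params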
pipecat/audio/turn/smart_turn/base_smart_turn.py
@@ -11,15 +11,17 @@ machine learning models to determine when a user has finished speaking, going
  beyond simple silence-based detection.
  """

+ import asyncio
  import time
  from abc import abstractmethod
+ from concurrent.futures import ThreadPoolExecutor
  from typing import Any, Dict, Optional, Tuple

  import numpy as np
  from loguru import logger
  from pydantic import BaseModel

- from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
+ from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
  from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData

  # Default timing parameters
@@ -29,7 +31,7 @@ MAX_DURATION_SECONDS = 8 # Max allowed segment duration
  USE_ONLY_LAST_VAD_SEGMENT = True


- class SmartTurnParams(BaseModel):
+ class SmartTurnParams(BaseTurnParams):
      """Configuration parameters for smart turn analysis.

      Parameters:
@@ -77,6 +79,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          self._speech_triggered = False
          self._silence_ms = 0
          self._speech_start_time = 0
+         # Thread executor that will run the model. We only need one thread per
+         # analyzer because one analyzer just handles one audio stream.
+         self._executor = ThreadPoolExecutor(max_workers=1)

      @property
      def speech_triggered(self) -> bool:
@@ -151,7 +156,10 @@ class BaseSmartTurn(BaseTurnAnalyzer):
              Tuple containing the end-of-turn state and optional metrics data
              from the ML model analysis.
          """
-         state, result = await self._process_speech_segment(self._audio_buffer)
+         loop = asyncio.get_running_loop()
+         state, result = await loop.run_in_executor(
+             self._executor, self._process_speech_segment, self._audio_buffer
+         )
          if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
              self._clear(state)
          logger.debug(f"End of Turn result: {state}")
@@ -169,9 +177,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          self._speech_start_time = 0
          self._silence_ms = 0

-     async def _process_speech_segment(
-         self, audio_buffer
-     ) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
+     def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
          """Process accumulated audio segment using ML model."""
          state = EndOfTurnState.INCOMPLETE

@@ -203,7 +209,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          if len(segment_audio) > 0:
              start_time = time.perf_counter()
              try:
-                 result = await self._predict_endpoint(segment_audio)
+                 result = self._predict_endpoint(segment_audio)
                  state = (
                      EndOfTurnState.COMPLETE
                      if result["prediction"] == 1
@@ -249,6 +255,6 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          return state, result_data

      @abstractmethod
-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using ML model from audio data."""
          pass
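
Taken together, the base_smart_turn.py hunks above change the execution model: _process_speech_segment and _predict_endpoint become synchronous, and analyze_end_of_turn dispatches them to the analyzer's single-worker ThreadPoolExecutor, keeping blocking model inference off the event loop. A self-contained sketch of that pattern, with illustrative names only:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=1)

    def predict(audio: bytes) -> dict:
        # Stand-in for blocking ML inference.
        return {"prediction": 1, "probability": 0.9}

    async def analyze(audio: bytes) -> dict:
        loop = asyncio.get_running_loop()
        # The blocking call runs on the worker thread; the loop stays responsive.
        return await loop.run_in_executor(executor, predict, audio)

    print(asyncio.run(analyze(b"\x00\x00")))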
pipecat/audio/turn/smart_turn/http_smart_turn.py
@@ -104,11 +104,15 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
              logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
              raise Exception("Failed to send raw request to Daily Smart Turn.")

-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using remote HTTP ML service."""
          try:
              serialized_array = self._serialize_array(audio_array)
-             return await self._send_raw_request(serialized_array)
+             loop = asyncio.get_running_loop()
+             future = asyncio.run_coroutine_threadsafe(
+                 self._send_raw_request(serialized_array), loop
+             )
+             return future.result()
          except Exception as e:
              logger.error(f"Smart turn prediction failed: {str(e)}")
              # Return an incomplete prediction when a failure occurs
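
Because _predict_endpoint now executes on the worker thread, the HTTP analyzer cannot simply await its _send_raw_request coroutine; instead it schedules the coroutine back onto the event loop with asyncio.run_coroutine_threadsafe and blocks on future.result(). A self-contained sketch of that handoff (illustrative names; note that asyncio.get_running_loop() only succeeds on the loop's own thread, so this sketch captures the loop there before dispatching):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    async def fetch(data: bytes) -> dict:
        await asyncio.sleep(0.1)  # stands in for the HTTP round trip
        return {"prediction": 1, "probability": 0.9}

    def predict_sync(data: bytes, loop: asyncio.AbstractEventLoop) -> dict:
        # Runs on the worker thread: schedule the coroutine on the main loop
        # and block this thread until it finishes.
        future = asyncio.run_coroutine_threadsafe(fetch(data), loop)
        return future.result()

    async def main() -> dict:
        loop = asyncio.get_running_loop()  # captured on the loop's thread
        with ThreadPoolExecutor(max_workers=1) as executor:
            return await loop.run_in_executor(executor, predict_sync, b"\x00", loop)

    print(asyncio.run(main()))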
pipecat/audio/turn/smart_turn/local_smart_turn.py
@@ -64,7 +64,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
          self._turn_model.eval()
          logger.debug("Loaded Local Smart Turn")

-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using local PyTorch model."""
          inputs = self._turn_processor(
              audio_array,
pipecat/audio/turn/smart_turn/local_smart_turn_v2.py
@@ -73,7 +73,7 @@ class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
          self._turn_model.eval()
          logger.debug("Loaded Local Smart Turn v2")

-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using local PyTorch model."""
          inputs = self._turn_processor(
              audio_array,
pipecat/audio/turn/smart_turn/local_smart_turn_v3.py (new file)
@@ -0,0 +1,124 @@
+ #
+ # Copyright (c) 2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #
+
+ """Local turn analyzer for on-device ML inference using the smart-turn-v3 model.
+
+ This module provides a smart turn analyzer that uses an ONNX model for
+ local end-of-turn detection without requiring network connectivity.
+ """
+
+ from typing import Any, Dict, Optional
+
+ import numpy as np
+ from loguru import logger
+
+ from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
+
+ try:
+     import onnxruntime as ort
+     from transformers import WhisperFeatureExtractor
+ except ModuleNotFoundError as e:
+     logger.error(f"Exception: {e}")
+     logger.error(
+         "In order to use LocalSmartTurnAnalyzerV3, you need to `pip install pipecat-ai[local-smart-turn-v3]`."
+     )
+     raise Exception(f"Missing module: {e}")
+
+
+ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
+     """Local turn analyzer using the smart-turn-v3 ONNX model.
+
+     Provides end-of-turn detection using locally-stored ONNX model,
+     enabling offline operation without network dependencies.
+     """
+
+     def __init__(self, *, smart_turn_model_path: Optional[str] = None, **kwargs):
+         """Initialize the local ONNX smart-turn-v3 analyzer.
+
+         Args:
+             smart_turn_model_path: Path to the ONNX model file. If this is not
+                 set, the bundled smart-turn-v3.0 model will be used.
+             **kwargs: Additional arguments passed to BaseSmartTurn.
+         """
+         super().__init__(**kwargs)
+
+         logger.debug("Loading Local Smart Turn v3 model...")
+
+         if not smart_turn_model_path:
+             # Load bundled model
+             model_name = "smart-turn-v3.0.onnx"
+             package_path = "pipecat.audio.turn.smart_turn.data"
+
+             try:
+                 import importlib_resources as impresources
+
+                 smart_turn_model_path = str(impresources.files(package_path).joinpath(model_name))
+             except BaseException:
+                 from importlib import resources as impresources
+
+                 try:
+                     with impresources.path(package_path, model_name) as f:
+                         smart_turn_model_path = f
+                 except BaseException:
+                     smart_turn_model_path = str(
+                         impresources.files(package_path).joinpath(model_name)
+                     )
+
+         so = ort.SessionOptions()
+         so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+         so.inter_op_num_threads = 1
+         so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+         self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)
+         self._session = ort.InferenceSession(smart_turn_model_path, sess_options=so)
+
+         logger.debug("Loaded Local Smart Turn v3")
+
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+         """Predict end-of-turn using local ONNX model."""
+
+         def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
+             """Truncate audio to last n seconds or pad with zeros to meet n seconds."""
+             max_samples = n_seconds * sample_rate
+             if len(audio_array) > max_samples:
+                 return audio_array[-max_samples:]
+             elif len(audio_array) < max_samples:
+                 # Pad with zeros at the beginning
+                 padding = max_samples - len(audio_array)
+                 return np.pad(audio_array, (padding, 0), mode="constant", constant_values=0)
+             return audio_array
+
+         # Truncate to 8 seconds (keeping the end) or pad to 8 seconds
+         audio_array = truncate_audio_to_last_n_seconds(audio_array, n_seconds=8)
+
+         # Process audio using Whisper's feature extractor
+         inputs = self._feature_extractor(
+             audio_array,
+             sampling_rate=16000,
+             return_tensors="np",
+             padding="max_length",
+             max_length=8 * 16000,
+             truncation=True,
+             do_normalize=True,
+         )
+
+         # Extract features and ensure correct shape for ONNX
+         input_features = inputs.input_features.squeeze(0).astype(np.float32)
+         input_features = np.expand_dims(input_features, axis=0)  # Add batch dimension
+
+         # Run ONNX inference
+         outputs = self._session.run(None, {"input_features": input_features})
+
+         # Extract probability (ONNX model returns sigmoid probabilities)
+         probability = outputs[0][0].item()
+
+         # Make prediction (1 for Complete, 0 for Incomplete)
+         prediction = 1 if probability > 0.5 else 0
+
+         return {
+             "prediction": prediction,
+             "probability": probability,
+         }
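
The new analyzer mirrors the construction pattern of the earlier local analyzers. A minimal usage sketch based on the constructor shown above; the surrounding pipeline wiring is assumed rather than taken from this release:

    from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3

    # Uses the bundled smart-turn-v3.0.onnx model when no path is given.
    analyzer = LocalSmartTurnAnalyzerV3()

    # Or point at an external copy of the ONNX model:
    analyzer = LocalSmartTurnAnalyzerV3(smart_turn_model_path="/path/to/smart-turn-v3.0.onnx")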