dv_pipecat_ai-0.0.85.dev7-py3-none-any.whl → dv_pipecat_ai-0.0.85.dev10-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


*.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: dv-pipecat-ai
- Version: 0.0.85.dev7
+ Version: 0.0.85.dev10
  Summary: An open source framework for voice (and multimodal) assistants
  License-Expression: BSD-2-Clause
  Project-URL: Source, https://github.com/pipecat-ai/pipecat
@@ -143,6 +143,9 @@ Requires-Dist: coremltools>=8.0; extra == "local-smart-turn"
  Requires-Dist: transformers; extra == "local-smart-turn"
  Requires-Dist: torch<3,>=2.5.0; extra == "local-smart-turn"
  Requires-Dist: torchaudio<3,>=2.5.0; extra == "local-smart-turn"
+ Provides-Extra: local-smart-turn-v3
+ Requires-Dist: transformers; extra == "local-smart-turn-v3"
+ Requires-Dist: onnxruntime<2,>=1.20.1; extra == "local-smart-turn-v3"
  Provides-Extra: remote-smart-turn
  Provides-Extra: silero
  Requires-Dist: onnxruntime~=1.20.1; extra == "silero"
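
The metadata hunk above adds a new optional extra, local-smart-turn-v3, which depends on transformers and a pinned onnxruntime rather than the torch/torchaudio stack used by the existing local-smart-turn extra. Assuming standard pip extras syntax, it would be installed with:

    pip install "dv-pipecat-ai[local-smart-turn-v3]"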
*.dist-info/RECORD
@@ -1,4 +1,4 @@
- dv_pipecat_ai-0.0.85.dev7.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+ dv_pipecat_ai-0.0.85.dev10.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
  pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
  pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -50,14 +50,17 @@ pipecat/audio/resamplers/resampy_resampler.py,sha256=fEZv6opn_9j50xYEOdwQiZOJQ_J
  pipecat/audio/resamplers/soxr_resampler.py,sha256=CXze7zf_ExlCcgcZp0oArRSbZ9zFpBzsCt2EQ_woKfM,1747
  pipecat/audio/resamplers/soxr_stream_resampler.py,sha256=lHk1__M1HDGf25abpffuWEyqbd0ckNfyADDV_WmTPcY,3665
  pipecat/audio/turn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- pipecat/audio/turn/base_turn_analyzer.py,sha256=hLOcH1WkP9iSk84boQv94RFYKEfEX-IHfO1y9pjkDzs,3213
+ pipecat/audio/turn/base_turn_analyzer.py,sha256=UoZ61yto2wecXU6nXk2yjdcgM7jGyfMR5ZfrunOFpOA,3359
  pipecat/audio/turn/smart_turn/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- pipecat/audio/turn/smart_turn/base_smart_turn.py,sha256=HgUoRfo9tbXVMfmRbYBkm4FDY1AjUJ3CRe7t48Ny2WI,9672
+ pipecat/audio/turn/smart_turn/base_smart_turn.py,sha256=gE5jrqrU0gQcgjTOvpUbb6LWAhfk8VKZQ-5pyEIZH4E,10037
  pipecat/audio/turn/smart_turn/fal_smart_turn.py,sha256=neahuTAY9SUQjacRYd19BERiuSHIMSpqzZ9uae_ZlWA,1606
- pipecat/audio/turn/smart_turn/http_smart_turn.py,sha256=s5QP2gd0BqQAlbRJ7hGuCwGqgEENfyRm6aB6jBgDoqE,4642
+ pipecat/audio/turn/smart_turn/http_smart_turn.py,sha256=HlHpdVbk-1g_AU3qAAy7Xob8M2V3FUqtr38UAk1F1Dw,4783
  pipecat/audio/turn/smart_turn/local_coreml_smart_turn.py,sha256=50kiBeZhnq7FZWZnzdSX8KUmhhQtkme0KH2rbiAJbCU,3140
- pipecat/audio/turn/smart_turn/local_smart_turn.py,sha256=KVodqUTu8onfmfeOywgH98vBCNvBb-B3pvsQlTKyP_4,3570
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py,sha256=aYLMDURpmYycQgKsxbNEENtUe5oujeQ9H3Lbi0GYmZA,7160
+ pipecat/audio/turn/smart_turn/local_smart_turn.py,sha256=0z2M_MC9xIcelm4d9XqZwzJMe2FM-zOjgnHDAeoMw0g,3564
+ pipecat/audio/turn/smart_turn/local_smart_turn_v2.py,sha256=hd_nhEdaxwJ2_G6F2RJru9mC8vyzkmku2YqmtULl7NM,7154
+ pipecat/audio/turn/smart_turn/local_smart_turn_v3.py,sha256=x1q437Mp8cEU1S-7W869i1meDtCdjrjPTUCjbSLDVgQ,4649
+ pipecat/audio/turn/smart_turn/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx,sha256=B6Ezq6MeLQtSPxf4wuTmXv5tj2he_RLKT-Iev055iZE,8757193
  pipecat/audio/vad/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  pipecat/audio/vad/silero.py,sha256=Cz4_hJjaBKbmUwZVbqMzED8orHOCsnF3zpERgBTw1Rw,7906
  pipecat/audio/vad/vad_analyzer.py,sha256=XkZLEe4z7Ja0lGoYZst1HNYqt5qOwG-vjsk_w8chiNA,7430
@@ -378,7 +381,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=HwDCqLGijhYD3F8nxDuQmEw-YkRw0
  pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
  pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
  pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
- dv_pipecat_ai-0.0.85.dev7.dist-info/METADATA,sha256=VKliyHrf__BVRdCbvDCp5lJRyqMFTxC3-EEuhwPphe0,32691
- dv_pipecat_ai-0.0.85.dev7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- dv_pipecat_ai-0.0.85.dev7.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
- dv_pipecat_ai-0.0.85.dev7.dist-info/RECORD,,
+ dv_pipecat_ai-0.0.85.dev10.dist-info/METADATA,sha256=ezbvZ9D9Q9E1aVPhwoNcHu02GKAveWpHvFp0lgahMVc,32858
+ dv_pipecat_ai-0.0.85.dev10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ dv_pipecat_ai-0.0.85.dev10.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+ dv_pipecat_ai-0.0.85.dev10.dist-info/RECORD,,
pipecat/audio/turn/base_turn_analyzer.py
@@ -14,6 +14,8 @@ from abc import ABC, abstractmethod
  from enum import Enum
  from typing import Optional, Tuple

+ from pydantic import BaseModel
+
  from pipecat.metrics.metrics import MetricsData


@@ -29,6 +31,12 @@ class EndOfTurnState(Enum):
      INCOMPLETE = 2


+ class BaseTurnParams(BaseModel):
+     """Base class for turn analyzer parameters."""
+
+     pass
+
+
  class BaseTurnAnalyzer(ABC):
      """Abstract base class for analyzing user end of turn.

@@ -78,7 +86,7 @@ class BaseTurnAnalyzer(ABC):

      @property
      @abstractmethod
-     def params(self):
+     def params(self) -> BaseTurnParams:
          """Get the current turn analyzer parameters.

          Returns:
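
The hunk above introduces BaseTurnParams so the abstract params property can declare a concrete pydantic return type. A minimal sketch of a conforming subclass; MyTurnParams, MyTurnAnalyzer, and the stop_secs field are hypothetical illustrations, not names from the package:

    from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams

    class MyTurnParams(BaseTurnParams):
        stop_secs: float = 3.0  # hypothetical tunable

    class MyTurnAnalyzer(BaseTurnAnalyzer):
        # BaseTurnAnalyzer defines further abstract members, omitted here.
        _params = MyTurnParams()

        @property
        def params(self) -> BaseTurnParams:
            return self._params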
pipecat/audio/turn/smart_turn/base_smart_turn.py
@@ -11,15 +11,17 @@ machine learning models to determine when a user has finished speaking, going
  beyond simple silence-based detection.
  """

+ import asyncio
  import time
  from abc import abstractmethod
+ from concurrent.futures import ThreadPoolExecutor
  from typing import Any, Dict, Optional, Tuple

  import numpy as np
  from loguru import logger
  from pydantic import BaseModel

- from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
+ from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
  from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData

  # Default timing parameters
@@ -29,7 +31,7 @@ MAX_DURATION_SECONDS = 8 # Max allowed segment duration
  USE_ONLY_LAST_VAD_SEGMENT = True


- class SmartTurnParams(BaseModel):
+ class SmartTurnParams(BaseTurnParams):
      """Configuration parameters for smart turn analysis.

      Parameters:
@@ -77,6 +79,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          self._speech_triggered = False
          self._silence_ms = 0
          self._speech_start_time = 0
+         # Thread executor that will run the model. We only need one thread per
+         # analyzer because one analyzer just handles one audio stream.
+         self._executor = ThreadPoolExecutor(max_workers=1)

      @property
      def speech_triggered(self) -> bool:
@@ -151,7 +156,10 @@ class BaseSmartTurn(BaseTurnAnalyzer):
              Tuple containing the end-of-turn state and optional metrics data
              from the ML model analysis.
          """
-         state, result = await self._process_speech_segment(self._audio_buffer)
+         loop = asyncio.get_running_loop()
+         state, result = await loop.run_in_executor(
+             self._executor, self._process_speech_segment, self._audio_buffer
+         )
          if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
              self._clear(state)
          logger.debug(f"End of Turn result: {state}")
@@ -169,9 +177,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          self._speech_start_time = 0
          self._silence_ms = 0

-     async def _process_speech_segment(
-         self, audio_buffer
-     ) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
+     def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
          """Process accumulated audio segment using ML model."""
          state = EndOfTurnState.INCOMPLETE

@@ -203,7 +209,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          if len(segment_audio) > 0:
              start_time = time.perf_counter()
              try:
-                 result = await self._predict_endpoint(segment_audio)
+                 result = self._predict_endpoint(segment_audio)
                  state = (
                      EndOfTurnState.COMPLETE
                      if result["prediction"] == 1
@@ -249,6 +255,6 @@ class BaseSmartTurn(BaseTurnAnalyzer):
          return state, result_data

      @abstractmethod
-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using ML model from audio data."""
          pass
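
Taken together, the base_smart_turn.py hunks above change the execution model: _process_speech_segment and _predict_endpoint become synchronous, and analyze_end_of_turn dispatches them to the analyzer's single-worker ThreadPoolExecutor, keeping blocking model inference off the event loop. A self-contained sketch of that pattern, with illustrative names only:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=1)

    def predict(audio: bytes) -> dict:
        # Stand-in for blocking ML inference.
        return {"prediction": 1, "probability": 0.9}

    async def analyze(audio: bytes) -> dict:
        loop = asyncio.get_running_loop()
        # The blocking call runs on the worker thread; the loop stays responsive.
        return await loop.run_in_executor(executor, predict, audio)

    print(asyncio.run(analyze(b"\x00\x00")))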
pipecat/audio/turn/smart_turn/http_smart_turn.py
@@ -104,11 +104,15 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
              logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
              raise Exception("Failed to send raw request to Daily Smart Turn.")

-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using remote HTTP ML service."""
          try:
              serialized_array = self._serialize_array(audio_array)
-             return await self._send_raw_request(serialized_array)
+             loop = asyncio.get_running_loop()
+             future = asyncio.run_coroutine_threadsafe(
+                 self._send_raw_request(serialized_array), loop
+             )
+             return future.result()
          except Exception as e:
              logger.error(f"Smart turn prediction failed: {str(e)}")
              # Return an incomplete prediction when a failure occurs
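
Because _predict_endpoint now executes on the worker thread, the HTTP analyzer cannot simply await its _send_raw_request coroutine; instead it schedules the coroutine back onto the event loop with asyncio.run_coroutine_threadsafe and blocks on future.result(). A self-contained sketch of that handoff (illustrative names; note that asyncio.get_running_loop() only succeeds on the loop's own thread, so this sketch captures the loop there before dispatching):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    async def fetch(data: bytes) -> dict:
        await asyncio.sleep(0.1)  # stands in for the HTTP round trip
        return {"prediction": 1, "probability": 0.9}

    def predict_sync(data: bytes, loop: asyncio.AbstractEventLoop) -> dict:
        # Runs on the worker thread: schedule the coroutine on the main loop
        # and block this thread until it finishes.
        future = asyncio.run_coroutine_threadsafe(fetch(data), loop)
        return future.result()

    async def main() -> dict:
        loop = asyncio.get_running_loop()  # captured on the loop's thread
        with ThreadPoolExecutor(max_workers=1) as executor:
            return await loop.run_in_executor(executor, predict_sync, b"\x00", loop)

    print(asyncio.run(main()))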
pipecat/audio/turn/smart_turn/local_smart_turn.py
@@ -64,7 +64,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
          self._turn_model.eval()
          logger.debug("Loaded Local Smart Turn")

-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using local PyTorch model."""
          inputs = self._turn_processor(
              audio_array,
pipecat/audio/turn/smart_turn/local_smart_turn_v2.py
@@ -73,7 +73,7 @@ class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
          self._turn_model.eval()
          logger.debug("Loaded Local Smart Turn v2")

-     async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
          """Predict end-of-turn using local PyTorch model."""
          inputs = self._turn_processor(
              audio_array,
pipecat/audio/turn/smart_turn/local_smart_turn_v3.py (new file)
@@ -0,0 +1,124 @@
+ #
+ # Copyright (c) 2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #
+
+ """Local turn analyzer for on-device ML inference using the smart-turn-v3 model.
+
+ This module provides a smart turn analyzer that uses an ONNX model for
+ local end-of-turn detection without requiring network connectivity.
+ """
+
+ from typing import Any, Dict, Optional
+
+ import numpy as np
+ from loguru import logger
+
+ from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
+
+ try:
+     import onnxruntime as ort
+     from transformers import WhisperFeatureExtractor
+ except ModuleNotFoundError as e:
+     logger.error(f"Exception: {e}")
+     logger.error(
+         "In order to use LocalSmartTurnAnalyzerV3, you need to `pip install pipecat-ai[local-smart-turn-v3]`."
+     )
+     raise Exception(f"Missing module: {e}")
+
+
+ class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
+     """Local turn analyzer using the smart-turn-v3 ONNX model.
+
+     Provides end-of-turn detection using locally-stored ONNX model,
+     enabling offline operation without network dependencies.
+     """
+
+     def __init__(self, *, smart_turn_model_path: Optional[str] = None, **kwargs):
+         """Initialize the local ONNX smart-turn-v3 analyzer.
+
+         Args:
+             smart_turn_model_path: Path to the ONNX model file. If this is not
+                 set, the bundled smart-turn-v3.0 model will be used.
+             **kwargs: Additional arguments passed to BaseSmartTurn.
+         """
+         super().__init__(**kwargs)
+
+         logger.debug("Loading Local Smart Turn v3 model...")
+
+         if not smart_turn_model_path:
+             # Load bundled model
+             model_name = "smart-turn-v3.0.onnx"
+             package_path = "pipecat.audio.turn.smart_turn.data"
+
+             try:
+                 import importlib_resources as impresources
+
+                 smart_turn_model_path = str(impresources.files(package_path).joinpath(model_name))
+             except BaseException:
+                 from importlib import resources as impresources
+
+                 try:
+                     with impresources.path(package_path, model_name) as f:
+                         smart_turn_model_path = f
+                 except BaseException:
+                     smart_turn_model_path = str(
+                         impresources.files(package_path).joinpath(model_name)
+                     )
+
+         so = ort.SessionOptions()
+         so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+         so.inter_op_num_threads = 1
+         so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+         self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)
+         self._session = ort.InferenceSession(smart_turn_model_path, sess_options=so)
+
+         logger.debug("Loaded Local Smart Turn v3")
+
+     def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+         """Predict end-of-turn using local ONNX model."""
+
+         def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
+             """Truncate audio to last n seconds or pad with zeros to meet n seconds."""
+             max_samples = n_seconds * sample_rate
+             if len(audio_array) > max_samples:
+                 return audio_array[-max_samples:]
+             elif len(audio_array) < max_samples:
+                 # Pad with zeros at the beginning
+                 padding = max_samples - len(audio_array)
+                 return np.pad(audio_array, (padding, 0), mode="constant", constant_values=0)
+             return audio_array
+
+         # Truncate to 8 seconds (keeping the end) or pad to 8 seconds
+         audio_array = truncate_audio_to_last_n_seconds(audio_array, n_seconds=8)
+
+         # Process audio using Whisper's feature extractor
+         inputs = self._feature_extractor(
+             audio_array,
+             sampling_rate=16000,
+             return_tensors="np",
+             padding="max_length",
+             max_length=8 * 16000,
+             truncation=True,
+             do_normalize=True,
+         )
+
+         # Extract features and ensure correct shape for ONNX
+         input_features = inputs.input_features.squeeze(0).astype(np.float32)
+         input_features = np.expand_dims(input_features, axis=0)  # Add batch dimension
+
+         # Run ONNX inference
+         outputs = self._session.run(None, {"input_features": input_features})
+
+         # Extract probability (ONNX model returns sigmoid probabilities)
+         probability = outputs[0][0].item()
+
+         # Make prediction (1 for Complete, 0 for Incomplete)
+         prediction = 1 if probability > 0.5 else 0
+
+         return {
+             "prediction": prediction,
+             "probability": probability,
+         }
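
The new analyzer mirrors the construction pattern of the earlier local analyzers. A minimal usage sketch based on the constructor shown above; the surrounding pipeline wiring is assumed rather than taken from this release:

    from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3

    # Uses the bundled smart-turn-v3.0.onnx model when no path is given.
    analyzer = LocalSmartTurnAnalyzerV3()

    # Or point at an external copy of the ONNX model:
    analyzer = LocalSmartTurnAnalyzerV3(smart_turn_model_path="/path/to/smart-turn-v3.0.onnx")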