dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
@@ -24,6 +24,7 @@ from pipecat.processors.aggregators.llm_context import (
     LLMContext,
     LLMContextMessage,
     LLMContextToolChoice,
+    LLMSpecificMessage,
     NotGiven,
 )
 
@@ -47,6 +48,11 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
     - Extracting and sanitizing messages from the LLM context for logging about OpenAI.
     """
 
+    @property
+    def id_for_llm_specific_messages(self) -> str:
+        """Get the identifier used in LLMSpecificMessage instances for OpenAI."""
+        return "openai"
+
     def get_llm_invocation_params(self, context: LLMContext) -> OpenAILLMInvocationParams:
         """Get OpenAI-specific LLM invocation parameters from a universal LLM context.
 
@@ -57,7 +63,7 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
             Dictionary of parameters for OpenAI's ChatCompletion API.
         """
         return {
-            "messages": self._from_universal_context_messages(self._get_messages(context)),
+            "messages": self._from_universal_context_messages(self.get_messages(context)),
             # NOTE; LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
             "tools": self.from_standard_tools(context.tools),
             "tool_choice": context.tool_choice,
@@ -91,7 +97,7 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
             List of messages in a format ready for logging about OpenAI.
         """
         msgs = []
-        for message in self._get_messages(context):
+        for message in self.get_messages(context):
             msg = copy.deepcopy(message)
             if "content" in msg:
                 if isinstance(msg["content"], list):
@@ -99,19 +105,25 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
                         if item["type"] == "image_url":
                             if item["image_url"]["url"].startswith("data:image/"):
                                 item["image_url"]["url"] = "data:image/..."
+                        if item["type"] == "input_audio":
+                            item["input_audio"]["data"] = "..."
             if "mime_type" in msg and msg["mime_type"].startswith("image/"):
                 msg["data"] = "..."
             msgs.append(msg)
         return msgs
 
-    def _get_messages(self, context: LLMContext) -> List[LLMContextMessage]:
-        return context.get_messages("openai")
-
     def _from_universal_context_messages(
         self, messages: List[LLMContextMessage]
     ) -> List[ChatCompletionMessageParam]:
-        # Just a pass-through: messages are already the right type
-        return messages
+        result = []
+        for message in messages:
+            if isinstance(message, LLMSpecificMessage):
+                # Extract the actual message content from LLMSpecificMessage
+                result.append(message.message)
+            else:
+                # Standard message, pass through unchanged
+                result.append(message)
+        return result
 
     def _from_standard_tool_choice(
         self, tool_choice: LLMContextToolChoice | NotGiven
  self, tool_choice: LLMContextToolChoice | NotGiven
@@ -30,6 +30,11 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
30
30
  OpenAI's Realtime API for function calling capabilities.
31
31
  """
32
32
 
33
+ @property
34
+ def id_for_llm_specific_messages(self) -> str:
35
+ """Get the identifier used in LLMSpecificMessage instances for OpenAI Realtime."""
36
+ raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
37
+
33
38
  def get_llm_invocation_params(self, context: LLMContext) -> OpenAIRealtimeLLMInvocationParams:
34
39
  """Get OpenAI Realtime-specific LLM invocation parameters from a universal LLM context.
35
40
 
Binary files (no textual diff): the 12 added DTMF tones, pipecat/audio/dtmf/dtmf-{0-9,pound,star}.wav.
@@ -0,0 +1,193 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Krisp noise reduction audio filter for Pipecat.
+
+This module provides an audio filter implementation using Krisp VIVA SDK.
+"""
+
+import os
+
+import numpy as np
+from loguru import logger
+
+from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
+from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
+
+try:
+    import krisp_audio
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use the Krisp filter, you need to install krisp_audio.")
+    raise Exception(f"Missing module: {e}")
+
+
+def _log_callback(log_message, log_level):
+    logger.info(f"[{log_level}] {log_message}")
+
+
+class KrispVivaFilter(BaseAudioFilter):
+    """Audio filter using the Krisp VIVA SDK.
+
+    Provides real-time noise reduction for audio streams using Krisp's
+    proprietary noise suppression algorithms. This filter requires a
+    valid Krisp model file to operate.
+
+    Supported sample rates:
+    - 8000 Hz
+    - 16000 Hz
+    - 24000 Hz
+    - 32000 Hz
+    - 44100 Hz
+    - 48000 Hz
+    """
+
+    # Initialize Krisp Audio SDK globally
+    krisp_audio.globalInit("", _log_callback, krisp_audio.LogLevel.Off)
+    SDK_VERSION = krisp_audio.getVersion()
+    logger.debug(
+        f"Krisp Audio Python SDK Version: {SDK_VERSION.major}."
+        f"{SDK_VERSION.minor}.{SDK_VERSION.patch}"
+    )
+
+    SAMPLE_RATES = {
+        8000: krisp_audio.SamplingRate.Sr8000Hz,
+        16000: krisp_audio.SamplingRate.Sr16000Hz,
+        24000: krisp_audio.SamplingRate.Sr24000Hz,
+        32000: krisp_audio.SamplingRate.Sr32000Hz,
+        44100: krisp_audio.SamplingRate.Sr44100Hz,
+        48000: krisp_audio.SamplingRate.Sr48000Hz,
+    }
+
+    FRAME_SIZE_MS = 10  # Krisp requires audio frames of 10ms duration for processing.
+
+    def __init__(self, model_path: str = None, noise_suppression_level: int = 100) -> None:
+        """Initialize the Krisp noise reduction filter.
+
+        Args:
+            model_path: Path to the Krisp model file (.kef extension).
+                If None, uses KRISP_VIVA_MODEL_PATH environment variable.
+            noise_suppression_level: Noise suppression level.
+
+        Raises:
+            ValueError: If model_path is not provided and KRISP_VIVA_MODEL_PATH is not set.
+            Exception: If model file doesn't have .kef extension.
+            FileNotFoundError: If model file doesn't exist.
+        """
+        super().__init__()
+
+        # Set model path, checking environment if not specified
+        self._model_path = model_path or os.getenv("KRISP_VIVA_MODEL_PATH")
+        if not self._model_path:
+            logger.error("Model path is not provided and KRISP_VIVA_MODEL_PATH is not set.")
+            raise ValueError("Model path for KrispAudioProcessor must be provided.")
+
+        if not self._model_path.endswith(".kef"):
+            raise Exception("Model is expected with .kef extension")
+
+        if not os.path.isfile(self._model_path):
+            raise FileNotFoundError(f"Model file not found: {self._model_path}")
+
+        self._filtering = True
+        self._session = None
+        self._samples_per_frame = None
+        self._noise_suppression_level = noise_suppression_level
+
+        # Audio buffer to accumulate samples for complete frames
+        self._audio_buffer = bytearray()
+
+    def _int_to_sample_rate(self, sample_rate):
+        """Convert integer sample rate to krisp_audio SamplingRate enum.
+
+        Args:
+            sample_rate: Sample rate as integer
+
+        Returns:
+            krisp_audio.SamplingRate enum value
+
+        Raises:
+            ValueError: If sample rate is not supported
+        """
+        if sample_rate not in self.SAMPLE_RATES:
+            raise ValueError("Unsupported sample rate")
+        return self.SAMPLE_RATES[sample_rate]
+
+    async def start(self, sample_rate: int):
+        """Initialize the Krisp processor with the transport's sample rate.
+
+        Args:
+            sample_rate: The sample rate of the input transport in Hz.
+        """
+        model_info = krisp_audio.ModelInfo()
+        model_info.path = self._model_path
+
+        nc_cfg = krisp_audio.NcSessionConfig()
+        nc_cfg.inputSampleRate = self._int_to_sample_rate(sample_rate)
+        nc_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
+        nc_cfg.outputSampleRate = nc_cfg.inputSampleRate
+        nc_cfg.modelInfo = model_info
+
+        self._samples_per_frame = int((sample_rate * self.FRAME_SIZE_MS) / 1000)
+        self._session = krisp_audio.NcInt16.create(nc_cfg)
+
+    async def stop(self):
+        """Clean up the Krisp processor when stopping."""
+        self._session = None
+
+    async def process_frame(self, frame: FilterControlFrame):
+        """Process control frames to enable/disable filtering.
+
+        Args:
+            frame: The control frame containing filter commands.
+        """
+        if isinstance(frame, FilterEnableFrame):
+            self._filtering = frame.enable
+
+    async def filter(self, audio: bytes) -> bytes:
+        """Apply Krisp noise reduction to audio data.
+
+        Args:
+            audio: Raw audio data as bytes to be filtered.
+
+        Returns:
+            Noise-reduced audio data as bytes.
+        """
+        if not self._filtering:
+            return audio
+
+        # Add incoming audio to our buffer
+        self._audio_buffer.extend(audio)
+
+        # Calculate how many complete frames we can process
+        total_samples = len(self._audio_buffer) // 2  # 2 bytes per int16 sample
+        num_complete_frames = total_samples // self._samples_per_frame
+
+        if num_complete_frames == 0:
+            # Not enough samples for a complete frame yet, return empty
+            return b""
+
+        # Calculate how many bytes we need for complete frames
+        complete_samples_count = num_complete_frames * self._samples_per_frame
+        bytes_to_process = complete_samples_count * 2  # 2 bytes per sample
+
+        # Extract the bytes we can process
+        audio_to_process = bytes(self._audio_buffer[:bytes_to_process])
+
+        # Remove processed bytes from buffer, keep the remainder
+        self._audio_buffer = self._audio_buffer[bytes_to_process:]
+
+        # Process the complete frames
+        samples = np.frombuffer(audio_to_process, dtype=np.int16)
+        frames = samples.reshape(-1, self._samples_per_frame)
+        processed_samples = np.empty_like(samples)
+
+        for i, frame in enumerate(frames):
+            cleaned_frame = self._session.process(frame, self._noise_suppression_level)
+            processed_samples[i * self._samples_per_frame : (i + 1) * self._samples_per_frame] = (
+                cleaned_frame
+            )
+
+        return processed_samples.tobytes()
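
A minimal usage sketch for the new filter. Only the constructor arguments come from the file above; the model path and the transport wiring in the comments are assumptions:

    # Hypothetical wiring -- assumes a valid Krisp .kef model on disk and a
    # transport whose params accept an input audio filter.
    from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter

    krisp = KrispVivaFilter(
        model_path="/models/noise-cancel.kef",  # or set KRISP_VIVA_MODEL_PATH
        noise_suppression_level=90,
    )
    # e.g. DailyParams(audio_in_enabled=True, audio_in_filter=krisp)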
@@ -33,6 +33,10 @@ class NoisereduceFilter(BaseAudioFilter):
     Applies spectral gating noise reduction algorithms to suppress background
     noise in audio streams. Uses the noisereduce library's default noise
     reduction parameters.
+
+    .. deprecated:: 0.0.85
+        `NoisereduceFilter` is deprecated and will be removed in a future version.
+        We recommend using other real-time audio filters like `KrispFilter` or `AICFilter`.
     """
 
     def __init__(self) -> None:
@@ -40,6 +44,17 @@ class NoisereduceFilter(BaseAudioFilter):
         self._filtering = True
         self._sample_rate = 0
 
+        import warnings
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("always")
+            warnings.warn(
+                "`NoisereduceFilter` is deprecated. "
+                "Use other real-time audio filters like `KrispFilter` or `AICFilter`.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
     async def start(self, sample_rate: int):
         """Initialize the filter with the transport's sample rate.
 
@@ -14,6 +14,8 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Optional, Tuple
 
+from pydantic import BaseModel
+
 from pipecat.metrics.metrics import MetricsData
 
 
@@ -29,6 +31,12 @@ class EndOfTurnState(Enum):
     INCOMPLETE = 2
 
 
+class BaseTurnParams(BaseModel):
+    """Base class for turn analyzer parameters."""
+
+    pass
+
+
 class BaseTurnAnalyzer(ABC):
     """Abstract base class for analyzing user end of turn.
 
@@ -78,7 +86,7 @@ class BaseTurnAnalyzer(ABC):
 
     @property
     @abstractmethod
-    def params(self):
+    def params(self) -> BaseTurnParams:
        """Get the current turn analyzer parameters.
 
        Returns:
@@ -11,15 +11,17 @@ machine learning models to determine when a user has finished speaking, going
 beyond simple silence-based detection.
 """
 
+import asyncio
 import time
 from abc import abstractmethod
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
 from loguru import logger
 from pydantic import BaseModel
 
-from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
+from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
 from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
 
 # Default timing parameters
@@ -29,7 +31,7 @@ MAX_DURATION_SECONDS = 8  # Max allowed segment duration
 USE_ONLY_LAST_VAD_SEGMENT = True
 
 
-class SmartTurnParams(BaseModel):
+class SmartTurnParams(BaseTurnParams):
    """Configuration parameters for smart turn analysis.
 
    Parameters:
@@ -77,6 +79,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
         self._speech_triggered = False
         self._silence_ms = 0
         self._speech_start_time = 0
+        # Thread executor that will run the model. We only need one thread per
+        # analyzer because one analyzer just handles one audio stream.
+        self._executor = ThreadPoolExecutor(max_workers=1)
 
     @property
     def speech_triggered(self) -> bool:
@@ -151,7 +156,10 @@ class BaseSmartTurn(BaseTurnAnalyzer):
            Tuple containing the end-of-turn state and optional metrics data
            from the ML model analysis.
        """
-        state, result = await self._process_speech_segment(self._audio_buffer)
+        loop = asyncio.get_running_loop()
+        state, result = await loop.run_in_executor(
+            self._executor, self._process_speech_segment, self._audio_buffer
+        )
        if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
            self._clear(state)
        logger.debug(f"End of Turn result: {state}")
@@ -169,9 +177,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
         self._speech_start_time = 0
         self._silence_ms = 0
 
-    async def _process_speech_segment(
-        self, audio_buffer
-    ) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
+    def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
        """Process accumulated audio segment using ML model."""
        state = EndOfTurnState.INCOMPLETE
 
@@ -203,7 +209,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
        if len(segment_audio) > 0:
            start_time = time.perf_counter()
            try:
-                result = await self._predict_endpoint(segment_audio)
+                result = self._predict_endpoint(segment_audio)
                state = (
                    EndOfTurnState.COMPLETE
                    if result["prediction"] == 1
@@ -249,6 +255,6 @@ class BaseSmartTurn(BaseTurnAnalyzer):
        return state, result_data
 
    @abstractmethod
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
        """Predict end-of-turn using ML model from audio data."""
        pass
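
The change above moves blocking ML inference off the event loop: `_predict_endpoint` and `_process_speech_segment` become synchronous and run on a single-thread executor. A self-contained sketch of the pattern, using only the standard library (names are illustrative, not Pipecat API):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    executor = ThreadPoolExecutor(max_workers=1)  # one analyzer handles one audio stream

    def predict(audio: bytes) -> dict:
        # Stand-in for the blocking ONNX/PyTorch model call.
        return {"prediction": 1, "probability": 0.93}

    async def analyze_end_of_turn(audio: bytes) -> dict:
        loop = asyncio.get_running_loop()
        # The event loop keeps processing frames while the model runs on the worker thread.
        return await loop.run_in_executor(executor, predict, audio)

    print(asyncio.run(analyze_end_of_turn(b"\x00\x00")))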
File without changes
@@ -104,11 +104,15 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
            logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
            raise Exception("Failed to send raw request to Daily Smart Turn.")
 
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
        """Predict end-of-turn using remote HTTP ML service."""
        try:
            serialized_array = self._serialize_array(audio_array)
-            return await self._send_raw_request(serialized_array)
+            loop = asyncio.get_running_loop()
+            future = asyncio.run_coroutine_threadsafe(
+                self._send_raw_request(serialized_array), loop
+            )
+            return future.result()
        except Exception as e:
            logger.error(f"Smart turn prediction failed: {str(e)}")
            # Return an incomplete prediction when a failure occurs
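
Since `_predict_endpoint` now runs on the executor thread, the async HTTP request is bridged back to the event loop with `asyncio.run_coroutine_threadsafe`. That call needs a reference to a running loop, and `asyncio.get_running_loop()` only succeeds on a thread that is actually running one, so in a generic version of this pattern the loop is captured on the loop thread and handed to the worker. A standard-library sketch with illustrative names:

    import asyncio
    from concurrent.futures import ThreadPoolExecutor

    async def fetch(data: bytes) -> dict:
        await asyncio.sleep(0)  # stand-in for the aiohttp POST to the smart-turn service
        return {"prediction": 1}

    def blocking_predict(data: bytes, loop: asyncio.AbstractEventLoop) -> dict:
        # Runs on the worker thread: schedule the coroutine on the loop thread
        # and block this thread until its result is available.
        future = asyncio.run_coroutine_threadsafe(fetch(data), loop)
        return future.result()

    async def main() -> None:
        loop = asyncio.get_running_loop()  # captured on the loop thread
        with ThreadPoolExecutor(max_workers=1) as executor:
            print(await loop.run_in_executor(executor, blocking_predict, b"\x00", loop))

    asyncio.run(main())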
@@ -64,7 +64,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
         self._turn_model.eval()
         logger.debug("Loaded Local Smart Turn")
 
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
        """Predict end-of-turn using local PyTorch model."""
        inputs = self._turn_processor(
            audio_array,
@@ -73,7 +73,7 @@ class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
         self._turn_model.eval()
         logger.debug("Loaded Local Smart Turn v2")
 
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
        """Predict end-of-turn using local PyTorch model."""
        inputs = self._turn_processor(
            audio_array,
@@ -0,0 +1,124 @@
+#
+# Copyright (c) 2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Local turn analyzer for on-device ML inference using the smart-turn-v3 model.
+
+This module provides a smart turn analyzer that uses an ONNX model for
+local end-of-turn detection without requiring network connectivity.
+"""
+
+from typing import Any, Dict, Optional
+
+import numpy as np
+from loguru import logger
+
+from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
+
+try:
+    import onnxruntime as ort
+    from transformers import WhisperFeatureExtractor
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use LocalSmartTurnAnalyzerV3, you need to `pip install pipecat-ai[local-smart-turn-v3]`."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
+    """Local turn analyzer using the smart-turn-v3 ONNX model.
+
+    Provides end-of-turn detection using locally-stored ONNX model,
+    enabling offline operation without network dependencies.
+    """
+
+    def __init__(self, *, smart_turn_model_path: Optional[str] = None, **kwargs):
+        """Initialize the local ONNX smart-turn-v3 analyzer.
+
+        Args:
+            smart_turn_model_path: Path to the ONNX model file. If this is not
+                set, the bundled smart-turn-v3.0 model will be used.
+            **kwargs: Additional arguments passed to BaseSmartTurn.
+        """
+        super().__init__(**kwargs)
+
+        logger.debug("Loading Local Smart Turn v3 model...")
+
+        if not smart_turn_model_path:
+            # Load bundled model
+            model_name = "smart-turn-v3.0.onnx"
+            package_path = "pipecat.audio.turn.smart_turn.data"
+
+            try:
+                import importlib_resources as impresources
+
+                smart_turn_model_path = str(impresources.files(package_path).joinpath(model_name))
+            except BaseException:
+                from importlib import resources as impresources
+
+                try:
+                    with impresources.path(package_path, model_name) as f:
+                        smart_turn_model_path = f
+                except BaseException:
+                    smart_turn_model_path = str(
+                        impresources.files(package_path).joinpath(model_name)
+                    )
+
+        so = ort.SessionOptions()
+        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+        so.inter_op_num_threads = 1
+        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)
+        self._session = ort.InferenceSession(smart_turn_model_path, sess_options=so)
+
+        logger.debug("Loaded Local Smart Turn v3")
+
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+        """Predict end-of-turn using local ONNX model."""
+
+        def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
+            """Truncate audio to last n seconds or pad with zeros to meet n seconds."""
+            max_samples = n_seconds * sample_rate
+            if len(audio_array) > max_samples:
+                return audio_array[-max_samples:]
+            elif len(audio_array) < max_samples:
+                # Pad with zeros at the beginning
+                padding = max_samples - len(audio_array)
+                return np.pad(audio_array, (padding, 0), mode="constant", constant_values=0)
+            return audio_array
+
+        # Truncate to 8 seconds (keeping the end) or pad to 8 seconds
+        audio_array = truncate_audio_to_last_n_seconds(audio_array, n_seconds=8)
+
+        # Process audio using Whisper's feature extractor
+        inputs = self._feature_extractor(
+            audio_array,
+            sampling_rate=16000,
+            return_tensors="np",
+            padding="max_length",
+            max_length=8 * 16000,
+            truncation=True,
+            do_normalize=True,
+        )
+
+        # Extract features and ensure correct shape for ONNX
+        input_features = inputs.input_features.squeeze(0).astype(np.float32)
+        input_features = np.expand_dims(input_features, axis=0)  # Add batch dimension
+
+        # Run ONNX inference
+        outputs = self._session.run(None, {"input_features": input_features})
+
+        # Extract probability (ONNX model returns sigmoid probabilities)
+        probability = outputs[0][0].item()
+
+        # Make prediction (1 for Complete, 0 for Incomplete)
+        prediction = 1 if probability > 0.5 else 0
+
+        return {
+            "prediction": prediction,
+            "probability": probability,
+        }
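
A minimal construction sketch; the constructor arguments come from the file above, while the transport wiring in the comments is an assumption:

    # Hypothetical wiring -- the bundled smart-turn-v3.0.onnx model is used by default.
    from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3

    turn_analyzer = LocalSmartTurnAnalyzerV3()
    # Or point at an explicit model file:
    # turn_analyzer = LocalSmartTurnAnalyzerV3(smart_turn_model_path="/models/smart-turn-v3.0.onnx")
    # e.g. DailyParams(audio_in_enabled=True, turn_analyzer=turn_analyzer)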
@@ -0,0 +1,10 @@
+This directory contains packaged VAD model files used by Pipecat.
+
+- `silero_vad.onnx`: Default Silero VAD model shipped with the package.
+- `silero_vad_v2.onnx`: Alternate model used when Arabic (codes starting with `ar`) is present
+  in the call configuration (primary `language` or any `add_langs`). This file is optional.
+
+If `silero_vad_v2.onnx` is not present or fails to load, Pipecat will automatically fall back
+to `silero_vad.onnx` and log a warning. To enable the Arabic-optimized model, place a valid
+ONNX file at this path with the exact filename.
+
@@ -135,7 +135,13 @@ class SileroVADAnalyzer(VADAnalyzer):
     with automatic model state management and periodic resets.
     """
 
-    def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
+    def __init__(
+        self,
+        *,
+        sample_rate: Optional[int] = None,
+        params: Optional[VADParams] = None,
+        model_name: Optional[str] = None,
+    ):
        """Initialize the Silero VAD analyzer.
 
        Args:
@@ -146,7 +152,7 @@ class SileroVADAnalyzer(VADAnalyzer):
 
        logger.debug("Loading Silero VAD model...")
 
-        model_name = "silero_vad.onnx"
+        model_name = model_name or "silero_vad.onnx"
        package_path = "pipecat.audio.vad.data"
 
        try:
@@ -166,7 +172,7 @@ class SileroVADAnalyzer(VADAnalyzer):
 
        self._last_reset_time = 0
 
-        logger.debug("Loaded Silero VAD")
+        logger.debug(f"Loaded Silero VAD {model_file_path}")
 
    #
    # VADAnalyzer
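
With the new `model_name` parameter, selecting the alternate packaged model described in the README above is a one-line change (a sketch; `silero_vad_v2.onnx` must actually be present in the package data, and the `VADParams` import path is assumed from the rest of the codebase):

    from pipecat.audio.vad.silero import SileroVADAnalyzer
    from pipecat.audio.vad.vad_analyzer import VADParams

    vad = SileroVADAnalyzer(params=VADParams())  # default: silero_vad.onnx
    vad_ar = SileroVADAnalyzer(model_name="silero_vad_v2.onnx")  # Arabic-optimized, if bundled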