dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0

pipecat/adapters/services/open_ai_adapter.py
CHANGED
@@ -24,6 +24,7 @@ from pipecat.processors.aggregators.llm_context import (
     LLMContext,
     LLMContextMessage,
     LLMContextToolChoice,
+    LLMSpecificMessage,
     NotGiven,
 )
 
@@ -47,6 +48,11 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
     - Extracting and sanitizing messages from the LLM context for logging about OpenAI.
     """
 
+    @property
+    def id_for_llm_specific_messages(self) -> str:
+        """Get the identifier used in LLMSpecificMessage instances for OpenAI."""
+        return "openai"
+
     def get_llm_invocation_params(self, context: LLMContext) -> OpenAILLMInvocationParams:
         """Get OpenAI-specific LLM invocation parameters from a universal LLM context.
 
@@ -57,7 +63,7 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
             Dictionary of parameters for OpenAI's ChatCompletion API.
         """
         return {
-            "messages": self._from_universal_context_messages(self._get_messages(context)),
+            "messages": self._from_universal_context_messages(self.get_messages(context)),
             # NOTE; LLMContext's tools are guaranteed to be a ToolsSchema (or NOT_GIVEN)
             "tools": self.from_standard_tools(context.tools),
             "tool_choice": context.tool_choice,
@@ -91,7 +97,7 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
             List of messages in a format ready for logging about OpenAI.
         """
         msgs = []
-        for message in self._get_messages(context):
+        for message in self.get_messages(context):
             msg = copy.deepcopy(message)
             if "content" in msg:
                 if isinstance(msg["content"], list):
@@ -99,19 +105,25 @@ class OpenAILLMAdapter(BaseLLMAdapter[OpenAILLMInvocationParams]):
                         if item["type"] == "image_url":
                             if item["image_url"]["url"].startswith("data:image/"):
                                 item["image_url"]["url"] = "data:image/..."
+                        if item["type"] == "input_audio":
+                            item["input_audio"]["data"] = "..."
             if "mime_type" in msg and msg["mime_type"].startswith("image/"):
                 msg["data"] = "..."
             msgs.append(msg)
         return msgs
 
-    def _get_messages(self, context: LLMContext) -> List[LLMContextMessage]:
-        return context.get_messages("openai")
-
     def _from_universal_context_messages(
         self, messages: List[LLMContextMessage]
     ) -> List[ChatCompletionMessageParam]:
-
-
+        result = []
+        for message in messages:
+            if isinstance(message, LLMSpecificMessage):
+                # Extract the actual message content from LLMSpecificMessage
+                result.append(message.message)
+            else:
+                # Standard message, pass through unchanged
+                result.append(message)
+        return result
 
     def _from_standard_tool_choice(
         self, tool_choice: LLMContextToolChoice | NotGiven
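Note: the `LLMSpecificMessage` support added above gives each adapter an identifier so one universal `LLMContext` can carry provider-tagged messages alongside standard ones. As a hedged illustration of the pattern (not pipecat code; "acme" and `AcmeLLMAdapter` are made-up names), a custom adapter would expose the same property and lean on the base class's `get_messages()`:

    from pipecat.adapters.base_llm_adapter import BaseLLMAdapter
    from pipecat.processors.aggregators.llm_context import LLMContext


    class AcmeLLMAdapter(BaseLLMAdapter):
        @property
        def id_for_llm_specific_messages(self) -> str:
            # LLMSpecificMessage entries tagged with this identifier are kept by
            # get_messages(); entries tagged for other providers are filtered out.
            return "acme"

        def get_llm_invocation_params(self, context: LLMContext) -> dict:
            # get_messages() resolves the universal context into plain provider
            # messages, unwrapping any LLMSpecificMessage meant for this adapter.
            return {
                "messages": self.get_messages(context),
                "tools": self.from_standard_tools(context.tools),
            }

        # Remaining BaseLLMAdapter hooks (e.g. logging helpers) omitted for brevity.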

pipecat/adapters/services/open_ai_realtime_adapter.py
CHANGED
@@ -30,6 +30,11 @@ class OpenAIRealtimeLLMAdapter(BaseLLMAdapter):
     OpenAI's Realtime API for function calling capabilities.
     """
 
+    @property
+    def id_for_llm_specific_messages(self) -> str:
+        """Get the identifier used in LLMSpecificMessage instances for OpenAI Realtime."""
+        raise NotImplementedError("Universal LLMContext is not yet supported for OpenAI Realtime.")
+
     def get_llm_invocation_params(self, context: LLMContext) -> OpenAIRealtimeLLMInvocationParams:
         """Get OpenAI Realtime-specific LLM invocation parameters from a universal LLM context.
 
Binary files (no text diff): pipecat/audio/dtmf/dtmf-0.wav through dtmf-9.wav, dtmf-pound.wav, and dtmf-star.wav.

pipecat/audio/filters/krisp_viva_filter.py
ADDED
@@ -0,0 +1,193 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Krisp noise reduction audio filter for Pipecat.
+
+This module provides an audio filter implementation using Krisp VIVA SDK.
+"""
+
+import os
+
+import numpy as np
+from loguru import logger
+
+from pipecat.audio.filters.base_audio_filter import BaseAudioFilter
+from pipecat.frames.frames import FilterControlFrame, FilterEnableFrame
+
+try:
+    import krisp_audio
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use the Krisp filter, you need to install krisp_audio.")
+    raise Exception(f"Missing module: {e}")
+
+
+def _log_callback(log_message, log_level):
+    logger.info(f"[{log_level}] {log_message}")
+
+
+class KrispVivaFilter(BaseAudioFilter):
+    """Audio filter using the Krisp VIVA SDK.
+
+    Provides real-time noise reduction for audio streams using Krisp's
+    proprietary noise suppression algorithms. This filter requires a
+    valid Krisp model file to operate.
+
+    Supported sample rates:
+    - 8000 Hz
+    - 16000 Hz
+    - 24000 Hz
+    - 32000 Hz
+    - 44100 Hz
+    - 48000 Hz
+    """
+
+    # Initialize Krisp Audio SDK globally
+    krisp_audio.globalInit("", _log_callback, krisp_audio.LogLevel.Off)
+    SDK_VERSION = krisp_audio.getVersion()
+    logger.debug(
+        f"Krisp Audio Python SDK Version: {SDK_VERSION.major}."
+        f"{SDK_VERSION.minor}.{SDK_VERSION.patch}"
+    )
+
+    SAMPLE_RATES = {
+        8000: krisp_audio.SamplingRate.Sr8000Hz,
+        16000: krisp_audio.SamplingRate.Sr16000Hz,
+        24000: krisp_audio.SamplingRate.Sr24000Hz,
+        32000: krisp_audio.SamplingRate.Sr32000Hz,
+        44100: krisp_audio.SamplingRate.Sr44100Hz,
+        48000: krisp_audio.SamplingRate.Sr48000Hz,
+    }
+
+    FRAME_SIZE_MS = 10  # Krisp requires audio frames of 10ms duration for processing.
+
+    def __init__(self, model_path: str = None, noise_suppression_level: int = 100) -> None:
+        """Initialize the Krisp noise reduction filter.
+
+        Args:
+            model_path: Path to the Krisp model file (.kef extension).
+                If None, uses KRISP_VIVA_MODEL_PATH environment variable.
+            noise_suppression_level: Noise suppression level.
+
+        Raises:
+            ValueError: If model_path is not provided and KRISP_VIVA_MODEL_PATH is not set.
+            Exception: If model file doesn't have .kef extension.
+            FileNotFoundError: If model file doesn't exist.
+        """
+        super().__init__()
+
+        # Set model path, checking environment if not specified
+        self._model_path = model_path or os.getenv("KRISP_VIVA_MODEL_PATH")
+        if not self._model_path:
+            logger.error("Model path is not provided and KRISP_VIVA_MODEL_PATH is not set.")
+            raise ValueError("Model path for KrispAudioProcessor must be provided.")
+
+        if not self._model_path.endswith(".kef"):
+            raise Exception("Model is expected with .kef extension")
+
+        if not os.path.isfile(self._model_path):
+            raise FileNotFoundError(f"Model file not found: {self._model_path}")
+
+        self._filtering = True
+        self._session = None
+        self._samples_per_frame = None
+        self._noise_suppression_level = noise_suppression_level
+
+        # Audio buffer to accumulate samples for complete frames
+        self._audio_buffer = bytearray()
+
+    def _int_to_sample_rate(self, sample_rate):
+        """Convert integer sample rate to krisp_audio SamplingRate enum.
+
+        Args:
+            sample_rate: Sample rate as integer
+
+        Returns:
+            krisp_audio.SamplingRate enum value
+
+        Raises:
+            ValueError: If sample rate is not supported
+        """
+        if sample_rate not in self.SAMPLE_RATES:
+            raise ValueError("Unsupported sample rate")
+        return self.SAMPLE_RATES[sample_rate]
+
+    async def start(self, sample_rate: int):
+        """Initialize the Krisp processor with the transport's sample rate.
+
+        Args:
+            sample_rate: The sample rate of the input transport in Hz.
+        """
+        model_info = krisp_audio.ModelInfo()
+        model_info.path = self._model_path
+
+        nc_cfg = krisp_audio.NcSessionConfig()
+        nc_cfg.inputSampleRate = self._int_to_sample_rate(sample_rate)
+        nc_cfg.inputFrameDuration = krisp_audio.FrameDuration.Fd10ms
+        nc_cfg.outputSampleRate = nc_cfg.inputSampleRate
+        nc_cfg.modelInfo = model_info
+
+        self._samples_per_frame = int((sample_rate * self.FRAME_SIZE_MS) / 1000)
+        self._session = krisp_audio.NcInt16.create(nc_cfg)
+
+    async def stop(self):
+        """Clean up the Krisp processor when stopping."""
+        self._session = None
+
+    async def process_frame(self, frame: FilterControlFrame):
+        """Process control frames to enable/disable filtering.
+
+        Args:
+            frame: The control frame containing filter commands.
+        """
+        if isinstance(frame, FilterEnableFrame):
+            self._filtering = frame.enable
+
+    async def filter(self, audio: bytes) -> bytes:
+        """Apply Krisp noise reduction to audio data.
+
+        Args:
+            audio: Raw audio data as bytes to be filtered.
+
+        Returns:
+            Noise-reduced audio data as bytes.
+        """
+        if not self._filtering:
+            return audio
+
+        # Add incoming audio to our buffer
+        self._audio_buffer.extend(audio)
+
+        # Calculate how many complete frames we can process
+        total_samples = len(self._audio_buffer) // 2  # 2 bytes per int16 sample
+        num_complete_frames = total_samples // self._samples_per_frame
+
+        if num_complete_frames == 0:
+            # Not enough samples for a complete frame yet, return empty
+            return b""
+
+        # Calculate how many bytes we need for complete frames
+        complete_samples_count = num_complete_frames * self._samples_per_frame
+        bytes_to_process = complete_samples_count * 2  # 2 bytes per sample
+
+        # Extract the bytes we can process
+        audio_to_process = bytes(self._audio_buffer[:bytes_to_process])
+
+        # Remove processed bytes from buffer, keep the remainder
+        self._audio_buffer = self._audio_buffer[bytes_to_process:]
+
+        # Process the complete frames
+        samples = np.frombuffer(audio_to_process, dtype=np.int16)
+        frames = samples.reshape(-1, self._samples_per_frame)
+        processed_samples = np.empty_like(samples)
+
+        for i, frame in enumerate(frames):
+            cleaned_frame = self._session.process(frame, self._noise_suppression_level)
+            processed_samples[i * self._samples_per_frame : (i + 1) * self._samples_per_frame] = (
+                cleaned_frame
+            )
+
+        return processed_samples.tobytes()
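For orientation, a minimal usage sketch of the new filter, assuming the optional `krisp_audio` SDK is installed and a `.kef` model is available locally (the transport parameter named in the comment follows Pipecat's usual `audio_in_filter` wiring; the model path shown is illustrative):

    import os

    from pipecat.audio.filters.krisp_viva_filter import KrispVivaFilter

    # Falls back to the KRISP_VIVA_MODEL_PATH environment variable when
    # model_path is omitted.
    krisp_filter = KrispVivaFilter(
        model_path=os.getenv("KRISP_VIVA_MODEL_PATH", "/models/krisp-viva.kef"),
        noise_suppression_level=90,
    )

    # Typically attached to the input transport, e.g.
    #   TransportParams(audio_in_filter=krisp_filter)
    # Note: filter() buffers audio and returns b"" until a full 10 ms frame is available.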

pipecat/audio/filters/noisereduce_filter.py
CHANGED
@@ -33,6 +33,10 @@ class NoisereduceFilter(BaseAudioFilter):
     Applies spectral gating noise reduction algorithms to suppress background
     noise in audio streams. Uses the noisereduce library's default noise
     reduction parameters.
+
+    .. deprecated:: 0.0.85
+        `NoisereduceFilter` is deprecated and will be removed in a future version.
+        We recommend using other real-time audio filters like `KrispFilter` or `AICFilter`.
     """
 
     def __init__(self) -> None:
@@ -40,6 +44,17 @@ class NoisereduceFilter(BaseAudioFilter):
         self._filtering = True
         self._sample_rate = 0
 
+        import warnings
+
+        with warnings.catch_warnings():
+            warnings.simplefilter("always")
+            warnings.warn(
+                "`NoisereduceFilter` is deprecated. "
+                "Use other real-time audio filters like `KrispFilter` or `AICFilter`.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+
     async def start(self, sample_rate: int):
         """Initialize the filter with the transport's sample rate.
 

pipecat/audio/turn/base_turn_analyzer.py
CHANGED
@@ -14,6 +14,8 @@ from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Optional, Tuple
 
+from pydantic import BaseModel
+
 from pipecat.metrics.metrics import MetricsData
 
 
@@ -29,6 +31,12 @@ class EndOfTurnState(Enum):
     INCOMPLETE = 2
 
 
+class BaseTurnParams(BaseModel):
+    """Base class for turn analyzer parameters."""
+
+    pass
+
+
 class BaseTurnAnalyzer(ABC):
     """Abstract base class for analyzing user end of turn.
 
@@ -78,7 +86,7 @@ class BaseTurnAnalyzer(ABC):
 
     @property
     @abstractmethod
-    def params(self):
+    def params(self) -> BaseTurnParams:
         """Get the current turn analyzer parameters.
 
         Returns:

pipecat/audio/turn/smart_turn/base_smart_turn.py
CHANGED
@@ -11,15 +11,17 @@ machine learning models to determine when a user has finished speaking, going
 beyond simple silence-based detection.
 """
 
+import asyncio
 import time
 from abc import abstractmethod
+from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Dict, Optional, Tuple
 
 import numpy as np
 from loguru import logger
 from pydantic import BaseModel
 
-from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, EndOfTurnState
+from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
 from pipecat.metrics.metrics import MetricsData, SmartTurnMetricsData
 
 # Default timing parameters
@@ -29,7 +31,7 @@ MAX_DURATION_SECONDS = 8  # Max allowed segment duration
 USE_ONLY_LAST_VAD_SEGMENT = True
 
 
-class SmartTurnParams(BaseModel):
+class SmartTurnParams(BaseTurnParams):
     """Configuration parameters for smart turn analysis.
 
     Parameters:
@@ -77,6 +79,9 @@ class BaseSmartTurn(BaseTurnAnalyzer):
         self._speech_triggered = False
         self._silence_ms = 0
         self._speech_start_time = 0
+        # Thread executor that will run the model. We only need one thread per
+        # analyzer because one analyzer just handles one audio stream.
+        self._executor = ThreadPoolExecutor(max_workers=1)
 
     @property
     def speech_triggered(self) -> bool:
@@ -151,7 +156,10 @@ class BaseSmartTurn(BaseTurnAnalyzer):
             Tuple containing the end-of-turn state and optional metrics data
             from the ML model analysis.
         """
-        state, result = await self._process_speech_segment(self._audio_buffer)
+        loop = asyncio.get_running_loop()
+        state, result = await loop.run_in_executor(
+            self._executor, self._process_speech_segment, self._audio_buffer
+        )
         if state == EndOfTurnState.COMPLETE or USE_ONLY_LAST_VAD_SEGMENT:
             self._clear(state)
         logger.debug(f"End of Turn result: {state}")
@@ -169,9 +177,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
         self._speech_start_time = 0
         self._silence_ms = 0
 
-    async def _process_speech_segment(
-        self, audio_buffer
-    ) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
+    def _process_speech_segment(self, audio_buffer) -> Tuple[EndOfTurnState, Optional[MetricsData]]:
         """Process accumulated audio segment using ML model."""
         state = EndOfTurnState.INCOMPLETE
 
@@ -203,7 +209,7 @@ class BaseSmartTurn(BaseTurnAnalyzer):
         if len(segment_audio) > 0:
             start_time = time.perf_counter()
             try:
-                result = await self._predict_endpoint(segment_audio)
+                result = self._predict_endpoint(segment_audio)
                 state = (
                     EndOfTurnState.COMPLETE
                     if result["prediction"] == 1
@@ -249,6 +255,6 @@ class BaseSmartTurn(BaseTurnAnalyzer):
         return state, result_data
 
     @abstractmethod
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
        """Predict end-of-turn using ML model from audio data."""
        pass
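The hunks above move blocking ML inference off the event loop: `_process_speech_segment` and `_predict_endpoint` become synchronous and are dispatched to a single-thread executor. A self-contained sketch of that pattern (illustrative names, not pipecat APIs):

    import asyncio
    import time
    from concurrent.futures import ThreadPoolExecutor

    # One worker thread is enough: each analyzer handles a single audio stream.
    executor = ThreadPoolExecutor(max_workers=1)


    def blocking_inference(audio_buffer: bytes) -> dict:
        # Stand-in for ONNX/PyTorch inference, which would otherwise block the loop.
        time.sleep(0.05)
        return {"prediction": 1, "probability": 0.93}


    async def analyze(audio_buffer: bytes) -> dict:
        loop = asyncio.get_running_loop()
        # The event loop keeps serving other tasks while inference runs in the thread.
        return await loop.run_in_executor(executor, blocking_inference, audio_buffer)


    print(asyncio.run(analyze(b"\x00" * 320)))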
pipecat/audio/turn/smart_turn/data/__init__.py: file without changes. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx: binary file (no text diff).

pipecat/audio/turn/smart_turn/http_smart_turn.py
CHANGED
@@ -104,11 +104,15 @@ class HttpSmartTurnAnalyzer(BaseSmartTurn):
             logger.error(f"Failed to send raw request to Daily Smart Turn: {e}")
             raise Exception("Failed to send raw request to Daily Smart Turn.")
 
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
         """Predict end-of-turn using remote HTTP ML service."""
         try:
             serialized_array = self._serialize_array(audio_array)
-            return await self._send_raw_request(serialized_array)
+            loop = asyncio.get_running_loop()
+            future = asyncio.run_coroutine_threadsafe(
+                self._send_raw_request(serialized_array), loop
+            )
+            return future.result()
         except Exception as e:
             logger.error(f"Smart turn prediction failed: {str(e)}")
             # Return an incomplete prediction when a failure occurs
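Because `_predict_endpoint` now runs synchronously, the async HTTP call is scheduled back onto the event loop with `run_coroutine_threadsafe` and the calling thread blocks on the resulting future. A standalone sketch of that hand-off (illustrative names; here the loop is passed in explicitly):

    import asyncio
    from concurrent.futures import ThreadPoolExecutor


    async def send_request(payload: bytes) -> dict:
        # Stand-in for the analyzer's HTTP POST to the smart-turn service.
        await asyncio.sleep(0.05)
        return {"prediction": 1, "probability": 0.9}


    def predict_on_worker_thread(payload: bytes, loop: asyncio.AbstractEventLoop) -> dict:
        # Schedule the coroutine on the main event loop and block this worker
        # thread until its result is available.
        future = asyncio.run_coroutine_threadsafe(send_request(payload), loop)
        return future.result()


    async def main() -> None:
        loop = asyncio.get_running_loop()
        with ThreadPoolExecutor(max_workers=1) as pool:
            result = await loop.run_in_executor(pool, predict_on_worker_thread, b"audio", loop)
            print(result)


    asyncio.run(main())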

pipecat/audio/turn/smart_turn/local_smart_turn.py
CHANGED
@@ -64,7 +64,7 @@ class LocalSmartTurnAnalyzer(BaseSmartTurn):
         self._turn_model.eval()
         logger.debug("Loaded Local Smart Turn")
 
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
         """Predict end-of-turn using local PyTorch model."""
         inputs = self._turn_processor(
             audio_array,

pipecat/audio/turn/smart_turn/local_smart_turn_v2.py
CHANGED
@@ -73,7 +73,7 @@ class LocalSmartTurnAnalyzerV2(BaseSmartTurn):
         self._turn_model.eval()
         logger.debug("Loaded Local Smart Turn v2")
 
-    async def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
         """Predict end-of-turn using local PyTorch model."""
         inputs = self._turn_processor(
             audio_array,

pipecat/audio/turn/smart_turn/local_smart_turn_v3.py
ADDED
@@ -0,0 +1,124 @@
+#
+# Copyright (c) 2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Local turn analyzer for on-device ML inference using the smart-turn-v3 model.
+
+This module provides a smart turn analyzer that uses an ONNX model for
+local end-of-turn detection without requiring network connectivity.
+"""
+
+from typing import Any, Dict, Optional
+
+import numpy as np
+from loguru import logger
+
+from pipecat.audio.turn.smart_turn.base_smart_turn import BaseSmartTurn
+
+try:
+    import onnxruntime as ort
+    from transformers import WhisperFeatureExtractor
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error(
+        "In order to use LocalSmartTurnAnalyzerV3, you need to `pip install pipecat-ai[local-smart-turn-v3]`."
+    )
+    raise Exception(f"Missing module: {e}")
+
+
+class LocalSmartTurnAnalyzerV3(BaseSmartTurn):
+    """Local turn analyzer using the smart-turn-v3 ONNX model.
+
+    Provides end-of-turn detection using locally-stored ONNX model,
+    enabling offline operation without network dependencies.
+    """
+
+    def __init__(self, *, smart_turn_model_path: Optional[str] = None, **kwargs):
+        """Initialize the local ONNX smart-turn-v3 analyzer.
+
+        Args:
+            smart_turn_model_path: Path to the ONNX model file. If this is not
+                set, the bundled smart-turn-v3.0 model will be used.
+            **kwargs: Additional arguments passed to BaseSmartTurn.
+        """
+        super().__init__(**kwargs)
+
+        logger.debug("Loading Local Smart Turn v3 model...")
+
+        if not smart_turn_model_path:
+            # Load bundled model
+            model_name = "smart-turn-v3.0.onnx"
+            package_path = "pipecat.audio.turn.smart_turn.data"
+
+            try:
+                import importlib_resources as impresources
+
+                smart_turn_model_path = str(impresources.files(package_path).joinpath(model_name))
+            except BaseException:
+                from importlib import resources as impresources
+
+                try:
+                    with impresources.path(package_path, model_name) as f:
+                        smart_turn_model_path = f
+                except BaseException:
+                    smart_turn_model_path = str(
+                        impresources.files(package_path).joinpath(model_name)
+                    )
+
+        so = ort.SessionOptions()
+        so.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+        so.inter_op_num_threads = 1
+        so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        self._feature_extractor = WhisperFeatureExtractor(chunk_length=8)
+        self._session = ort.InferenceSession(smart_turn_model_path, sess_options=so)
+
+        logger.debug("Loaded Local Smart Turn v3")
+
+    def _predict_endpoint(self, audio_array: np.ndarray) -> Dict[str, Any]:
+        """Predict end-of-turn using local ONNX model."""
+
+        def truncate_audio_to_last_n_seconds(audio_array, n_seconds=8, sample_rate=16000):
+            """Truncate audio to last n seconds or pad with zeros to meet n seconds."""
+            max_samples = n_seconds * sample_rate
+            if len(audio_array) > max_samples:
+                return audio_array[-max_samples:]
+            elif len(audio_array) < max_samples:
+                # Pad with zeros at the beginning
+                padding = max_samples - len(audio_array)
+                return np.pad(audio_array, (padding, 0), mode="constant", constant_values=0)
+            return audio_array
+
+        # Truncate to 8 seconds (keeping the end) or pad to 8 seconds
+        audio_array = truncate_audio_to_last_n_seconds(audio_array, n_seconds=8)
+
+        # Process audio using Whisper's feature extractor
+        inputs = self._feature_extractor(
+            audio_array,
+            sampling_rate=16000,
+            return_tensors="np",
+            padding="max_length",
+            max_length=8 * 16000,
+            truncation=True,
+            do_normalize=True,
+        )
+
+        # Extract features and ensure correct shape for ONNX
+        input_features = inputs.input_features.squeeze(0).astype(np.float32)
+        input_features = np.expand_dims(input_features, axis=0)  # Add batch dimension
+
+        # Run ONNX inference
+        outputs = self._session.run(None, {"input_features": input_features})
+
+        # Extract probability (ONNX model returns sigmoid probabilities)
+        probability = outputs[0][0].item()
+
+        # Make prediction (1 for Complete, 0 for Incomplete)
+        prediction = 1 if probability > 0.5 else 0
+
+        return {
+            "prediction": prediction,
+            "probability": probability,
+        }
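A minimal usage sketch, assuming the optional dependencies from the `local-smart-turn-v3` extra are installed:

    from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3

    # With no arguments the bundled smart-turn-v3.0.onnx model is used; pass
    # smart_turn_model_path to point at a different ONNX file.
    turn_analyzer = LocalSmartTurnAnalyzerV3()

    # Typically handed to a transport alongside a VAD, e.g. (parameter names as
    # used elsewhere in pipecat transports):
    #   TransportParams(vad_analyzer=SileroVADAnalyzer(), turn_analyzer=turn_analyzer)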

pipecat/audio/vad/data/README.md
ADDED
@@ -0,0 +1,10 @@
+This directory contains packaged VAD model files used by Pipecat.
+
+- `silero_vad.onnx`: Default Silero VAD model shipped with the package.
+- `silero_vad_v2.onnx`: Alternate model used when Arabic (codes starting with `ar`) is present
+  in the call configuration (primary `language` or any `add_langs`). This file is optional.
+
+If `silero_vad_v2.onnx` is not present or fails to load, Pipecat will automatically fall back
+to `silero_vad.onnx` and log a warning. To enable the Arabic-optimized model, place a valid
+ONNX file at this path with the exact filename.
+
Binary file: pipecat/audio/vad/data/silero_vad_v2.onnx (no text diff).

pipecat/audio/vad/silero.py
CHANGED
@@ -135,7 +135,13 @@ class SileroVADAnalyzer(VADAnalyzer):
     with automatic model state management and periodic resets.
     """
 
-    def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
+    def __init__(
+        self,
+        *,
+        sample_rate: Optional[int] = None,
+        params: Optional[VADParams] = None,
+        model_name: Optional[str] = None,
+    ):
         """Initialize the Silero VAD analyzer.
 
         Args:
@@ -146,7 +152,7 @@ class SileroVADAnalyzer(VADAnalyzer):
 
         logger.debug("Loading Silero VAD model...")
 
-        model_name = "silero_vad.onnx"
+        model_name = model_name or "silero_vad.onnx"
         package_path = "pipecat.audio.vad.data"
 
         try:
@@ -166,7 +172,7 @@ class SileroVADAnalyzer(VADAnalyzer):
 
         self._last_reset_time = 0
 
-        logger.debug("Loaded Silero VAD")
+        logger.debug(f"Loaded Silero VAD {model_file_path}")
 
     #
     # VADAnalyzer