dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/audio/vad/silero.py
CHANGED
|
@@ -4,6 +4,13 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Silero Voice Activity Detection (VAD) implementation for Pipecat.
|
|
8
|
+
|
|
9
|
+
This module provides a VAD analyzer based on the Silero VAD ONNX model,
|
|
10
|
+
which can detect voice activity in audio streams with high accuracy.
|
|
11
|
+
Supports 8kHz and 16kHz sample rates.
|
|
12
|
+
"""
|
|
13
|
+
|
|
7
14
|
import time
|
|
8
15
|
from typing import Optional
|
|
9
16
|
|
|
@@ -25,11 +32,20 @@ except ModuleNotFoundError as e:
|
|
|
25
32
|
|
|
26
33
|
|
|
27
34
|
class SileroOnnxModel:
|
|
28
|
-
|
|
29
|
-
import numpy as np
|
|
35
|
+
"""ONNX runtime wrapper for the Silero VAD model.
|
|
30
36
|
|
|
31
|
-
|
|
37
|
+
Provides voice activity detection using the pre-trained Silero VAD model
|
|
38
|
+
with ONNX runtime for efficient inference. Handles model state management
|
|
39
|
+
and input validation for audio processing.
|
|
40
|
+
"""
|
|
32
41
|
|
|
42
|
+
def __init__(self, path, force_onnx_cpu=True):
|
|
43
|
+
"""Initialize the Silero ONNX model.
|
|
44
|
+
|
|
45
|
+
Args:
|
|
46
|
+
path: Path to the ONNX model file.
|
|
47
|
+
force_onnx_cpu: Whether to force CPU execution provider.
|
|
48
|
+
"""
|
|
33
49
|
opts = onnxruntime.SessionOptions()
|
|
34
50
|
opts.inter_op_num_threads = 1
|
|
35
51
|
opts.intra_op_num_threads = 1
|
|
@@ -45,6 +61,7 @@ class SileroOnnxModel:
|
|
|
45
61
|
self.sample_rates = [8000, 16000]
|
|
46
62
|
|
|
47
63
|
def _validate_input(self, x, sr: int):
|
|
64
|
+
"""Validate and preprocess input audio data."""
|
|
48
65
|
if np.ndim(x) == 1:
|
|
49
66
|
x = np.expand_dims(x, 0)
|
|
50
67
|
if np.ndim(x) > 2:
|
|
@@ -60,12 +77,18 @@ class SileroOnnxModel:
|
|
|
60
77
|
return x, sr
|
|
61
78
|
|
|
62
79
|
def reset_states(self, batch_size=1):
|
|
80
|
+
"""Reset the internal model states.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
batch_size: Batch size for state initialization. Defaults to 1.
|
|
84
|
+
"""
|
|
63
85
|
self._state = np.zeros((2, batch_size, 128), dtype="float32")
|
|
64
86
|
self._context = np.zeros((batch_size, 0), dtype="float32")
|
|
65
87
|
self._last_sr = 0
|
|
66
88
|
self._last_batch_size = 0
|
|
67
89
|
|
|
68
90
|
def __call__(self, x, sr: int):
|
|
91
|
+
"""Process audio input through the VAD model."""
|
|
69
92
|
x, sr = self._validate_input(x, sr)
|
|
70
93
|
num_samples = 512 if sr == 16000 else 256
|
|
71
94
|
|
|
@@ -105,7 +128,20 @@ class SileroOnnxModel:
|
|
|
105
128
|
|
|
106
129
|
|
|
107
130
|
class SileroVADAnalyzer(VADAnalyzer):
|
|
131
|
+
"""Voice Activity Detection analyzer using the Silero VAD model.
|
|
132
|
+
|
|
133
|
+
Implements VAD analysis using the pre-trained Silero ONNX model for
|
|
134
|
+
accurate voice activity detection. Supports 8kHz and 16kHz sample rates
|
|
135
|
+
with automatic model state management and periodic resets.
|
|
136
|
+
"""
|
|
137
|
+
|
|
108
138
|
def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
|
|
139
|
+
"""Initialize the Silero VAD analyzer.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
sample_rate: Audio sample rate (8000 or 16000 Hz). If None, will be set later.
|
|
143
|
+
params: VAD parameters for detection thresholds and timing.
|
|
144
|
+
"""
|
|
109
145
|
super().__init__(sample_rate=sample_rate, params=params)
|
|
110
146
|
|
|
111
147
|
logger.debug("Loading Silero VAD model...")
|
|
@@ -137,6 +173,14 @@ class SileroVADAnalyzer(VADAnalyzer):
|
|
|
137
173
|
#
|
|
138
174
|
|
|
139
175
|
def set_sample_rate(self, sample_rate: int):
|
|
176
|
+
"""Set the sample rate for audio processing.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
sample_rate: Audio sample rate (must be 8000 or 16000 Hz).
|
|
180
|
+
|
|
181
|
+
Raises:
|
|
182
|
+
ValueError: If sample rate is not 8000 or 16000 Hz.
|
|
183
|
+
"""
|
|
140
184
|
if sample_rate != 16000 and sample_rate != 8000:
|
|
141
185
|
raise ValueError(
|
|
142
186
|
f"Silero VAD sample rate needs to be 16000 or 8000 (sample rate: {sample_rate})"
|
|
@@ -145,9 +189,22 @@ class SileroVADAnalyzer(VADAnalyzer):
|
|
|
145
189
|
super().set_sample_rate(sample_rate)
|
|
146
190
|
|
|
147
191
|
def num_frames_required(self) -> int:
|
|
192
|
+
"""Get the number of audio frames required for VAD analysis.
|
|
193
|
+
|
|
194
|
+
Returns:
|
|
195
|
+
Number of frames required (512 for 16kHz, 256 for 8kHz).
|
|
196
|
+
"""
|
|
148
197
|
return 512 if self.sample_rate == 16000 else 256
|
|
149
198
|
|
|
150
199
|
def voice_confidence(self, buffer) -> float:
|
|
200
|
+
"""Calculate voice activity confidence for the given audio buffer.
|
|
201
|
+
|
|
202
|
+
Args:
|
|
203
|
+
buffer: Audio buffer to analyze.
|
|
204
|
+
|
|
205
|
+
Returns:
|
|
206
|
+
Voice confidence score between 0.0 and 1.0.
|
|
207
|
+
"""
|
|
151
208
|
try:
|
|
152
209
|
audio_int16 = np.frombuffer(buffer, np.int16)
|
|
153
210
|
# Divide by 32768 because we have signed 16-bit data.
|
pipecat/audio/vad/vad_analyzer.py
CHANGED
|
@@ -4,6 +4,13 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Voice Activity Detection (VAD) analyzer base classes and utilities.
|
|
8
|
+
|
|
9
|
+
This module provides the abstract base class for VAD analyzers and associated
|
|
10
|
+
data structures for voice activity detection in audio streams. Includes state
|
|
11
|
+
management, parameter configuration, and audio analysis framework.
|
|
12
|
+
"""
|
|
13
|
+
|
|
7
14
|
from abc import ABC, abstractmethod
|
|
8
15
|
from enum import Enum
|
|
9
16
|
from typing import Optional
|
|
@@ -21,6 +28,15 @@ Logger = type(logger)
|
|
|
21
28
|
|
|
22
29
|
|
|
23
30
|
class VADState(Enum):
|
|
31
|
+
"""Voice Activity Detection states.
|
|
32
|
+
|
|
33
|
+
Parameters:
|
|
34
|
+
QUIET: No voice activity detected.
|
|
35
|
+
STARTING: Voice activity beginning, transitioning from quiet.
|
|
36
|
+
SPEAKING: Active voice detected and confirmed.
|
|
37
|
+
STOPPING: Voice activity ending, transitioning to quiet.
|
|
38
|
+
"""
|
|
39
|
+
|
|
24
40
|
QUIET = 1
|
|
25
41
|
STARTING = 2
|
|
26
42
|
SPEAKING = 3
|
|
@@ -28,6 +44,15 @@ class VADState(Enum):
|
|
|
28
44
|
|
|
29
45
|
|
|
30
46
|
class VADParams(BaseModel):
|
|
47
|
+
"""Configuration parameters for Voice Activity Detection.
|
|
48
|
+
|
|
49
|
+
Parameters:
|
|
50
|
+
confidence: Minimum confidence threshold for voice detection.
|
|
51
|
+
start_secs: Duration to wait before confirming voice start.
|
|
52
|
+
stop_secs: Duration to wait before confirming voice stop.
|
|
53
|
+
min_volume: Minimum audio volume threshold for voice detection.
|
|
54
|
+
"""
|
|
55
|
+
|
|
31
56
|
confidence: float = VAD_CONFIDENCE
|
|
32
57
|
start_secs: float = VAD_START_SECS
|
|
33
58
|
stop_secs: float = VAD_STOP_SECS
|
|
@@ -35,7 +60,20 @@ class VADParams(BaseModel):
|
|
|
35
60
|
|
|
36
61
|
|
|
37
62
|
class VADAnalyzer(ABC):
|
|
63
|
+
"""Abstract base class for Voice Activity Detection analyzers.
|
|
64
|
+
|
|
65
|
+
Provides the framework for implementing VAD analysis with configurable
|
|
66
|
+
parameters, state management, and audio processing capabilities.
|
|
67
|
+
Subclasses must implement the core voice confidence calculation.
|
|
68
|
+
"""
|
|
69
|
+
|
|
38
70
|
def __init__(self, *, sample_rate: Optional[int] = None, params: Optional[VADParams] = None):
|
|
71
|
+
"""Initialize the VAD analyzer.
|
|
72
|
+
|
|
73
|
+
Args:
|
|
74
|
+
sample_rate: Audio sample rate in Hz. If None, will be set later.
|
|
75
|
+
params: VAD parameters for detection configuration.
|
|
76
|
+
"""
|
|
39
77
|
self._init_sample_rate = sample_rate
|
|
40
78
|
self._sample_rate = 0
|
|
41
79
|
self._params = params or VADParams()
|
|
@@ -49,25 +87,58 @@ class VADAnalyzer(ABC):
|
|
|
49
87
|
|
|
50
88
|
@property
|
|
51
89
|
def sample_rate(self) -> int:
|
|
90
|
+
"""Get the current sample rate.
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
Current audio sample rate in Hz.
|
|
94
|
+
"""
|
|
52
95
|
return self._sample_rate
|
|
53
96
|
|
|
54
97
|
@property
|
|
55
98
|
def num_channels(self) -> int:
|
|
99
|
+
"""Get the number of audio channels.
|
|
100
|
+
|
|
101
|
+
Returns:
|
|
102
|
+
Number of audio channels (always 1 for mono).
|
|
103
|
+
"""
|
|
56
104
|
return self._num_channels
|
|
57
105
|
|
|
58
106
|
@property
|
|
59
107
|
def params(self) -> VADParams:
|
|
108
|
+
"""Get the current VAD parameters.
|
|
109
|
+
|
|
110
|
+
Returns:
|
|
111
|
+
Current VAD configuration parameters.
|
|
112
|
+
"""
|
|
60
113
|
return self._params
|
|
61
114
|
|
|
62
115
|
@abstractmethod
|
|
63
116
|
def num_frames_required(self) -> int:
|
|
117
|
+
"""Get the number of audio frames required for analysis.
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
Number of frames needed for VAD processing.
|
|
121
|
+
"""
|
|
64
122
|
pass
|
|
65
123
|
|
|
66
124
|
@abstractmethod
|
|
67
125
|
def voice_confidence(self, buffer) -> float:
|
|
126
|
+
"""Calculate voice activity confidence for the given audio buffer.
|
|
127
|
+
|
|
128
|
+
Args:
|
|
129
|
+
buffer: Audio buffer to analyze.
|
|
130
|
+
|
|
131
|
+
Returns:
|
|
132
|
+
Voice confidence score between 0.0 and 1.0.
|
|
133
|
+
"""
|
|
68
134
|
pass
|
|
69
135
|
|
|
70
136
|
def set_sample_rate(self, sample_rate: int):
|
|
137
|
+
"""Set the sample rate for audio processing.
|
|
138
|
+
|
|
139
|
+
Args:
|
|
140
|
+
sample_rate: Audio sample rate in Hz.
|
|
141
|
+
"""
|
|
71
142
|
self._sample_rate = self._init_sample_rate or sample_rate
|
|
72
143
|
self.set_params(self._params)
|
|
73
144
|
|
|
@@ -86,46 +157,59 @@ class VADAnalyzer(ABC):
|
|
|
86
157
|
self._vad_state: VADState = VADState.QUIET
|
|
87
158
|
|
|
88
159
|
def _get_smoothed_volume(self, audio: bytes) -> float:
|
|
160
|
+
"""Calculate smoothed audio volume using exponential smoothing."""
|
|
89
161
|
volume = calculate_audio_volume(audio, self.sample_rate)
|
|
90
162
|
return exp_smoothing(volume, self._prev_volume, self._smoothing_factor)
|
|
91
163
|
|
|
92
164
|
def analyze_audio(self, buffer) -> VADState:
|
|
165
|
+
"""Analyze audio buffer and return current VAD state.
|
|
166
|
+
|
|
167
|
+
Processes incoming audio data, maintains internal state, and determines
|
|
168
|
+
voice activity status based on confidence and volume thresholds.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
buffer: Audio buffer to analyze.
|
|
172
|
+
|
|
173
|
+
Returns:
|
|
174
|
+
Current VAD state after processing the buffer.
|
|
175
|
+
"""
|
|
93
176
|
self._vad_buffer += buffer
|
|
94
177
|
|
|
95
178
|
num_required_bytes = self._vad_frames_num_bytes
|
|
96
179
|
if len(self._vad_buffer) < num_required_bytes:
|
|
97
180
|
return self._vad_state
|
|
98
181
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
182
|
+
while len(self._vad_buffer) >= num_required_bytes:
|
|
183
|
+
audio_frames = self._vad_buffer[:num_required_bytes]
|
|
184
|
+
self._vad_buffer = self._vad_buffer[num_required_bytes:]
|
|
185
|
+
|
|
186
|
+
confidence = self.voice_confidence(audio_frames)
|
|
187
|
+
|
|
188
|
+
volume = self._get_smoothed_volume(audio_frames)
|
|
189
|
+
self._prev_volume = volume
|
|
190
|
+
|
|
191
|
+
speaking = confidence >= self._params.confidence and volume >= self._params.min_volume
|
|
192
|
+
|
|
193
|
+
if speaking:
|
|
194
|
+
match self._vad_state:
|
|
195
|
+
case VADState.QUIET:
|
|
196
|
+
self._vad_state = VADState.STARTING
|
|
197
|
+
self._vad_starting_count = 1
|
|
198
|
+
case VADState.STARTING:
|
|
199
|
+
self._vad_starting_count += 1
|
|
200
|
+
case VADState.STOPPING:
|
|
201
|
+
self._vad_state = VADState.SPEAKING
|
|
202
|
+
self._vad_stopping_count = 0
|
|
203
|
+
else:
|
|
204
|
+
match self._vad_state:
|
|
205
|
+
case VADState.STARTING:
|
|
206
|
+
self._vad_state = VADState.QUIET
|
|
207
|
+
self._vad_starting_count = 0
|
|
208
|
+
case VADState.SPEAKING:
|
|
209
|
+
self._vad_state = VADState.STOPPING
|
|
210
|
+
self._vad_stopping_count = 1
|
|
211
|
+
case VADState.STOPPING:
|
|
212
|
+
self._vad_stopping_count += 1
|
|
129
213
|
|
|
130
214
|
if (
|
|
131
215
|
self._vad_state == VADState.STARTING
|
pipecat/clocks/base_clock.py
CHANGED
|
@@ -4,14 +4,33 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Base clock interface for Pipecat timing operations."""
|
|
8
|
+
|
|
7
9
|
from abc import ABC, abstractmethod
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class BaseClock(ABC):
|
|
13
|
+
"""Abstract base class for clock implementations.
|
|
14
|
+
|
|
15
|
+
Provides a common interface for timing operations used in Pipecat
|
|
16
|
+
for synchronization, scheduling, and time-based processing.
|
|
17
|
+
"""
|
|
18
|
+
|
|
11
19
|
@abstractmethod
|
|
12
20
|
def get_time(self) -> int:
|
|
21
|
+
"""Get the current time value.
|
|
22
|
+
|
|
23
|
+
Returns:
|
|
24
|
+
The current time as an integer value. The specific unit and
|
|
25
|
+
reference point depend on the concrete implementation.
|
|
26
|
+
"""
|
|
13
27
|
pass
|
|
14
28
|
|
|
15
29
|
@abstractmethod
|
|
16
30
|
def start(self):
|
|
31
|
+
"""Start or initialize the clock.
|
|
32
|
+
|
|
33
|
+
Performs any necessary initialization or starts the timing mechanism.
|
|
34
|
+
This method should be called before using get_time().
|
|
35
|
+
"""
|
|
17
36
|
pass
|
pipecat/clocks/system_clock.py
CHANGED
|
@@ -4,17 +4,42 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""System clock implementation for Pipecat."""
|
|
8
|
+
|
|
7
9
|
import time
|
|
8
10
|
|
|
9
11
|
from pipecat.clocks.base_clock import BaseClock
|
|
10
12
|
|
|
11
13
|
|
|
12
14
|
class SystemClock(BaseClock):
|
|
15
|
+
"""A monotonic clock implementation using system time.
|
|
16
|
+
|
|
17
|
+
Provides high-precision timing using the system's monotonic clock,
|
|
18
|
+
which is not affected by system clock adjustments and is suitable
|
|
19
|
+
for measuring elapsed time in real-time applications.
|
|
20
|
+
"""
|
|
21
|
+
|
|
13
22
|
def __init__(self):
|
|
23
|
+
"""Initialize the system clock.
|
|
24
|
+
|
|
25
|
+
The clock starts in an uninitialized state and must be started
|
|
26
|
+
explicitly using the start() method before time measurement begins.
|
|
27
|
+
"""
|
|
14
28
|
self._time = 0
|
|
15
29
|
|
|
16
30
|
def get_time(self) -> int:
|
|
31
|
+
"""Get the elapsed time since the clock was started.
|
|
32
|
+
|
|
33
|
+
Returns:
|
|
34
|
+
The elapsed time in nanoseconds since start() was called.
|
|
35
|
+
Returns 0 if the clock has not been started yet.
|
|
36
|
+
"""
|
|
17
37
|
return time.monotonic_ns() - self._time if self._time > 0 else 0
|
|
18
38
|
|
|
19
39
|
def start(self):
|
|
40
|
+
"""Start the clock and begin time measurement.
|
|
41
|
+
|
|
42
|
+
Records the current monotonic time as the reference point
|
|
43
|
+
for all subsequent get_time() calls.
|
|
44
|
+
"""
|
|
20
45
|
self._time = time.monotonic_ns()
|
|
File without changes
|