dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
|
@@ -4,6 +4,13 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Soundfile-based audio mixer for file playback integration.
|
|
8
|
+
|
|
9
|
+
Provides an audio mixer that combines incoming audio with audio loaded from
|
|
10
|
+
files using the soundfile library. Supports multiple audio formats and
|
|
11
|
+
runtime configuration changes.
|
|
12
|
+
"""
|
|
13
|
+
|
|
7
14
|
import asyncio
|
|
8
15
|
from typing import Any, Dict, Mapping
|
|
9
16
|
|
|
@@ -24,7 +31,9 @@ except ModuleNotFoundError as e:
|
|
|
24
31
|
|
|
25
32
|
|
|
26
33
|
class SoundfileMixer(BaseAudioMixer):
|
|
27
|
-
"""
|
|
34
|
+
"""Audio mixer that combines incoming audio with file-based audio.
|
|
35
|
+
|
|
36
|
+
This is an audio mixer that mixes incoming audio with audio from a
|
|
28
37
|
file. It uses the soundfile library to load files so it supports multiple
|
|
29
38
|
formats. The audio files need to have only one channel (mono) and need
|
|
30
39
|
to match the sample rate of the output transport.
|
|
@@ -33,7 +42,6 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
33
42
|
`MixerUpdateSettingsFrame` has the following settings available: `sound`
|
|
34
43
|
(str) and `volume` (float) to be able to update to a different sound file or
|
|
35
44
|
to change the volume at runtime.
|
|
36
|
-
|
|
37
45
|
"""
|
|
38
46
|
|
|
39
47
|
def __init__(
|
|
@@ -46,6 +54,16 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
46
54
|
loop: bool = True,
|
|
47
55
|
**kwargs,
|
|
48
56
|
):
|
|
57
|
+
"""Initialize the soundfile mixer.
|
|
58
|
+
|
|
59
|
+
Args:
|
|
60
|
+
sound_files: Mapping of sound names to file paths for loading.
|
|
61
|
+
default_sound: Name of the default sound to play initially.
|
|
62
|
+
volume: Mixing volume level (0.0 to 1.0). Defaults to 0.4.
|
|
63
|
+
mixing: Whether mixing is initially enabled. Defaults to True.
|
|
64
|
+
loop: Whether to loop audio files when they end. Defaults to True.
|
|
65
|
+
**kwargs: Additional arguments passed to parent class.
|
|
66
|
+
"""
|
|
49
67
|
super().__init__(**kwargs)
|
|
50
68
|
self._sound_files = sound_files
|
|
51
69
|
self._volume = volume
|
|
@@ -58,14 +76,28 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
58
76
|
self._loop = loop
|
|
59
77
|
|
|
60
78
|
async def start(self, sample_rate: int):
|
|
79
|
+
"""Initialize the mixer and load all sound files.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
sample_rate: The sample rate of the output transport in Hz.
|
|
83
|
+
"""
|
|
61
84
|
self._sample_rate = sample_rate
|
|
62
85
|
for sound_name, file_name in self._sound_files.items():
|
|
63
86
|
await asyncio.to_thread(self._load_sound_file, sound_name, file_name)
|
|
64
87
|
|
|
65
88
|
async def stop(self):
|
|
89
|
+
"""Clean up mixer resources.
|
|
90
|
+
|
|
91
|
+
Currently performs no cleanup as sound data is managed by garbage collection.
|
|
92
|
+
"""
|
|
66
93
|
pass
|
|
67
94
|
|
|
68
95
|
async def process_frame(self, frame: MixerControlFrame):
|
|
96
|
+
"""Process mixer control frames to update settings or enable/disable mixing.
|
|
97
|
+
|
|
98
|
+
Args:
|
|
99
|
+
frame: The mixer control frame to process.
|
|
100
|
+
"""
|
|
69
101
|
if isinstance(frame, MixerUpdateSettingsFrame):
|
|
70
102
|
await self._update_settings(frame)
|
|
71
103
|
elif isinstance(frame, MixerEnableFrame):
|
|
@@ -73,12 +105,22 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
73
105
|
pass
|
|
74
106
|
|
|
75
107
|
async def mix(self, audio: bytes) -> bytes:
|
|
108
|
+
"""Mix transport audio with the current sound file.
|
|
109
|
+
|
|
110
|
+
Args:
|
|
111
|
+
audio: Raw audio bytes from the transport to mix.
|
|
112
|
+
|
|
113
|
+
Returns:
|
|
114
|
+
Mixed audio bytes combining transport and file audio.
|
|
115
|
+
"""
|
|
76
116
|
return self._mix_with_sound(audio)
|
|
77
117
|
|
|
78
118
|
async def _enable_mixing(self, enable: bool):
|
|
119
|
+
"""Enable or disable audio mixing."""
|
|
79
120
|
self._mixing = enable
|
|
80
121
|
|
|
81
122
|
async def _update_settings(self, frame: MixerUpdateSettingsFrame):
|
|
123
|
+
"""Update mixer settings from a control frame."""
|
|
82
124
|
for setting, value in frame.settings.items():
|
|
83
125
|
match setting:
|
|
84
126
|
case "sound":
|
|
@@ -89,6 +131,11 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
89
131
|
await self._update_loop(value)
|
|
90
132
|
|
|
91
133
|
async def _change_sound(self, sound: str):
|
|
134
|
+
"""Change the currently playing sound file.
|
|
135
|
+
|
|
136
|
+
Args:
|
|
137
|
+
sound: Name of the sound file to switch to.
|
|
138
|
+
"""
|
|
92
139
|
if sound in self._sound_files:
|
|
93
140
|
self._current_sound = sound
|
|
94
141
|
self._sound_pos = 0
|
|
@@ -96,12 +143,15 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
96
143
|
logger.error(f"Sound {sound} is not available")
|
|
97
144
|
|
|
98
145
|
async def _update_volume(self, volume: float):
|
|
146
|
+
"""Update the mixing volume level."""
|
|
99
147
|
self._volume = volume
|
|
100
148
|
|
|
101
149
|
async def _update_loop(self, loop: bool):
|
|
150
|
+
"""Update the looping behavior."""
|
|
102
151
|
self._loop = loop
|
|
103
152
|
|
|
104
153
|
def _load_sound_file(self, sound_name: str, file_name: str):
|
|
154
|
+
"""Load an audio file into memory for mixing."""
|
|
105
155
|
try:
|
|
106
156
|
logger.debug(f"Loading mixer sound from {file_name}")
|
|
107
157
|
sound, sample_rate = sf.read(file_name, dtype="int16")
|
|
@@ -118,10 +168,7 @@ class SoundfileMixer(BaseAudioMixer):
|
|
|
118
168
|
logger.error(f"Unable to open file {file_name}: {e}")
|
|
119
169
|
|
|
120
170
|
def _mix_with_sound(self, audio: bytes):
|
|
121
|
-
"""
|
|
122
|
-
file.
|
|
123
|
-
|
|
124
|
-
"""
|
|
171
|
+
"""Mix raw audio frames with chunks of the same length from the sound file."""
|
|
125
172
|
if not self._mixing or not self._current_sound in self._sounds:
|
|
126
173
|
return audio
|
|
127
174
|
|
|
@@ -4,27 +4,35 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Base audio resampler interface for Pipecat.
|
|
8
|
+
|
|
9
|
+
This module defines the abstract base class for audio resampling implementations,
|
|
10
|
+
providing a common interface for converting audio between different sample rates.
|
|
11
|
+
"""
|
|
12
|
+
|
|
7
13
|
from abc import ABC, abstractmethod
|
|
8
14
|
|
|
9
15
|
|
|
10
16
|
class BaseAudioResampler(ABC):
|
|
11
|
-
"""Abstract base class for audio resampling.
|
|
12
|
-
|
|
17
|
+
"""Abstract base class for audio resampling implementations.
|
|
18
|
+
|
|
19
|
+
This class defines the interface that all audio resampling implementations
|
|
20
|
+
must follow, providing a standardized way to convert audio data between
|
|
21
|
+
different sample rates.
|
|
13
22
|
"""
|
|
14
23
|
|
|
15
24
|
@abstractmethod
|
|
16
25
|
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
|
17
|
-
"""
|
|
18
|
-
Resamples the given audio data to a different sample rate.
|
|
26
|
+
"""Resamples the given audio data to a different sample rate.
|
|
19
27
|
|
|
20
28
|
This is an abstract method that must be implemented in subclasses.
|
|
21
29
|
|
|
22
|
-
|
|
23
|
-
audio
|
|
24
|
-
in_rate
|
|
25
|
-
out_rate
|
|
30
|
+
Args:
|
|
31
|
+
audio: The audio data to be resampled, as raw bytes.
|
|
32
|
+
in_rate: The original sample rate of the audio data in Hz.
|
|
33
|
+
out_rate: The desired sample rate for the output audio in Hz.
|
|
26
34
|
|
|
27
35
|
Returns:
|
|
28
|
-
|
|
36
|
+
The resampled audio data as raw bytes.
|
|
29
37
|
"""
|
|
30
38
|
pass
|
|
@@ -4,6 +4,12 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Resampy-based audio resampler implementation.
|
|
8
|
+
|
|
9
|
+
This module provides an audio resampler that uses the resampy library
|
|
10
|
+
for high-quality audio sample rate conversion.
|
|
11
|
+
"""
|
|
12
|
+
|
|
7
13
|
import numpy as np
|
|
8
14
|
import resampy
|
|
9
15
|
|
|
@@ -11,12 +17,31 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
|
|
11
17
|
|
|
12
18
|
|
|
13
19
|
class ResampyResampler(BaseAudioResampler):
|
|
14
|
-
"""Audio resampler implementation using the resampy library.
|
|
20
|
+
"""Audio resampler implementation using the resampy library.
|
|
21
|
+
|
|
22
|
+
This resampler uses the resampy library's Kaiser windowing filter
|
|
23
|
+
for high-quality audio resampling with good performance characteristics.
|
|
24
|
+
"""
|
|
15
25
|
|
|
16
26
|
def __init__(self, **kwargs):
|
|
27
|
+
"""Initialize the resampy resampler.
|
|
28
|
+
|
|
29
|
+
Args:
|
|
30
|
+
**kwargs: Additional keyword arguments (currently unused).
|
|
31
|
+
"""
|
|
17
32
|
pass
|
|
18
33
|
|
|
19
34
|
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
|
35
|
+
"""Resample audio data using resampy library.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
audio: Input audio data as raw bytes (16-bit signed integers).
|
|
39
|
+
in_rate: Original sample rate in Hz.
|
|
40
|
+
out_rate: Target sample rate in Hz.
|
|
41
|
+
|
|
42
|
+
Returns:
|
|
43
|
+
Resampled audio data as raw bytes (16-bit signed integers).
|
|
44
|
+
"""
|
|
20
45
|
if in_rate == out_rate:
|
|
21
46
|
return audio
|
|
22
47
|
audio_data = np.frombuffer(audio, dtype=np.int16)
|
|
@@ -4,6 +4,17 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""SoX-based audio resampler implementation.
|
|
8
|
+
|
|
9
|
+
This module provides an audio resampler that uses the SoX resampler library
|
|
10
|
+
for very high-quality audio sample rate conversion.
|
|
11
|
+
|
|
12
|
+
When to use the SOXRAudioResampler:
|
|
13
|
+
1. For batch processing of complete audio files
|
|
14
|
+
2. When you have all the audio data available at once
|
|
15
|
+
|
|
16
|
+
"""
|
|
17
|
+
|
|
7
18
|
import numpy as np
|
|
8
19
|
import soxr
|
|
9
20
|
|
|
@@ -11,12 +22,32 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
|
|
11
22
|
|
|
12
23
|
|
|
13
24
|
class SOXRAudioResampler(BaseAudioResampler):
|
|
14
|
-
"""Audio resampler implementation using the SoX resampler library.
|
|
25
|
+
"""Audio resampler implementation using the SoX resampler library.
|
|
26
|
+
|
|
27
|
+
This resampler uses the SoX resampler library configured for very high
|
|
28
|
+
quality (VHQ) resampling, providing excellent audio quality at the cost
|
|
29
|
+
of additional computational overhead.
|
|
30
|
+
"""
|
|
15
31
|
|
|
16
32
|
def __init__(self, **kwargs):
|
|
33
|
+
"""Initialize the SoX audio resampler.
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
**kwargs: Additional keyword arguments (currently unused).
|
|
37
|
+
"""
|
|
17
38
|
pass
|
|
18
39
|
|
|
19
40
|
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
|
41
|
+
"""Resample audio data using SoX resampler library.
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
audio: Input audio data as raw bytes (16-bit signed integers).
|
|
45
|
+
in_rate: Original sample rate in Hz.
|
|
46
|
+
out_rate: Target sample rate in Hz.
|
|
47
|
+
|
|
48
|
+
Returns:
|
|
49
|
+
Resampled audio data as raw bytes (16-bit signed integers).
|
|
50
|
+
"""
|
|
20
51
|
if in_rate == out_rate:
|
|
21
52
|
return audio
|
|
22
53
|
audio_data = np.frombuffer(audio, dtype=np.int16)
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""SoX-based audio resampler stream implementation.
|
|
8
|
+
|
|
9
|
+
This module provides an audio resampler that uses the SoX ResampleStream library
|
|
10
|
+
for very high quality audio sample rate conversion.
|
|
11
|
+
|
|
12
|
+
When to use the SOXRStreamAudioResampler:
|
|
13
|
+
1. For real-time processing scenarios
|
|
14
|
+
2. When dealing with very long audio signals
|
|
15
|
+
3. When processing audio in chunks or streams
|
|
16
|
+
4. When you need to reuse the same resampler configuration multiple times, as it saves initialization overhead
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import time
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import soxr
|
|
24
|
+
|
|
25
|
+
from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
|
26
|
+
|
|
27
|
+
CLEAR_STREAM_AFTER_SECS = 0.2
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SOXRStreamAudioResampler(BaseAudioResampler):
|
|
31
|
+
"""Audio resampler implementation using the SoX ResampleStream library.
|
|
32
|
+
|
|
33
|
+
This resampler uses the SoX ResampleStream library configured for very high
|
|
34
|
+
quality (VHQ) resampling, providing excellent audio quality at the cost
|
|
35
|
+
of additional computational overhead.
|
|
36
|
+
It keeps an internal history which avoids clicks at chunk boundaries.
|
|
37
|
+
|
|
38
|
+
Notes:
|
|
39
|
+
- Only supports mono audio (1 channel).
|
|
40
|
+
- Input must be 16-bit signed PCM audio as raw bytes.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __init__(self, **kwargs):
|
|
44
|
+
"""Initialize the resampler.
|
|
45
|
+
|
|
46
|
+
Args:
|
|
47
|
+
**kwargs: Additional keyword arguments (currently unused).
|
|
48
|
+
"""
|
|
49
|
+
self._in_rate: float | None = None
|
|
50
|
+
self._out_rate: float | None = None
|
|
51
|
+
self._last_resample_time: float = 0
|
|
52
|
+
self._soxr_stream: soxr.ResampleStream | None = None
|
|
53
|
+
|
|
54
|
+
def _initialize(self, in_rate: float, out_rate: float):
|
|
55
|
+
self._in_rate = in_rate
|
|
56
|
+
self._out_rate = out_rate
|
|
57
|
+
self._last_resample_time = time.time()
|
|
58
|
+
self._soxr_stream = soxr.ResampleStream(
|
|
59
|
+
in_rate=in_rate, out_rate=out_rate, num_channels=1, quality="VHQ", dtype="int16"
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
def _maybe_clear_internal_state(self):
|
|
63
|
+
current_time = time.time()
|
|
64
|
+
time_since_last_resample = current_time - self._last_resample_time
|
|
65
|
+
# If more than CLEAR_STREAM_AFTER_SECS seconds have passed, clear the resampler state
|
|
66
|
+
if time_since_last_resample > CLEAR_STREAM_AFTER_SECS:
|
|
67
|
+
if self._soxr_stream:
|
|
68
|
+
self._soxr_stream.clear()
|
|
69
|
+
self._last_resample_time = current_time
|
|
70
|
+
|
|
71
|
+
def _maybe_initialize_sox_stream(self, in_rate: int, out_rate: int):
|
|
72
|
+
if self._soxr_stream is None:
|
|
73
|
+
self._initialize(in_rate, out_rate)
|
|
74
|
+
else:
|
|
75
|
+
self._maybe_clear_internal_state()
|
|
76
|
+
|
|
77
|
+
if self._in_rate != in_rate or self._out_rate != out_rate:
|
|
78
|
+
raise ValueError(
|
|
79
|
+
f"SOXRStreamAudioResampler cannot be reused with different sample rates: "
|
|
80
|
+
f"expected {self._in_rate}->{self._out_rate}, got {in_rate}->{out_rate}"
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
|
|
84
|
+
"""Resample audio data using soxr.ResampleStream resampler library.
|
|
85
|
+
|
|
86
|
+
Args:
|
|
87
|
+
audio: Input audio data as raw bytes (16-bit signed integers).
|
|
88
|
+
in_rate: Original sample rate in Hz.
|
|
89
|
+
out_rate: Target sample rate in Hz.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
Resampled audio data as raw bytes (16-bit signed integers).
|
|
93
|
+
"""
|
|
94
|
+
if in_rate == out_rate:
|
|
95
|
+
return audio
|
|
96
|
+
|
|
97
|
+
self._maybe_initialize_sox_stream(in_rate, out_rate)
|
|
98
|
+
audio_data = np.frombuffer(audio, dtype=np.int16)
|
|
99
|
+
resampled_audio = self._soxr_stream.resample_chunk(audio_data)
|
|
100
|
+
result = resampled_audio.astype(np.int16).tobytes()
|
|
101
|
+
return result
|
pipecat/audio/utils.py
CHANGED
|
@@ -4,21 +4,91 @@
|
|
|
4
4
|
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
5
|
#
|
|
6
6
|
|
|
7
|
+
"""Audio utility functions for Pipecat.
|
|
8
|
+
|
|
9
|
+
This module provides common audio processing utilities including mixing,
|
|
10
|
+
format conversion, volume calculation, and codec transformations for
|
|
11
|
+
various audio formats used in Pipecat pipelines.
|
|
12
|
+
"""
|
|
13
|
+
|
|
7
14
|
import audioop
|
|
8
15
|
|
|
9
16
|
import numpy as np
|
|
10
17
|
import pyloudnorm as pyln
|
|
11
|
-
import soxr
|
|
12
18
|
|
|
13
19
|
from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
|
|
14
20
|
from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
|
|
21
|
+
from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
|
|
22
|
+
|
|
23
|
+
# Normal speech usually results in many samples between ±500 and ±5000, depending on loudness and mic gain.
|
|
24
|
+
# So we are using a threshold that is well below what real speech produces.
|
|
25
|
+
SPEAKING_THRESHOLD = 20
|
|
15
26
|
|
|
16
27
|
|
|
17
28
|
def create_default_resampler(**kwargs) -> BaseAudioResampler:
|
|
29
|
+
"""Create a default audio resampler instance.
|
|
30
|
+
|
|
31
|
+
.. deprecated:: 0.0.74
|
|
32
|
+
This function is deprecated and will be removed in a future version.
|
|
33
|
+
Use `create_stream_resampler` for real-time processing scenarios or
|
|
34
|
+
`create_file_resampler` for batch processing of complete audio files.
|
|
35
|
+
|
|
36
|
+
Args:
|
|
37
|
+
**kwargs: Additional keyword arguments passed to the resampler constructor.
|
|
38
|
+
|
|
39
|
+
Returns:
|
|
40
|
+
A configured SOXRAudioResampler instance.
|
|
41
|
+
"""
|
|
42
|
+
import warnings
|
|
43
|
+
|
|
44
|
+
warnings.warn(
|
|
45
|
+
"`create_default_resampler` is deprecated. "
|
|
46
|
+
"Use `create_stream_resampler` for real-time processing scenarios or "
|
|
47
|
+
"`create_file_resampler` for batch processing of complete audio files.",
|
|
48
|
+
DeprecationWarning,
|
|
49
|
+
stacklevel=2,
|
|
50
|
+
)
|
|
18
51
|
return SOXRAudioResampler(**kwargs)
|
|
19
52
|
|
|
20
53
|
|
|
54
|
+
def create_file_resampler(**kwargs) -> BaseAudioResampler:
|
|
55
|
+
"""Create an audio resampler instance for batch processing of complete audio files.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
**kwargs: Additional keyword arguments passed to the resampler constructor.
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
A configured SOXRAudioResampler instance.
|
|
62
|
+
"""
|
|
63
|
+
return SOXRAudioResampler(**kwargs)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def create_stream_resampler(**kwargs) -> BaseAudioResampler:
|
|
67
|
+
"""Create a stream audio resampler instance.
|
|
68
|
+
|
|
69
|
+
Args:
|
|
70
|
+
**kwargs: Additional keyword arguments passed to the resampler constructor.
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
A configured SOXRStreamAudioResampler instance.
|
|
74
|
+
"""
|
|
75
|
+
return SOXRStreamAudioResampler(**kwargs)
|
|
76
|
+
|
|
77
|
+
|
|
21
78
|
def mix_audio(audio1: bytes, audio2: bytes) -> bytes:
|
|
79
|
+
"""Mix two audio streams together by adding their samples.
|
|
80
|
+
|
|
81
|
+
Both audio streams are assumed to be 16-bit signed integer PCM data.
|
|
82
|
+
If the streams have different lengths, the shorter one is zero-padded
|
|
83
|
+
to match the longer stream.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
audio1: First audio stream as raw bytes (16-bit signed integers).
|
|
87
|
+
audio2: Second audio stream as raw bytes (16-bit signed integers).
|
|
88
|
+
|
|
89
|
+
Returns:
|
|
90
|
+
Mixed audio data as raw bytes with samples clipped to 16-bit range.
|
|
91
|
+
"""
|
|
22
92
|
data1 = np.frombuffer(audio1, dtype=np.int16)
|
|
23
93
|
data2 = np.frombuffer(audio2, dtype=np.int16)
|
|
24
94
|
|
|
@@ -37,6 +107,19 @@ def mix_audio(audio1: bytes, audio2: bytes) -> bytes:
|
|
|
37
107
|
|
|
38
108
|
|
|
39
109
|
def interleave_stereo_audio(left_audio: bytes, right_audio: bytes) -> bytes:
|
|
110
|
+
"""Interleave left and right mono audio channels into stereo audio.
|
|
111
|
+
|
|
112
|
+
Takes two mono audio streams and combines them into a single stereo
|
|
113
|
+
stream by interleaving the samples (L, R, L, R, ...). If the channels
|
|
114
|
+
have different lengths, both are truncated to the shorter length.
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
left_audio: Left channel audio as raw bytes (16-bit signed integers).
|
|
118
|
+
right_audio: Right channel audio as raw bytes (16-bit signed integers).
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
Interleaved stereo audio data as raw bytes.
|
|
122
|
+
"""
|
|
40
123
|
left = np.frombuffer(left_audio, dtype=np.int16)
|
|
41
124
|
right = np.frombuffer(right_audio, dtype=np.int16)
|
|
42
125
|
|
|
@@ -50,12 +133,34 @@ def interleave_stereo_audio(left_audio: bytes, right_audio: bytes) -> bytes:
|
|
|
50
133
|
|
|
51
134
|
|
|
52
135
|
def normalize_value(value, min_value, max_value):
    """Map a value from [min_value, max_value] onto [0, 1], clamping the result.

    Args:
        value: The value to rescale.
        min_value: Lower end of the input range (maps to 0).
        max_value: Upper end of the input range (maps to 1).

    Returns:
        The rescaled value, limited to the range [0, 1].
    """
    span = max_value - min_value
    ratio = (value - min_value) / span

    # Cap at 1 first, then floor at 0 -- same order as min() followed by max().
    capped = ratio if ratio < 1 else 1
    return capped if capped > 0 else 0
|
|
56
149
|
|
|
57
150
|
|
|
58
151
|
def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
|
|
152
|
+
"""Calculate the loudness level of audio data using EBU R128 standard.
|
|
153
|
+
|
|
154
|
+
Uses the pyloudnorm library to calculate integrated loudness according
|
|
155
|
+
to the EBU R128 recommendation, then normalizes the result to [0, 1].
|
|
156
|
+
|
|
157
|
+
Args:
|
|
158
|
+
audio: Audio data as raw bytes (16-bit signed integers).
|
|
159
|
+
sample_rate: Sample rate of the audio in Hz.
|
|
160
|
+
|
|
161
|
+
Returns:
|
|
162
|
+
Normalized loudness value between 0 (quiet) and 1 (loud).
|
|
163
|
+
"""
|
|
59
164
|
audio_np = np.frombuffer(audio, dtype=np.int16)
|
|
60
165
|
audio_float = audio_np.astype(np.float64)
|
|
61
166
|
|
|
@@ -71,12 +176,37 @@ def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
|
|
|
71
176
|
|
|
72
177
|
|
|
73
178
|
def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
    """Blend a new sample into a running smoothed value.

    The result moves from the previous smoothed value toward the new value
    by the fraction given in ``factor``.

    Args:
        value: The newest sample.
        prev_value: The smoothed value from the previous step.
        factor: Blend weight; 0 keeps ``prev_value``, 1 jumps to ``value``.

    Returns:
        The updated smoothed value.
    """
    step = value - prev_value
    return prev_value + factor * step
|
|
75
194
|
|
|
76
195
|
|
|
77
196
|
async def ulaw_to_pcm(
|
|
78
197
|
ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
|
|
79
198
|
):
|
|
199
|
+
"""Convert μ-law encoded audio to PCM and optionally resample.
|
|
200
|
+
|
|
201
|
+
Args:
|
|
202
|
+
ulaw_bytes: μ-law encoded audio data as raw bytes.
|
|
203
|
+
in_rate: Original sample rate of the μ-law audio in Hz.
|
|
204
|
+
out_rate: Desired output sample rate in Hz.
|
|
205
|
+
resampler: Audio resampler instance for rate conversion.
|
|
206
|
+
|
|
207
|
+
Returns:
|
|
208
|
+
PCM audio data as raw bytes at the specified output rate.
|
|
209
|
+
"""
|
|
80
210
|
# Convert μ-law to PCM
|
|
81
211
|
in_pcm_bytes = audioop.ulaw2lin(ulaw_bytes, 2)
|
|
82
212
|
|
|
@@ -87,6 +217,17 @@ async def ulaw_to_pcm(
|
|
|
87
217
|
|
|
88
218
|
|
|
89
219
|
async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
|
220
|
+
"""Convert PCM audio to μ-law encoding and optionally resample.
|
|
221
|
+
|
|
222
|
+
Args:
|
|
223
|
+
pcm_bytes: PCM audio data as raw bytes (16-bit signed integers).
|
|
224
|
+
in_rate: Original sample rate of the PCM audio in Hz.
|
|
225
|
+
out_rate: Desired output sample rate in Hz.
|
|
226
|
+
resampler: Audio resampler instance for rate conversion.
|
|
227
|
+
|
|
228
|
+
Returns:
|
|
229
|
+
μ-law encoded audio data as raw bytes at the specified output rate.
|
|
230
|
+
"""
|
|
90
231
|
# Resample
|
|
91
232
|
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
|
|
92
233
|
|
|
@@ -99,6 +240,17 @@ async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
|
|
|
99
240
|
async def alaw_to_pcm(
|
|
100
241
|
alaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
|
|
101
242
|
) -> bytes:
|
|
243
|
+
"""Convert A-law encoded audio to PCM and optionally resample.
|
|
244
|
+
|
|
245
|
+
Args:
|
|
246
|
+
alaw_bytes: A-law encoded audio data as raw bytes.
|
|
247
|
+
in_rate: Original sample rate of the A-law audio in Hz.
|
|
248
|
+
out_rate: Desired output sample rate in Hz.
|
|
249
|
+
resampler: Audio resampler instance for rate conversion.
|
|
250
|
+
|
|
251
|
+
Returns:
|
|
252
|
+
PCM audio data as raw bytes at the specified output rate.
|
|
253
|
+
"""
|
|
102
254
|
# Convert a-law to PCM
|
|
103
255
|
in_pcm_bytes = audioop.alaw2lin(alaw_bytes, 2)
|
|
104
256
|
|
|
@@ -109,6 +261,17 @@ async def alaw_to_pcm(
|
|
|
109
261
|
|
|
110
262
|
|
|
111
263
|
async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
|
|
264
|
+
"""Convert PCM audio to A-law encoding and optionally resample.
|
|
265
|
+
|
|
266
|
+
Args:
|
|
267
|
+
pcm_bytes: PCM audio data as raw bytes (16-bit signed integers).
|
|
268
|
+
in_rate: Original sample rate of the PCM audio in Hz.
|
|
269
|
+
out_rate: Desired output sample rate in Hz.
|
|
270
|
+
resampler: Audio resampler instance for rate conversion.
|
|
271
|
+
|
|
272
|
+
Returns:
|
|
273
|
+
A-law encoded audio data as raw bytes at the specified output rate.
|
|
274
|
+
"""
|
|
112
275
|
# Resample
|
|
113
276
|
in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
|
|
114
277
|
|
|
@@ -116,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
|
|
|
116
279
|
out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)
|
|
117
280
|
|
|
118
281
|
return out_alaw_bytes
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def is_silence(pcm_bytes: bytes) -> bool:
    """Determine whether a PCM frame is silence by checking its peak amplitude.

    The frame is interpreted as 16-bit signed integer samples and its largest
    absolute sample value is compared against SPEAKING_THRESHOLD. The audio
    is expected to be clean speech or complete silence without background
    noise.

    Args:
        pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers).

    Returns:
        True if the peak absolute amplitude is at or below SPEAKING_THRESHOLD
        (or the frame contains no samples), False otherwise.

    Note:
        Normal speech typically produces amplitude values between ±500 and
        ±5000, depending on factors like loudness and microphone gain. The
        threshold (SPEAKING_THRESHOLD) is set well below typical speech
        levels to reliably tell silence from speech.
    """
    # Interpret the raw bytes as int16 samples.
    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

    # A frame with no samples carries no speech; also avoids max() on an
    # empty array, which raises ValueError.
    if audio_data.size == 0:
        return True

    # Widen before abs(): in int16, abs(-32768) wraps back to -32768, which
    # would make a full-scale negative sample look quieter than silence.
    peak = np.abs(audio_data.astype(np.int32)).max()

    # At or below the threshold counts as silence. Convert the NumPy boolean
    # to a plain bool to match the annotated return type.
    return bool(peak <= SPEAKING_THRESHOLD)
|