dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of dv-pipecat-ai might be problematic. Click here for more details.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/audio/mixers/soundfile_mixer.py CHANGED
@@ -4,6 +4,13 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Soundfile-based audio mixer for file playback integration.
8
+
9
+ Provides an audio mixer that combines incoming audio with audio loaded from
10
+ files using the soundfile library. Supports multiple audio formats and
11
+ runtime configuration changes.
12
+ """
13
+
7
14
  import asyncio
8
15
  from typing import Any, Dict, Mapping
9
16
 
@@ -24,7 +31,9 @@ except ModuleNotFoundError as e:
24
31
 
25
32
 
26
33
  class SoundfileMixer(BaseAudioMixer):
27
- """This is an audio mixer that mixes incoming audio with audio from a
34
+ """Audio mixer that combines incoming audio with file-based audio.
35
+
36
+ This is an audio mixer that mixes incoming audio with audio from a
28
37
  file. It uses the soundfile library to load files so it supports multiple
29
38
  formats. The audio files need to only have one channel (mono) and it needs
30
39
  to match the sample rate of the output transport.
@@ -33,7 +42,6 @@ class SoundfileMixer(BaseAudioMixer):
33
42
  `MixerUpdateSettingsFrame` has the following settings available: `sound`
34
43
  (str) and `volume` (float) to be able to update to a different sound file or
35
44
  to change the volume at runtime.
36
-
37
45
  """
38
46
 
39
47
  def __init__(
@@ -46,6 +54,16 @@ class SoundfileMixer(BaseAudioMixer):
46
54
  loop: bool = True,
47
55
  **kwargs,
48
56
  ):
57
+ """Initialize the soundfile mixer.
58
+
59
+ Args:
60
+ sound_files: Mapping of sound names to file paths for loading.
61
+ default_sound: Name of the default sound to play initially.
62
+ volume: Mixing volume level (0.0 to 1.0). Defaults to 0.4.
63
+ mixing: Whether mixing is initially enabled. Defaults to True.
64
+ loop: Whether to loop audio files when they end. Defaults to True.
65
+ **kwargs: Additional arguments passed to parent class.
66
+ """
49
67
  super().__init__(**kwargs)
50
68
  self._sound_files = sound_files
51
69
  self._volume = volume
@@ -58,14 +76,28 @@ class SoundfileMixer(BaseAudioMixer):
58
76
  self._loop = loop
59
77
 
60
78
  async def start(self, sample_rate: int):
79
+ """Initialize the mixer and load all sound files.
80
+
81
+ Args:
82
+ sample_rate: The sample rate of the output transport in Hz.
83
+ """
61
84
  self._sample_rate = sample_rate
62
85
  for sound_name, file_name in self._sound_files.items():
63
86
  await asyncio.to_thread(self._load_sound_file, sound_name, file_name)
64
87
 
65
88
  async def stop(self):
89
+ """Clean up mixer resources.
90
+
91
+ Currently performs no cleanup as sound data is managed by garbage collection.
92
+ """
66
93
  pass
67
94
 
68
95
  async def process_frame(self, frame: MixerControlFrame):
96
+ """Process mixer control frames to update settings or enable/disable mixing.
97
+
98
+ Args:
99
+ frame: The mixer control frame to process.
100
+ """
69
101
  if isinstance(frame, MixerUpdateSettingsFrame):
70
102
  await self._update_settings(frame)
71
103
  elif isinstance(frame, MixerEnableFrame):
@@ -73,12 +105,22 @@ class SoundfileMixer(BaseAudioMixer):
73
105
  pass
74
106
 
75
107
  async def mix(self, audio: bytes) -> bytes:
108
+ """Mix transport audio with the current sound file.
109
+
110
+ Args:
111
+ audio: Raw audio bytes from the transport to mix.
112
+
113
+ Returns:
114
+ Mixed audio bytes combining transport and file audio.
115
+ """
76
116
  return self._mix_with_sound(audio)
77
117
 
78
118
  async def _enable_mixing(self, enable: bool):
119
+ """Enable or disable audio mixing."""
79
120
  self._mixing = enable
80
121
 
81
122
  async def _update_settings(self, frame: MixerUpdateSettingsFrame):
123
+ """Update mixer settings from a control frame."""
82
124
  for setting, value in frame.settings.items():
83
125
  match setting:
84
126
  case "sound":
@@ -89,6 +131,11 @@ class SoundfileMixer(BaseAudioMixer):
89
131
  await self._update_loop(value)
90
132
 
91
133
  async def _change_sound(self, sound: str):
134
+ """Change the currently playing sound file.
135
+
136
+ Args:
137
+ sound: Name of the sound file to switch to.
138
+ """
92
139
  if sound in self._sound_files:
93
140
  self._current_sound = sound
94
141
  self._sound_pos = 0
@@ -96,12 +143,15 @@ class SoundfileMixer(BaseAudioMixer):
96
143
  logger.error(f"Sound {sound} is not available")
97
144
 
98
145
  async def _update_volume(self, volume: float):
146
+ """Update the mixing volume level."""
99
147
  self._volume = volume
100
148
 
101
149
  async def _update_loop(self, loop: bool):
150
+ """Update the looping behavior."""
102
151
  self._loop = loop
103
152
 
104
153
  def _load_sound_file(self, sound_name: str, file_name: str):
154
+ """Load an audio file into memory for mixing."""
105
155
  try:
106
156
  logger.debug(f"Loading mixer sound from {file_name}")
107
157
  sound, sample_rate = sf.read(file_name, dtype="int16")
@@ -118,10 +168,7 @@ class SoundfileMixer(BaseAudioMixer):
118
168
  logger.error(f"Unable to open file {file_name}: {e}")
119
169
 
120
170
  def _mix_with_sound(self, audio: bytes):
121
- """Mixes raw audio frames with chunks of the same length from the sound
122
- file.
123
-
124
- """
171
+ """Mix raw audio frames with chunks of the same length from the sound file."""
125
172
  if not self._mixing or not self._current_sound in self._sounds:
126
173
  return audio
127
174
 
pipecat/audio/resamplers/base_audio_resampler.py CHANGED
@@ -4,27 +4,35 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Base audio resampler interface for Pipecat.
8
+
9
+ This module defines the abstract base class for audio resampling implementations,
10
+ providing a common interface for converting audio between different sample rates.
11
+ """
12
+
7
13
  from abc import ABC, abstractmethod
8
14
 
9
15
 
10
16
  class BaseAudioResampler(ABC):
11
- """Abstract base class for audio resampling. This class defines an
12
- interface for audio resampling implementations.
17
+ """Abstract base class for audio resampling implementations.
18
+
19
+ This class defines the interface that all audio resampling implementations
20
+ must follow, providing a standardized way to convert audio data between
21
+ different sample rates.
13
22
  """
14
23
 
15
24
  @abstractmethod
16
25
  async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
17
- """
18
- Resamples the given audio data to a different sample rate.
26
+ """Resamples the given audio data to a different sample rate.
19
27
 
20
28
  This is an abstract method that must be implemented in subclasses.
21
29
 
22
- Parameters:
23
- audio (bytes): The audio data to be resampled, represented as a byte string.
24
- in_rate (int): The original sample rate of the audio data (in Hz).
25
- out_rate (int): The desired sample rate for the resampled audio data (in Hz).
30
+ Args:
31
+ audio: The audio data to be resampled, as raw bytes.
32
+ in_rate: The original sample rate of the audio data in Hz.
33
+ out_rate: The desired sample rate for the output audio in Hz.
26
34
 
27
35
  Returns:
28
- bytes: The resampled audio data as a byte string.
36
+ The resampled audio data as raw bytes.
29
37
  """
30
38
  pass
pipecat/audio/resamplers/resampy_resampler.py CHANGED
@@ -4,6 +4,12 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Resampy-based audio resampler implementation.
8
+
9
+ This module provides an audio resampler that uses the resampy library
10
+ for high-quality audio sample rate conversion.
11
+ """
12
+
7
13
  import numpy as np
8
14
  import resampy
9
15
 
@@ -11,12 +17,31 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
11
17
 
12
18
 
13
19
  class ResampyResampler(BaseAudioResampler):
14
- """Audio resampler implementation using the resampy library."""
20
+ """Audio resampler implementation using the resampy library.
21
+
22
+ This resampler uses the resampy library's Kaiser windowing filter
23
+ for high-quality audio resampling with good performance characteristics.
24
+ """
15
25
 
16
26
  def __init__(self, **kwargs):
27
+ """Initialize the resampy resampler.
28
+
29
+ Args:
30
+ **kwargs: Additional keyword arguments (currently unused).
31
+ """
17
32
  pass
18
33
 
19
34
  async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
35
+ """Resample audio data using resampy library.
36
+
37
+ Args:
38
+ audio: Input audio data as raw bytes (16-bit signed integers).
39
+ in_rate: Original sample rate in Hz.
40
+ out_rate: Target sample rate in Hz.
41
+
42
+ Returns:
43
+ Resampled audio data as raw bytes (16-bit signed integers).
44
+ """
20
45
  if in_rate == out_rate:
21
46
  return audio
22
47
  audio_data = np.frombuffer(audio, dtype=np.int16)
pipecat/audio/resamplers/soxr_resampler.py CHANGED
@@ -4,6 +4,17 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """SoX-based audio resampler implementation.
8
+
9
+ This module provides an audio resampler that uses the SoX resampler library
10
+ for very high-quality audio sample rate conversion.
11
+
12
+ When to use the SOXRAudioResampler:
13
+ 1. For batch processing of complete audio files
14
+ 2. When you have all the audio data available at once
15
+
16
+ """
17
+
7
18
  import numpy as np
8
19
  import soxr
9
20
 
@@ -11,12 +22,32 @@ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
11
22
 
12
23
 
13
24
  class SOXRAudioResampler(BaseAudioResampler):
14
- """Audio resampler implementation using the SoX resampler library."""
25
+ """Audio resampler implementation using the SoX resampler library.
26
+
27
+ This resampler uses the SoX resampler library configured for very high
28
+ quality (VHQ) resampling, providing excellent audio quality at the cost
29
+ of additional computational overhead.
30
+ """
15
31
 
16
32
  def __init__(self, **kwargs):
33
+ """Initialize the SoX audio resampler.
34
+
35
+ Args:
36
+ **kwargs: Additional keyword arguments (currently unused).
37
+ """
17
38
  pass
18
39
 
19
40
  async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
41
+ """Resample audio data using SoX resampler library.
42
+
43
+ Args:
44
+ audio: Input audio data as raw bytes (16-bit signed integers).
45
+ in_rate: Original sample rate in Hz.
46
+ out_rate: Target sample rate in Hz.
47
+
48
+ Returns:
49
+ Resampled audio data as raw bytes (16-bit signed integers).
50
+ """
20
51
  if in_rate == out_rate:
21
52
  return audio
22
53
  audio_data = np.frombuffer(audio, dtype=np.int16)
pipecat/audio/resamplers/soxr_stream_resampler.py ADDED
@@ -0,0 +1,101 @@
1
+ #
2
+ # Copyright (c) 2024–2025, Daily
3
+ #
4
+ # SPDX-License-Identifier: BSD 2-Clause License
5
+ #
6
+
7
+ """SoX-based audio resampler stream implementation.
8
+
9
+ This module provides an audio resampler that uses the SoX ResampleStream library
10
+ for very high quality audio sample rate conversion.
11
+
12
+ When to use the SOXRStreamAudioResampler:
13
+ 1. For real-time processing scenarios
14
+ 2. When dealing with very long audio signals
15
+ 3. When processing audio in chunks or streams
16
+ 4. When you need to reuse the same resampler configuration multiple times, as it saves initialization overhead
17
+
18
+ """
19
+
20
+ import time
21
+
22
+ import numpy as np
23
+ import soxr
24
+
25
+ from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
26
+
27
+ CLEAR_STREAM_AFTER_SECS = 0.2
28
+
29
+
30
+ class SOXRStreamAudioResampler(BaseAudioResampler):
31
+ """Audio resampler implementation using the SoX ResampleStream library.
32
+
33
+ This resampler uses the SoX ResampleStream library configured for very high
34
+ quality (VHQ) resampling, providing excellent audio quality at the cost
35
+ of additional computational overhead.
36
+ It keeps an internal history which avoids clicks at chunk boundaries.
37
+
38
+ Notes:
39
+ - Only supports mono audio (1 channel).
40
+ - Input must be 16-bit signed PCM audio as raw bytes.
41
+ """
42
+
43
+ def __init__(self, **kwargs):
44
+ """Initialize the resampler.
45
+
46
+ Args:
47
+ **kwargs: Additional keyword arguments (currently unused).
48
+ """
49
+ self._in_rate: float | None = None
50
+ self._out_rate: float | None = None
51
+ self._last_resample_time: float = 0
52
+ self._soxr_stream: soxr.ResampleStream | None = None
53
+
54
+ def _initialize(self, in_rate: float, out_rate: float):
55
+ self._in_rate = in_rate
56
+ self._out_rate = out_rate
57
+ self._last_resample_time = time.time()
58
+ self._soxr_stream = soxr.ResampleStream(
59
+ in_rate=in_rate, out_rate=out_rate, num_channels=1, quality="VHQ", dtype="int16"
60
+ )
61
+
62
+ def _maybe_clear_internal_state(self):
63
+ current_time = time.time()
64
+ time_since_last_resample = current_time - self._last_resample_time
65
+ # If more than CLEAR_STREAM_AFTER_SECS seconds have passed, clear the resampler state
66
+ if time_since_last_resample > CLEAR_STREAM_AFTER_SECS:
67
+ if self._soxr_stream:
68
+ self._soxr_stream.clear()
69
+ self._last_resample_time = current_time
70
+
71
+ def _maybe_initialize_sox_stream(self, in_rate: int, out_rate: int):
72
+ if self._soxr_stream is None:
73
+ self._initialize(in_rate, out_rate)
74
+ else:
75
+ self._maybe_clear_internal_state()
76
+
77
+ if self._in_rate != in_rate or self._out_rate != out_rate:
78
+ raise ValueError(
79
+ f"SOXRStreamAudioResampler cannot be reused with different sample rates: "
80
+ f"expected {self._in_rate}->{self._out_rate}, got {in_rate}->{out_rate}"
81
+ )
82
+
83
+ async def resample(self, audio: bytes, in_rate: int, out_rate: int) -> bytes:
84
+ """Resample audio data using soxr.ResampleStream resampler library.
85
+
86
+ Args:
87
+ audio: Input audio data as raw bytes (16-bit signed integers).
88
+ in_rate: Original sample rate in Hz.
89
+ out_rate: Target sample rate in Hz.
90
+
91
+ Returns:
92
+ Resampled audio data as raw bytes (16-bit signed integers).
93
+ """
94
+ if in_rate == out_rate:
95
+ return audio
96
+
97
+ self._maybe_initialize_sox_stream(in_rate, out_rate)
98
+ audio_data = np.frombuffer(audio, dtype=np.int16)
99
+ resampled_audio = self._soxr_stream.resample_chunk(audio_data)
100
+ result = resampled_audio.astype(np.int16).tobytes()
101
+ return result
pipecat/audio/utils.py CHANGED
@@ -4,21 +4,91 @@
4
4
  # SPDX-License-Identifier: BSD 2-Clause License
5
5
  #
6
6
 
7
+ """Audio utility functions for Pipecat.
8
+
9
+ This module provides common audio processing utilities including mixing,
10
+ format conversion, volume calculation, and codec transformations for
11
+ various audio formats used in Pipecat pipelines.
12
+ """
13
+
7
14
  import audioop
8
15
 
9
16
  import numpy as np
10
17
  import pyloudnorm as pyln
11
- import soxr
12
18
 
13
19
  from pipecat.audio.resamplers.base_audio_resampler import BaseAudioResampler
14
20
  from pipecat.audio.resamplers.soxr_resampler import SOXRAudioResampler
21
+ from pipecat.audio.resamplers.soxr_stream_resampler import SOXRStreamAudioResampler
22
+
23
+ # Normal speech usually results in many samples between ±500 to ±5000, depending on loudness and mic gain.
24
+ # So we are using a threshold that is well below what real speech produces.
25
+ SPEAKING_THRESHOLD = 20
15
26
 
16
27
 
17
28
  def create_default_resampler(**kwargs) -> BaseAudioResampler:
29
+ """Create a default audio resampler instance.
30
+
31
+ .. deprecated:: 0.0.74
32
+ This function is deprecated and will be removed in a future version.
33
+ Use `create_stream_resampler` for real-time processing scenarios or
34
+ `create_file_resampler` for batch processing of complete audio files.
35
+
36
+ Args:
37
+ **kwargs: Additional keyword arguments passed to the resampler constructor.
38
+
39
+ Returns:
40
+ A configured SOXRAudioResampler instance.
41
+ """
42
+ import warnings
43
+
44
+ warnings.warn(
45
+ "`create_default_resampler` is deprecated. "
46
+ "Use `create_stream_resampler` for real-time processing scenarios or "
47
+ "`create_file_resampler` for batch processing of complete audio files.",
48
+ DeprecationWarning,
49
+ stacklevel=2,
50
+ )
18
51
  return SOXRAudioResampler(**kwargs)
19
52
 
20
53
 
54
+ def create_file_resampler(**kwargs) -> BaseAudioResampler:
55
+ """Create an audio resampler instance for batch processing of complete audio files.
56
+
57
+ Args:
58
+ **kwargs: Additional keyword arguments passed to the resampler constructor.
59
+
60
+ Returns:
61
+ A configured SOXRAudioResampler instance.
62
+ """
63
+ return SOXRAudioResampler(**kwargs)
64
+
65
+
66
+ def create_stream_resampler(**kwargs) -> BaseAudioResampler:
67
+ """Create a stream audio resampler instance.
68
+
69
+ Args:
70
+ **kwargs: Additional keyword arguments passed to the resampler constructor.
71
+
72
+ Returns:
73
+ A configured SOXRStreamAudioResampler instance.
74
+ """
75
+ return SOXRStreamAudioResampler(**kwargs)
76
+
77
+
21
78
  def mix_audio(audio1: bytes, audio2: bytes) -> bytes:
79
+ """Mix two audio streams together by adding their samples.
80
+
81
+ Both audio streams are assumed to be 16-bit signed integer PCM data.
82
+ If the streams have different lengths, the shorter one is zero-padded
83
+ to match the longer stream.
84
+
85
+ Args:
86
+ audio1: First audio stream as raw bytes (16-bit signed integers).
87
+ audio2: Second audio stream as raw bytes (16-bit signed integers).
88
+
89
+ Returns:
90
+ Mixed audio data as raw bytes with samples clipped to 16-bit range.
91
+ """
22
92
  data1 = np.frombuffer(audio1, dtype=np.int16)
23
93
  data2 = np.frombuffer(audio2, dtype=np.int16)
24
94
 
@@ -37,6 +107,19 @@ def mix_audio(audio1: bytes, audio2: bytes) -> bytes:
37
107
 
38
108
 
39
109
  def interleave_stereo_audio(left_audio: bytes, right_audio: bytes) -> bytes:
110
+ """Interleave left and right mono audio channels into stereo audio.
111
+
112
+ Takes two mono audio streams and combines them into a single stereo
113
+ stream by interleaving the samples (L, R, L, R, ...). If the channels
114
+ have different lengths, both are truncated to the shorter length.
115
+
116
+ Args:
117
+ left_audio: Left channel audio as raw bytes (16-bit signed integers).
118
+ right_audio: Right channel audio as raw bytes (16-bit signed integers).
119
+
120
+ Returns:
121
+ Interleaved stereo audio data as raw bytes.
122
+ """
40
123
  left = np.frombuffer(left_audio, dtype=np.int16)
41
124
  right = np.frombuffer(right_audio, dtype=np.int16)
42
125
 
@@ -50,12 +133,34 @@ def interleave_stereo_audio(left_audio: bytes, right_audio: bytes) -> bytes:
50
133
 
51
134
 
52
135
  def normalize_value(value, min_value, max_value):
136
+ """Normalize a value to the range [0, 1] and clamp it to bounds.
137
+
138
+ Args:
139
+ value: The value to normalize.
140
+ min_value: The minimum value of the input range.
141
+ max_value: The maximum value of the input range.
142
+
143
+ Returns:
144
+ Normalized value clamped to the range [0, 1].
145
+ """
53
146
  normalized = (value - min_value) / (max_value - min_value)
54
147
  normalized_clamped = max(0, min(1, normalized))
55
148
  return normalized_clamped
56
149
 
57
150
 
58
151
  def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
152
+ """Calculate the loudness level of audio data using EBU R128 standard.
153
+
154
+ Uses the pyloudnorm library to calculate integrated loudness according
155
+ to the EBU R128 recommendation, then normalizes the result to [0, 1].
156
+
157
+ Args:
158
+ audio: Audio data as raw bytes (16-bit signed integers).
159
+ sample_rate: Sample rate of the audio in Hz.
160
+
161
+ Returns:
162
+ Normalized loudness value between 0 (quiet) and 1 (loud).
163
+ """
59
164
  audio_np = np.frombuffer(audio, dtype=np.int16)
60
165
  audio_float = audio_np.astype(np.float64)
61
166
 
@@ -71,12 +176,37 @@ def calculate_audio_volume(audio: bytes, sample_rate: int) -> float:
71
176
 
72
177
 
73
178
  def exp_smoothing(value: float, prev_value: float, factor: float) -> float:
179
+ """Apply exponential smoothing to a value.
180
+
181
+ Exponential smoothing is used to reduce noise in time-series data by
182
+ giving more weight to recent values while still considering historical data.
183
+
184
+ Args:
185
+ value: The new value to incorporate.
186
+ prev_value: The previous smoothed value.
187
+ factor: Smoothing factor between 0 and 1. Higher values give more
188
+ weight to the new value.
189
+
190
+ Returns:
191
+ The exponentially smoothed value.
192
+ """
74
193
  return prev_value + factor * (value - prev_value)
75
194
 
76
195
 
77
196
  async def ulaw_to_pcm(
78
197
  ulaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
79
198
  ):
199
+ """Convert μ-law encoded audio to PCM and optionally resample.
200
+
201
+ Args:
202
+ ulaw_bytes: μ-law encoded audio data as raw bytes.
203
+ in_rate: Original sample rate of the μ-law audio in Hz.
204
+ out_rate: Desired output sample rate in Hz.
205
+ resampler: Audio resampler instance for rate conversion.
206
+
207
+ Returns:
208
+ PCM audio data as raw bytes at the specified output rate.
209
+ """
80
210
  # Convert μ-law to PCM
81
211
  in_pcm_bytes = audioop.ulaw2lin(ulaw_bytes, 2)
82
212
 
@@ -87,6 +217,17 @@ async def ulaw_to_pcm(
87
217
 
88
218
 
89
219
  async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
220
+ """Convert PCM audio to μ-law encoding and optionally resample.
221
+
222
+ Args:
223
+ pcm_bytes: PCM audio data as raw bytes (16-bit signed integers).
224
+ in_rate: Original sample rate of the PCM audio in Hz.
225
+ out_rate: Desired output sample rate in Hz.
226
+ resampler: Audio resampler instance for rate conversion.
227
+
228
+ Returns:
229
+ μ-law encoded audio data as raw bytes at the specified output rate.
230
+ """
90
231
  # Resample
91
232
  in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
92
233
 
@@ -99,6 +240,17 @@ async def pcm_to_ulaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
99
240
  async def alaw_to_pcm(
100
241
  alaw_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler
101
242
  ) -> bytes:
243
+ """Convert A-law encoded audio to PCM and optionally resample.
244
+
245
+ Args:
246
+ alaw_bytes: A-law encoded audio data as raw bytes.
247
+ in_rate: Original sample rate of the A-law audio in Hz.
248
+ out_rate: Desired output sample rate in Hz.
249
+ resampler: Audio resampler instance for rate conversion.
250
+
251
+ Returns:
252
+ PCM audio data as raw bytes at the specified output rate.
253
+ """
102
254
  # Convert a-law to PCM
103
255
  in_pcm_bytes = audioop.alaw2lin(alaw_bytes, 2)
104
256
 
@@ -109,6 +261,17 @@ async def alaw_to_pcm(
109
261
 
110
262
 
111
263
  async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler: BaseAudioResampler):
264
+ """Convert PCM audio to A-law encoding and optionally resample.
265
+
266
+ Args:
267
+ pcm_bytes: PCM audio data as raw bytes (16-bit signed integers).
268
+ in_rate: Original sample rate of the PCM audio in Hz.
269
+ out_rate: Desired output sample rate in Hz.
270
+ resampler: Audio resampler instance for rate conversion.
271
+
272
+ Returns:
273
+ A-law encoded audio data as raw bytes at the specified output rate.
274
+ """
112
275
  # Resample
113
276
  in_pcm_bytes = await resampler.resample(pcm_bytes, in_rate, out_rate)
114
277
 
@@ -116,3 +279,33 @@ async def pcm_to_alaw(pcm_bytes: bytes, in_rate: int, out_rate: int, resampler:
116
279
  out_alaw_bytes = audioop.lin2alaw(in_pcm_bytes, 2)
117
280
 
118
281
  return out_alaw_bytes
282
+
283
+
284
def is_silence(pcm_bytes: bytes) -> bool:
    """Determine if an audio sample contains silence by checking amplitude levels.

    The raw PCM data is interpreted as 16-bit signed integers and its peak
    absolute amplitude is compared against SPEAKING_THRESHOLD. The audio is
    expected to be clean speech or complete silence without background noise.

    Args:
        pcm_bytes: Raw PCM audio data as bytes (16-bit signed integers).

    Returns:
        True if the peak amplitude is at or below SPEAKING_THRESHOLD, or if
        the buffer contains no samples; False otherwise.

    Note:
        Normal speech typically produces amplitude values between ±500 to ±5000,
        depending on factors like loudness and microphone gain. The threshold
        (SPEAKING_THRESHOLD) is set well below typical speech levels to
        reliably detect silence vs. speech.
    """
    # Convert raw audio bytes to a NumPy array of int16 samples
    audio_data = np.frombuffer(pcm_bytes, dtype=np.int16)

    # An empty frame carries no speech; .max() on a zero-size array would raise.
    if audio_data.size == 0:
        return True

    # Widen before taking the absolute value: abs(-32768) does not fit in
    # int16 and would wrap back to -32768, making a full-scale negative
    # sample look like silence.
    peak = np.abs(audio_data.astype(np.int32)).max()

    # At or below SPEAKING_THRESHOLD counts as silence. Convert to a plain
    # bool so the return value matches the annotation.
    return bool(peak <= SPEAKING_THRESHOLD)