dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.
Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/ultravox/stt.py

@@ -44,13 +44,12 @@ except ModuleNotFoundError as e:
 class AudioBuffer:
     """Buffer to collect audio frames before processing.
 
-    Attributes:
-        frames: List of AudioRawFrames to process
-        started_at: Timestamp when speech started
-        is_processing: Flag to prevent concurrent processing
+    Manages the collection and state of audio frames during speech
+    recording sessions, including timing and processing flags.
     """
 
     def __init__(self):
+        """Initialize the audio buffer."""
        self.frames: List[AudioRawFrame] = []
        self.started_at: Optional[float] = None
        self.is_processing: bool = False
@@ -59,19 +58,17 @@ class AudioBuffer:
 class UltravoxModel:
     """Model wrapper for the Ultravox multimodal model.
 
-    This class handles loading and running the Ultravox model for speech-to-text.
-
-    Args:
-        model_name: The name or path of the Ultravox model to load
-
-    Attributes:
-        model_name: The name of the loaded model
-        engine: The vLLM engine for model inference
-        tokenizer: The tokenizer for the model
-        stop_token_ids: Optional token IDs to stop generation
+    This class handles loading and running the Ultravox model for speech-to-text
+    transcription using vLLM for efficient inference.
     """
 
     def __init__(self, model_name: str = "fixie-ai/ultravox-v0_5-llama-3_1-8b"):
+        """Initialize the Ultravox model.
+
+        Args:
+            model_name: The name or path of the Ultravox model to load.
+                Defaults to "fixie-ai/ultravox-v0_5-llama-3_1-8b".
+        """
        self.model_name = model_name
        self._initialize_engine()
        self._initialize_tokenizer()
@@ -95,10 +92,10 @@ class UltravoxModel:
         """Format chat messages into a prompt for the model.
 
         Args:
-            messages: List of message dictionaries with 'role' and 'content'
+            messages: List of message dictionaries with 'role' and 'content'.
 
         Returns:
-            str: Formatted prompt string
+            str: Formatted prompt string ready for model input.
         """
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
@@ -114,13 +111,13 @@ class UltravoxModel:
         """Generate text from audio input using the model.
 
         Args:
-            messages: List of message dictionaries
-            temperature: Sampling temperature
-            max_tokens: Maximum tokens to generate
-            audio: Audio data as numpy array
+            messages: List of message dictionaries for conversation context.
+            temperature: Sampling temperature for generation randomness.
+            max_tokens: Maximum number of tokens to generate.
+            audio: Audio data as numpy array in float32 format.
 
         Yields:
-            str: JSON chunks of the generated response
+            str: JSON chunks of the generated response in OpenAI format.
         """
        sampling_params = SamplingParams(
            temperature=temperature, max_tokens=max_tokens, stop_token_ids=self.stop_token_ids
@@ -173,22 +170,9 @@ class UltravoxModel:
 class UltravoxSTTService(AIService):
     """Service to transcribe audio using the Ultravox multimodal model.
 
-    This service collects audio frames and processes them with Ultravox
-    to generate text transcriptions.
-
-    Args:
-        model_name: The Ultravox model to use (ModelSize enum or string)
-        hf_token: Hugging Face token for model access
-        temperature: Sampling temperature for generation
-        max_tokens: Maximum tokens to generate
-        **kwargs: Additional arguments passed to AIService
-
-    Attributes:
-        model: The UltravoxModel instance
-        buffer: Buffer to collect audio frames
-        temperature: Temperature for text generation
-        max_tokens: Maximum tokens to generate
-        _connection_active: Flag indicating if service is active
+    This service collects audio frames during speech and processes them with
+    Ultravox to generate text transcriptions. It handles real-time audio
+    buffering, model warm-up, and streaming text generation.
     """
 
     def __init__(
@@ -200,6 +184,17 @@ class UltravoxSTTService(AIService):
         max_tokens: int = 100,
         **kwargs,
     ):
+        """Initialize the UltravoxSTTService.
+
+        Args:
+            model_name: The Ultravox model to use. Defaults to
+                "fixie-ai/ultravox-v0_5-llama-3_1-8b".
+            hf_token: Hugging Face token for model access. If None, will try
+                to use HF_TOKEN environment variable.
+            temperature: Sampling temperature for generation. Defaults to 0.7.
+            max_tokens: Maximum tokens to generate. Defaults to 100.
+            **kwargs: Additional arguments passed to AIService.
+        """
        super().__init__(**kwargs)
 
        # Authenticate with Hugging Face if token provided
@@ -283,8 +278,11 @@ class UltravoxSTTService(AIService):
     async def start(self, frame: StartFrame):
         """Handle service start.
 
+        Starts the service, marks it as active, and performs model warm-up
+        to ensure optimal performance for the first inference.
+
         Args:
-            frame: StartFrame that triggered this method
+            frame: StartFrame that triggered this method.
         """
        await super().start(frame)
        self._connection_active = True
@@ -296,8 +294,10 @@ class UltravoxSTTService(AIService):
     async def stop(self, frame: EndFrame):
         """Handle service stop.
 
+        Stops the service and marks it as inactive.
+
         Args:
-            frame: EndFrame that triggered this method
+            frame: EndFrame that triggered this method.
         """
        await super().stop(frame)
        self._connection_active = False
@@ -306,8 +306,10 @@ class UltravoxSTTService(AIService):
     async def cancel(self, frame: CancelFrame):
         """Handle service cancellation.
 
+        Cancels the service, clears any buffered audio, and marks it as inactive.
+
         Args:
-            frame: CancelFrame that triggered this method
+            frame: CancelFrame that triggered this method.
         """
        await super().cancel(frame)
        self._connection_active = False
@@ -317,11 +319,12 @@ class UltravoxSTTService(AIService):
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         """Process incoming frames.
 
-        This method collects audio frames and processes them when speech ends.
+        This method collects audio frames during speech and processes them
+        when speech ends to generate text transcriptions.
 
         Args:
-            frame: The frame to process
-            direction: Direction of the frame (input/output)
+            frame: The frame to process.
+            direction: Direction of the frame (input/output).
         """
        await super().process_frame(frame, direction)

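As a usage note for the constructor documented above, here is a minimal sketch, assuming only the signature and defaults shown in this diff; the import path follows the files-changed list and is not otherwise confirmed here:

from pipecat.services.ultravox.stt import UltravoxSTTService

# Values mirror the documented defaults in the __init__ docstring above.
stt = UltravoxSTTService(
    model_name="fixie-ai/ultravox-v0_5-llama-3_1-8b",
    hf_token=None,  # falls back to the HF_TOKEN environment variable
    temperature=0.7,
    max_tokens=100,
)

Since the class subclasses AIService, it can then be placed in a pipeline like any other service, collecting audio frames and emitting streamed text.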
pipecat/services/vision_service.py

@@ -25,12 +25,14 @@ class VisionService(AIService):
     Provides common functionality for vision services that process images and
     generate textual responses. Handles image frame processing and integrates
     with the AI service infrastructure for metrics and lifecycle management.
-
-    Args:
-        **kwargs: Additional arguments passed to the parent AIService.
     """
 
     def __init__(self, **kwargs):
+        """Initialize the vision service.
+
+        Args:
+            **kwargs: Additional arguments passed to the parent AIService.
+        """
        super().__init__(**kwargs)
        self._describe_text = None

pipecat/services/websocket_service.py

@@ -12,6 +12,7 @@ from typing import Awaitable, Callable, Optional
 
 import websockets
 from loguru import logger
+from websockets.exceptions import ConnectionClosedOK
 from websockets.protocol import State
 
 from pipecat.frames.frames import ErrorFrame
@@ -24,13 +25,15 @@ class WebsocketService(ABC):
     Provides websocket connection management, automatic reconnection with
     exponential backoff, connection verification, and error handling.
     Subclasses implement service-specific connection and message handling logic.
-
-    Args:
-        reconnect_on_error: Whether to automatically reconnect on connection errors.
-        **kwargs: Additional arguments (unused, for compatibility).
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
+        """Initialize the websocket service.
+
+        Args:
+            reconnect_on_error: Whether to automatically reconnect on connection errors.
+            **kwargs: Additional arguments (unused, for compatibility).
+        """
        self._websocket: Optional[websockets.WebSocketClientProtocol] = None
        self._reconnect_on_error = reconnect_on_error
 
@@ -41,7 +44,7 @@ class WebsocketService(ABC):
             True if connection is verified working, False otherwise.
         """
        try:
-            if not self._websocket or self._websocket.closed:
+            if not self._websocket or self._websocket.state is State.CLOSED:
                return False
            await self._websocket.ping()
            return True
@@ -80,12 +83,10 @@ class WebsocketService(ABC):
            try:
                await self._receive_messages()
                retry_count = 0  # Reset counter on successful message receive
-                if self._websocket and self._websocket.state == State.CLOSED:
-                    raise websockets.ConnectionClosedOK(
-                        self._websocket.close_rcvd,
-                        self._websocket.close_sent,
-                        self._websocket.close_rcvd_then_sent,
-                    )
+            except ConnectionClosedOK as e:
+                # Normal closure, don't retry
+                logger.debug(f"{self} connection closed normally: {e}")
+                break
            except Exception as e:
                message = f"{self} error receiving messages: {e}"
                logger.error(message)
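The two behavioral changes above track the modern websockets API: connection liveness is read from the State enum rather than the old closed attribute, and a clean shutdown now surfaces as a caught ConnectionClosedOK instead of being raised manually after the receive loop returns. A minimal standalone sketch of the same pattern (the helper names are illustrative, not part of pipecat):

from websockets.exceptions import ConnectionClosedOK
from websockets.protocol import State

def is_usable(ws) -> bool:
    # The modern websockets API exposes connection state as an enum;
    # the legacy `closed` boolean is no longer available.
    return ws is not None and ws.state is not State.CLOSED

async def receive_loop(ws):
    try:
        async for message in ws:
            ...  # handle message
    except ConnectionClosedOK:
        # Close code 1000/1001: the peer closed cleanly, so don't retry.
        pass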
pipecat/services/whisper/base_stt.py

@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Base class for Whisper-based speech-to-text services.
+
+This module provides common functionality for services implementing the Whisper API
+interface, including language mapping, metrics generation, and error handling.
+"""
+
 from typing import AsyncGenerator, Optional
 
 from loguru import logger
@@ -18,9 +24,16 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 
 
 def language_to_whisper_language(language: Language) -> Optional[str]:
-    """Language support for Whisper API.
+    """Maps pipecat Language enum to Whisper API language codes.
 
+    Language support for Whisper API.
     Docs: https://platform.openai.com/docs/guides/speech-to-text#supported-languages
+
+    Args:
+        language: A Language enum value representing the input language.
+
+    Returns:
+        str or None: The corresponding Whisper language code, or None if not supported.
     """
    BASE_LANGUAGES = {
        Language.AF: "af",
@@ -98,15 +111,6 @@ class BaseWhisperSTTService(SegmentedSTTService):
 
     Provides common functionality for services implementing the Whisper API interface,
     including metrics generation and error handling.
-
-    Args:
-        model: Name of the Whisper model to use.
-        api_key: Service API key. Defaults to None.
-        base_url: Service API base URL. Defaults to None.
-        language: Language of the audio input. Defaults to English.
-        prompt: Optional text to guide the model's style or continue a previous segment.
-        temperature: Sampling temperature between 0 and 1. Defaults to 0.0.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
     """
 
     def __init__(
@@ -120,6 +124,17 @@ class BaseWhisperSTTService(SegmentedSTTService):
         temperature: Optional[float] = None,
         **kwargs,
     ):
+        """Initialize the Whisper STT service.
+
+        Args:
+            model: Name of the Whisper model to use.
+            api_key: Service API key. Defaults to None.
+            base_url: Service API base URL. Defaults to None.
+            language: Language of the audio input. Defaults to English.
+            prompt: Optional text to guide the model's style or continue a previous segment.
+            temperature: Sampling temperature between 0 and 1. Defaults to 0.0.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
        super().__init__(**kwargs)
        self.set_model_name(model)
        self._client = self._create_client(api_key, base_url)
@@ -138,12 +153,30 @@ class BaseWhisperSTTService(SegmentedSTTService):
        return AsyncOpenAI(api_key=api_key, base_url=base_url)
 
     async def set_model(self, model: str):
+        """Set the model name for transcription.
+
+        Args:
+            model: The name of the model to use.
+        """
        self.set_model_name(model)
 
     def can_generate_metrics(self) -> bool:
+        """Indicates whether this service can generate metrics.
+
+        Returns:
+            bool: True, as this service supports metric generation.
+        """
        return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert from pipecat Language to service language code.
+
+        Args:
+            language: The Language enum value to convert.
+
+        Returns:
+            str or None: The corresponding service language code, or None if not supported.
+        """
        return language_to_whisper_language(language)
 
     async def set_language(self, language: Language):
@@ -153,7 +186,7 @@ class BaseWhisperSTTService(SegmentedSTTService):
             language: The Language enum value to use for transcription.
         """
        logger.info(f"Switching STT language to: [{language}]")
-        self._language = language
+        self._language = self.language_to_service_language(language)
 
     @traced_stt
     async def _handle_transcription(
@@ -163,6 +196,15 @@ class BaseWhisperSTTService(SegmentedSTTService):
        pass
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Transcribe audio data to text.
+
+        Args:
+            audio: Raw audio data to transcribe.
+
+        Yields:
+            Frame: Either a TranscriptionFrame containing the transcribed text
+                or an ErrorFrame if transcription fails.
+        """
        try:
            await self.start_processing_metrics()
            await self.start_ttfb_metrics()
@@ -177,7 +219,11 @@ class BaseWhisperSTTService(SegmentedSTTService):
            if text:
                await self._handle_transcription(text, True, self._language)
                logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(text, "", time_now_iso8601())
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                )
            else:
                logger.warning("Received empty transcription from API")

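Note the set_language fix above: the service previously stored the raw Language enum and passed it through to the API; it now stores the mapped Whisper code. A sketch of the corrected behavior using the module's own mapping function, assuming French is in the mapping (it is on Whisper's supported-language list); the import paths follow the files-changed list:

from pipecat.services.whisper.base_stt import language_to_whisper_language
from pipecat.transcriptions.language import Language

# Before the fix, self._language held Language.FR (an enum member); after
# it, it holds the code the Whisper API actually accepts.
assert language_to_whisper_language(Language.FR) == "fr"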
pipecat/services/whisper/stt.py

@@ -4,7 +4,11 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""This module implements Whisper transcription with a locally-downloaded model."""
+"""Whisper speech-to-text services with locally-downloaded models.
+
+This module implements Whisper transcription using locally-downloaded models,
+supporting both Faster Whisper and MLX Whisper backends for efficient inference.
+"""
 
 import asyncio
 from enum import Enum
@@ -37,25 +41,29 @@ if TYPE_CHECKING:
 
 
 class Model(Enum):
-    """Class of basic Whisper model selection options.
-
-    Available models:
-        Multilingual models:
-            TINY: Smallest multilingual model
-            BASE: Basic multilingual model
-            MEDIUM: Good balance for multilingual
-            LARGE: Best quality multilingual
-            DISTIL_LARGE_V2: Fast multilingual
-
-        English-only models:
-            DISTIL_MEDIUM_EN: Fast English-only
+    """Whisper model selection options for Faster Whisper.
+
+    Provides various model sizes and specializations for speech recognition,
+    balancing quality and performance based on use case requirements.
+
+    Parameters:
+        TINY: Smallest multilingual model, fastest inference.
+        BASE: Basic multilingual model, good speed/quality balance.
+        SMALL: Small multilingual model, better speed/quality balance than BASE.
+        MEDIUM: Medium-sized multilingual model, better quality.
+        LARGE: Best quality multilingual model, slower inference.
+        LARGE_V3_TURBO: Fast multilingual model, slightly lower quality than LARGE.
+        DISTIL_LARGE_V2: Fast multilingual distilled model.
+        DISTIL_MEDIUM_EN: Fast English-only distilled model.
     """
 
     # Multilingual models
    TINY = "tiny"
    BASE = "base"
+    SMALL = "small"
    MEDIUM = "medium"
    LARGE = "large-v3"
+    LARGE_V3_TURBO = "deepdml/faster-whisper-large-v3-turbo-ct2"
    DISTIL_LARGE_V2 = "Systran/faster-distil-whisper-large-v2"
 
     # English-only models
@@ -63,16 +71,18 @@ class Model(Enum):
 
 
 class MLXModel(Enum):
-    """Class of MLX Whisper model selection options.
-
-    Available models:
-        Multilingual models:
-            TINY: Smallest multilingual model
-            MEDIUM: Good balance for multilingual
-            LARGE_V3: Best quality multilingual
-            LARGE_V3_TURBO: Finetuned, pruned Whisper large-v3, much faster, slightly lower quality
-            DISTIL_LARGE_V3: Fast multilingual
-            LARGE_V3_TURBO_Q4: LARGE_V3_TURBO, quantized to Q4
+    """MLX Whisper model selection options for Apple Silicon.
+
+    Provides various model sizes optimized for Apple Silicon hardware,
+    including quantized variants for improved performance.
+
+    Parameters:
+        TINY: Smallest multilingual model for MLX.
+        MEDIUM: Medium-sized multilingual model for MLX.
+        LARGE_V3: Best quality multilingual model for MLX.
+        LARGE_V3_TURBO: Finetuned, pruned Whisper large-v3, much faster with slightly lower quality.
+        DISTIL_LARGE_V3: Fast multilingual distilled model for MLX.
+        LARGE_V3_TURBO_Q4: LARGE_V3_TURBO quantized to Q4 for reduced memory usage.
     """
 
     # Multilingual models
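For the two newly added Faster Whisper options, a minimal selection sketch; the enum members and values are taken directly from the hunk above, and the import path follows the files-changed list:

from pipecat.services.whisper.stt import Model, WhisperSTTService

# New in this release: a "small" checkpoint and a CT2 conversion of
# Whisper large-v3-turbo for faster multilingual inference.
stt_small = WhisperSTTService(model=Model.SMALL)
stt_turbo = WhisperSTTService(model=Model.LARGE_V3_TURBO)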
@@ -256,21 +266,6 @@ class WhisperSTTService(SegmentedSTTService):
 
     This service uses Faster Whisper to perform speech-to-text transcription on audio
     segments. It supports multiple languages and various model sizes.
-
-    Args:
-        model: The Whisper model to use for transcription. Can be a Model enum or string.
-        device: The device to run inference on ('cpu', 'cuda', or 'auto').
-        compute_type: The compute type for inference ('default', 'int8', 'int8_float16', etc.).
-        no_speech_prob: Probability threshold for filtering out non-speech segments.
-        language: The default language for transcription.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
-
-    Attributes:
-        _device: The device used for inference.
-        _compute_type: The compute type for inference.
-        _no_speech_prob: Threshold for non-speech filtering.
-        _model: The loaded Whisper model instance.
-        _settings: Dictionary containing service settings.
     """
 
     def __init__(
@@ -283,6 +278,16 @@ class WhisperSTTService(SegmentedSTTService):
         language: Language = Language.EN,
         **kwargs,
     ):
+        """Initialize the Whisper STT service.
+
+        Args:
+            model: The Whisper model to use for transcription. Can be a Model enum or string.
+            device: The device to run inference on ('cpu', 'cuda', or 'auto').
+            compute_type: The compute type for inference ('default', 'int8', 'int8_float16', etc.).
+            no_speech_prob: Probability threshold for filtering out non-speech segments.
+            language: The default language for transcription.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
        super().__init__(**kwargs)
        self._device: str = device
        self._compute_type = compute_type
@@ -355,7 +360,7 @@ class WhisperSTTService(SegmentedSTTService):
        pass
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Transcribes given audio using Whisper.
+        """Transcribe audio data using Whisper.
 
         Args:
             audio: Raw audio bytes in 16-bit PCM format.
@@ -394,7 +399,12 @@ class WhisperSTTService(SegmentedSTTService):
            if text:
                await self._handle_transcription(text, True, self._settings["language"])
                logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(text, "", time_now_iso8601(), self._settings["language"])
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    self._settings["language"],
+                )
 
 
 class WhisperSTTServiceMLX(WhisperSTTService):
@@ -402,18 +412,6 @@ class WhisperSTTServiceMLX(WhisperSTTService):
 
     This service uses MLX Whisper to perform speech-to-text transcription on audio
     segments. It's optimized for Apple Silicon and supports multiple languages and quantizations.
-
-    Args:
-        model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.
-        no_speech_prob: Probability threshold for filtering out non-speech segments.
-        language: The default language for transcription.
-        temperature: Temperature for sampling. Can be a float or tuple of floats.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
-
-    Attributes:
-        _no_speech_threshold: Threshold for non-speech filtering.
-        _temperature: Temperature for sampling.
-        _settings: Dictionary containing service settings.
     """
 
     def __init__(
@@ -425,6 +423,15 @@ class WhisperSTTServiceMLX(WhisperSTTService):
         temperature: float = 0.0,
         **kwargs,
     ):
+        """Initialize the MLX Whisper STT service.
+
+        Args:
+            model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.
+            no_speech_prob: Probability threshold for filtering out non-speech segments.
+            language: The default language for transcription.
+            temperature: Temperature for sampling. Can be a float or tuple of floats.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
        # Skip WhisperSTTService.__init__ and call its parent directly
        SegmentedSTTService.__init__(self, **kwargs)
 
@@ -455,7 +462,10 @@ class WhisperSTTServiceMLX(WhisperSTTService):
 
     @override
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """Transcribes given audio using MLX Whisper.
+        """Transcribe audio data using MLX Whisper.
+
+        The audio is expected to be 16-bit signed PCM data.
+        MLX Whisper will handle the conversion internally.
 
         Args:
             audio: Raw audio bytes in 16-bit PCM format.
@@ -463,10 +473,6 @@ class WhisperSTTServiceMLX(WhisperSTTService):
 
         Yields:
             Frame: Either a TranscriptionFrame containing the transcribed text
                 or an ErrorFrame if transcription fails.
-
-        Note:
-            The audio is expected to be 16-bit signed PCM data.
-            MLX Whisper will handle the conversion internally.
         """
        try:
            import mlx_whisper
@@ -503,7 +509,12 @@ class WhisperSTTServiceMLX(WhisperSTTService):
            if text:
                await self._handle_transcription(text, True, self._settings["language"])
                logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(text, "", time_now_iso8601(), self._settings["language"])
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    self._settings["language"],
+                )
 
        except Exception as e:
            logger.exception(f"MLX Whisper transcription error: {e}")
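Finally, a cross-cutting change in this diff: every run_stt now attributes transcripts to the service's user id rather than an empty string. A downstream consumer can rely on that field; a minimal sketch, assuming the standard TranscriptionFrame attributes:

from pipecat.frames.frames import TranscriptionFrame

def log_transcript(frame) -> None:
    # TranscriptionFrame now carries a real user_id instead of "".
    if isinstance(frame, TranscriptionFrame):
        print(f"[{frame.user_id}] {frame.text}")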