dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/ultravox/stt.py
CHANGED
@@ -44,13 +44,12 @@ except ModuleNotFoundError as e:
 class AudioBuffer:
     """Buffer to collect audio frames before processing.
 
-
-
-        started_at: Timestamp when speech started
-        is_processing: Flag to prevent concurrent processing
+    Manages the collection and state of audio frames during speech
+    recording sessions, including timing and processing flags.
     """
 
     def __init__(self):
+        """Initialize the audio buffer."""
         self.frames: List[AudioRawFrame] = []
         self.started_at: Optional[float] = None
         self.is_processing: bool = False
@@ -59,19 +58,17 @@
 class UltravoxModel:
     """Model wrapper for the Ultravox multimodal model.
 
-    This class handles loading and running the Ultravox model for speech-to-text
-
-    Args:
-        model_name: The name or path of the Ultravox model to load
-
-    Attributes:
-        model_name: The name of the loaded model
-        engine: The vLLM engine for model inference
-        tokenizer: The tokenizer for the model
-        stop_token_ids: Optional token IDs to stop generation
+    This class handles loading and running the Ultravox model for speech-to-text
+    transcription using vLLM for efficient inference.
     """
 
     def __init__(self, model_name: str = "fixie-ai/ultravox-v0_5-llama-3_1-8b"):
+        """Initialize the Ultravox model.
+
+        Args:
+            model_name: The name or path of the Ultravox model to load.
+                Defaults to "fixie-ai/ultravox-v0_5-llama-3_1-8b".
+        """
         self.model_name = model_name
         self._initialize_engine()
         self._initialize_tokenizer()
@@ -95,10 +92,10 @@
         """Format chat messages into a prompt for the model.
 
         Args:
-            messages: List of message dictionaries with 'role' and 'content'
+            messages: List of message dictionaries with 'role' and 'content'.
 
         Returns:
-            str: Formatted prompt string
+            str: Formatted prompt string ready for model input.
         """
         return self.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
@@ -114,13 +111,13 @@
         """Generate text from audio input using the model.
 
         Args:
-            messages: List of message dictionaries
-            temperature: Sampling temperature
-            max_tokens: Maximum tokens to generate
-            audio: Audio data as numpy array
+            messages: List of message dictionaries for conversation context.
+            temperature: Sampling temperature for generation randomness.
+            max_tokens: Maximum number of tokens to generate.
+            audio: Audio data as numpy array in float32 format.
 
         Yields:
-            str: JSON chunks of the generated response
+            str: JSON chunks of the generated response in OpenAI format.
         """
         sampling_params = SamplingParams(
             temperature=temperature, max_tokens=max_tokens, stop_token_ids=self.stop_token_ids
@@ -173,22 +170,9 @@
 class UltravoxSTTService(AIService):
     """Service to transcribe audio using the Ultravox multimodal model.
 
-    This service collects audio frames and processes them with
-    to generate text transcriptions.
-
-    Args:
-        model_name: The Ultravox model to use (ModelSize enum or string)
-        hf_token: Hugging Face token for model access
-        temperature: Sampling temperature for generation
-        max_tokens: Maximum tokens to generate
-        **kwargs: Additional arguments passed to AIService
-
-    Attributes:
-        model: The UltravoxModel instance
-        buffer: Buffer to collect audio frames
-        temperature: Temperature for text generation
-        max_tokens: Maximum tokens to generate
-        _connection_active: Flag indicating if service is active
+    This service collects audio frames during speech and processes them with
+    Ultravox to generate text transcriptions. It handles real-time audio
+    buffering, model warm-up, and streaming text generation.
     """
 
     def __init__(
@@ -200,6 +184,17 @@
         max_tokens: int = 100,
         **kwargs,
     ):
+        """Initialize the UltravoxSTTService.
+
+        Args:
+            model_name: The Ultravox model to use. Defaults to
+                "fixie-ai/ultravox-v0_5-llama-3_1-8b".
+            hf_token: Hugging Face token for model access. If None, will try
+                to use HF_TOKEN environment variable.
+            temperature: Sampling temperature for generation. Defaults to 0.7.
+            max_tokens: Maximum tokens to generate. Defaults to 100.
+            **kwargs: Additional arguments passed to AIService.
+        """
         super().__init__(**kwargs)
 
         # Authenticate with Hugging Face if token provided
@@ -283,8 +278,11 @@
     async def start(self, frame: StartFrame):
         """Handle service start.
 
+        Starts the service, marks it as active, and performs model warm-up
+        to ensure optimal performance for the first inference.
+
         Args:
-            frame: StartFrame that triggered this method
+            frame: StartFrame that triggered this method.
         """
         await super().start(frame)
         self._connection_active = True
@@ -296,8 +294,10 @@
     async def stop(self, frame: EndFrame):
         """Handle service stop.
 
+        Stops the service and marks it as inactive.
+
         Args:
-            frame: EndFrame that triggered this method
+            frame: EndFrame that triggered this method.
         """
         await super().stop(frame)
         self._connection_active = False
@@ -306,8 +306,10 @@
    async def cancel(self, frame: CancelFrame):
         """Handle service cancellation.
 
+        Cancels the service, clears any buffered audio, and marks it as inactive.
+
         Args:
-            frame: CancelFrame that triggered this method
+            frame: CancelFrame that triggered this method.
         """
         await super().cancel(frame)
         self._connection_active = False
@@ -317,11 +319,12 @@
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         """Process incoming frames.
 
-        This method collects audio frames and processes them
+        This method collects audio frames during speech and processes them
+        when speech ends to generate text transcriptions.
 
         Args:
-            frame: The frame to process
-            direction: Direction of the frame (input/output)
+            frame: The frame to process.
+            direction: Direction of the frame (input/output).
         """
         await super().process_frame(frame, direction)
 
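Taken together, the Ultravox changes move parameter documentation out of the class docstrings and into `__init__`, Google-style. A minimal construction sketch, assuming pipecat is installed with its Ultravox/vLLM dependencies; the argument names and defaults come from the docstrings added above, everything else is illustrative:

```python
# Minimal sketch; constructor arguments and defaults are taken from the
# docstrings added in this diff, the rest is illustrative.
import os

from pipecat.services.ultravox.stt import UltravoxSTTService

stt = UltravoxSTTService(
    model_name="fixie-ai/ultravox-v0_5-llama-3_1-8b",  # default per the new docstring
    hf_token=os.environ.get("HF_TOKEN"),  # falls back to the HF_TOKEN env var if None
    temperature=0.7,  # sampling temperature, default per the new docstring
    max_tokens=100,   # generation cap, default per the new docstring
)
```

Note that constructing the service loads the vLLM engine, so this is not a lightweight call.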
pipecat/services/vision_service.py
CHANGED

@@ -25,12 +25,14 @@ class VisionService(AIService):
     Provides common functionality for vision services that process images and
     generate textual responses. Handles image frame processing and integrates
     with the AI service infrastructure for metrics and lifecycle management.
-
-    Args:
-        **kwargs: Additional arguments passed to the parent AIService.
     """
 
     def __init__(self, **kwargs):
+        """Initialize the vision service.
+
+        Args:
+            **kwargs: Additional arguments passed to the parent AIService.
+        """
         super().__init__(**kwargs)
         self._describe_text = None
 
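This hunk only relocates the `**kwargs` documentation into `__init__`. For orientation, a toy subclass sketch; the abstract `run_vision` hook, `VisionImageRawFrame`, and its `size` field are assumptions based on other pipecat modules, not shown in this hunk:

```python
# Hypothetical subclass sketch; `run_vision`, `VisionImageRawFrame`, and
# `frame.size` are assumptions not confirmed by this diff.
from typing import AsyncGenerator

from pipecat.frames.frames import Frame, TextFrame, VisionImageRawFrame
from pipecat.services.vision_service import VisionService


class EchoVisionService(VisionService):
    """Toy vision service that "describes" an image by reporting its size."""

    async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
        width, height = frame.size
        yield TextFrame(f"Received a {width}x{height} image")
```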
pipecat/services/websocket_service.py
CHANGED

@@ -12,6 +12,7 @@ from typing import Awaitable, Callable, Optional
 
 import websockets
 from loguru import logger
+from websockets.exceptions import ConnectionClosedOK
 from websockets.protocol import State
 
 from pipecat.frames.frames import ErrorFrame
@@ -24,13 +25,15 @@ class WebsocketService(ABC):
     Provides websocket connection management, automatic reconnection with
     exponential backoff, connection verification, and error handling.
     Subclasses implement service-specific connection and message handling logic.
-
-    Args:
-        reconnect_on_error: Whether to automatically reconnect on connection errors.
-        **kwargs: Additional arguments (unused, for compatibility).
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
+        """Initialize the websocket service.
+
+        Args:
+            reconnect_on_error: Whether to automatically reconnect on connection errors.
+            **kwargs: Additional arguments (unused, for compatibility).
+        """
         self._websocket: Optional[websockets.WebSocketClientProtocol] = None
         self._reconnect_on_error = reconnect_on_error
 
@@ -41,7 +44,7 @@
             True if connection is verified working, False otherwise.
         """
         try:
-            if not self._websocket or self._websocket.
+            if not self._websocket or self._websocket.state is State.CLOSED:
                 return False
             await self._websocket.ping()
             return True
@@ -80,12 +83,10 @@
             try:
                 await self._receive_messages()
                 retry_count = 0  # Reset counter on successful message receive
-
-
-
-
-                    self._websocket.close_rcvd_then_sent,
-            )
+            except ConnectionClosedOK as e:
+                # Normal closure, don't retry
+                logger.debug(f"{self} connection closed normally: {e}")
+                break
             except Exception as e:
                 message = f"{self} error receiving messages: {e}"
                 logger.error(message)
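The behavioral fix here: a websocket that closes with a normal close code (`ConnectionClosedOK`) now exits the receive loop cleanly instead of falling into the error/retry path. A standalone sketch of the reconnect-with-exponential-backoff pattern the class implements; `receive` and `connect` are illustrative callables, not pipecat's internal API:

```python
# Standalone sketch of the pattern; not pipecat's internal implementation.
import asyncio

from websockets.exceptions import ConnectionClosedOK


async def receive_loop(receive, connect, max_retries: int = 5):
    retry_count = 0
    while retry_count < max_retries:
        try:
            await receive()
            retry_count = 0  # reset after a successful receive, as in the diff
        except ConnectionClosedOK:
            break  # normal closure: stop without retrying (the new behavior)
        except Exception:
            retry_count += 1
            await asyncio.sleep(min(2**retry_count, 30))  # exponential backoff
            await connect()  # try to re-establish the connection
```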
pipecat/services/whisper/base_stt.py
CHANGED

@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Base class for Whisper-based speech-to-text services.
+
+This module provides common functionality for services implementing the Whisper API
+interface, including language mapping, metrics generation, and error handling.
+"""
+
 from typing import AsyncGenerator, Optional
 
 from loguru import logger
@@ -18,9 +24,16 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 
 
 def language_to_whisper_language(language: Language) -> Optional[str]:
-    """Language
+    """Maps pipecat Language enum to Whisper API language codes.
 
+    Language support for Whisper API.
     Docs: https://platform.openai.com/docs/guides/speech-to-text#supported-languages
+
+    Args:
+        language: A Language enum value representing the input language.
+
+    Returns:
+        str or None: The corresponding Whisper language code, or None if not supported.
     """
     BASE_LANGUAGES = {
         Language.AF: "af",
@@ -98,15 +111,6 @@ class BaseWhisperSTTService(SegmentedSTTService):
 
     Provides common functionality for services implementing the Whisper API interface,
     including metrics generation and error handling.
-
-    Args:
-        model: Name of the Whisper model to use.
-        api_key: Service API key. Defaults to None.
-        base_url: Service API base URL. Defaults to None.
-        language: Language of the audio input. Defaults to English.
-        prompt: Optional text to guide the model's style or continue a previous segment.
-        temperature: Sampling temperature between 0 and 1. Defaults to 0.0.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
     """
 
     def __init__(
@@ -120,6 +124,17 @@
         temperature: Optional[float] = None,
         **kwargs,
     ):
+        """Initialize the Whisper STT service.
+
+        Args:
+            model: Name of the Whisper model to use.
+            api_key: Service API key. Defaults to None.
+            base_url: Service API base URL. Defaults to None.
+            language: Language of the audio input. Defaults to English.
+            prompt: Optional text to guide the model's style or continue a previous segment.
+            temperature: Sampling temperature between 0 and 1. Defaults to 0.0.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         super().__init__(**kwargs)
         self.set_model_name(model)
         self._client = self._create_client(api_key, base_url)
@@ -138,12 +153,30 @@
         return AsyncOpenAI(api_key=api_key, base_url=base_url)
 
     async def set_model(self, model: str):
+        """Set the model name for transcription.
+
+        Args:
+            model: The name of the model to use.
+        """
         self.set_model_name(model)
 
     def can_generate_metrics(self) -> bool:
+        """Indicates whether this service can generate metrics.
+
+        Returns:
+            bool: True, as this service supports metric generation.
+        """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert from pipecat Language to service language code.
+
+        Args:
+            language: The Language enum value to convert.
+
+        Returns:
+            str or None: The corresponding service language code, or None if not supported.
+        """
         return language_to_whisper_language(language)
 
     async def set_language(self, language: Language):
@@ -153,7 +186,7 @@
             language: The Language enum value to use for transcription.
         """
         logger.info(f"Switching STT language to: [{language}]")
-        self._language = language
+        self._language = self.language_to_service_language(language)
 
     @traced_stt
     async def _handle_transcription(
@@ -163,6 +196,15 @@
         pass
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Transcribe audio data to text.
+
+        Args:
+            audio: Raw audio data to transcribe.
+
+        Yields:
+            Frame: Either a TranscriptionFrame containing the transcribed text
+                or an ErrorFrame if transcription fails.
+        """
         try:
             await self.start_processing_metrics()
             await self.start_ttfb_metrics()
@@ -177,7 +219,11 @@
             if text:
                 await self._handle_transcription(text, True, self._language)
                 logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                )
             else:
                 logger.warning("Received empty transcription from API")
 
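The substantive fix is in `set_language`: it previously stored the `Language` enum itself in `self._language`, while the Whisper API expects a language code string; it now converts via `language_to_service_language`. A quick illustration of the mapping; `Language.AF -> "af"` appears in the hunk above, while `Language.EN -> "en"` is an assumption consistent with Whisper's documented codes:

```python
# Illustrative only; the AF mapping is shown in the diff, the EN mapping is
# an assumption based on Whisper's documented language codes.
from pipecat.services.whisper.base_stt import language_to_whisper_language
from pipecat.transcriptions.language import Language

assert language_to_whisper_language(Language.AF) == "af"
assert language_to_whisper_language(Language.EN) == "en"
```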
pipecat/services/whisper/stt.py
CHANGED
@@ -4,7 +4,11 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""
+"""Whisper speech-to-text services with locally-downloaded models.
+
+This module implements Whisper transcription using locally-downloaded models,
+supporting both Faster Whisper and MLX Whisper backends for efficient inference.
+"""
 
 import asyncio
 from enum import Enum
@@ -37,25 +41,29 @@ if TYPE_CHECKING:
 
 
 class Model(Enum):
-    """
-
-
-
-
-
-
-
-
-
-
+    """Whisper model selection options for Faster Whisper.
+
+    Provides various model sizes and specializations for speech recognition,
+    balancing quality and performance based on use case requirements.
+
+    Parameters:
+        TINY: Smallest multilingual model, fastest inference.
+        BASE: Basic multilingual model, good speed/quality balance.
+        SMALL: Small multilingual model, better speed/quality balance than BASE.
+        MEDIUM: Medium-sized multilingual model, better quality.
+        LARGE: Best quality multilingual model, slower inference.
+        LARGE_V3_TURBO: Fast multilingual model, slightly lower quality than LARGE.
+        DISTIL_LARGE_V2: Fast multilingual distilled model.
+        DISTIL_MEDIUM_EN: Fast English-only distilled model.
     """
 
     # Multilingual models
     TINY = "tiny"
     BASE = "base"
+    SMALL = "small"
     MEDIUM = "medium"
     LARGE = "large-v3"
+    LARGE_V3_TURBO = "deepdml/faster-whisper-large-v3-turbo-ct2"
     DISTIL_LARGE_V2 = "Systran/faster-distil-whisper-large-v2"
 
     # English-only models
@@ -63,16 +71,18 @@ class Model(Enum):
 
 
 class MLXModel(Enum):
-    """
-
-
-
-
-
-
-
-
+    """MLX Whisper model selection options for Apple Silicon.
+
+    Provides various model sizes optimized for Apple Silicon hardware,
+    including quantized variants for improved performance.
+
+    Parameters:
+        TINY: Smallest multilingual model for MLX.
+        MEDIUM: Medium-sized multilingual model for MLX.
+        LARGE_V3: Best quality multilingual model for MLX.
+        LARGE_V3_TURBO: Finetuned, pruned Whisper large-v3, much faster with slightly lower quality.
+        DISTIL_LARGE_V3: Fast multilingual distilled model for MLX.
+        LARGE_V3_TURBO_Q4: LARGE_V3_TURBO quantized to Q4 for reduced memory usage.
     """
 
     # Multilingual models
@@ -256,21 +266,6 @@ class WhisperSTTService(SegmentedSTTService):
 
     This service uses Faster Whisper to perform speech-to-text transcription on audio
     segments. It supports multiple languages and various model sizes.
-
-    Args:
-        model: The Whisper model to use for transcription. Can be a Model enum or string.
-        device: The device to run inference on ('cpu', 'cuda', or 'auto').
-        compute_type: The compute type for inference ('default', 'int8', 'int8_float16', etc.).
-        no_speech_prob: Probability threshold for filtering out non-speech segments.
-        language: The default language for transcription.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
-
-    Attributes:
-        _device: The device used for inference.
-        _compute_type: The compute type for inference.
-        _no_speech_prob: Threshold for non-speech filtering.
-        _model: The loaded Whisper model instance.
-        _settings: Dictionary containing service settings.
     """
 
     def __init__(
@@ -283,6 +278,16 @@
         language: Language = Language.EN,
         **kwargs,
     ):
+        """Initialize the Whisper STT service.
+
+        Args:
+            model: The Whisper model to use for transcription. Can be a Model enum or string.
+            device: The device to run inference on ('cpu', 'cuda', or 'auto').
+            compute_type: The compute type for inference ('default', 'int8', 'int8_float16', etc.).
+            no_speech_prob: Probability threshold for filtering out non-speech segments.
+            language: The default language for transcription.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         super().__init__(**kwargs)
         self._device: str = device
         self._compute_type = compute_type
@@ -355,7 +360,7 @@
         pass
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """
+        """Transcribe audio data using Whisper.
 
         Args:
             audio: Raw audio bytes in 16-bit PCM format.
@@ -394,7 +399,12 @@
             if text:
                 await self._handle_transcription(text, True, self._settings["language"])
                 logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    self._settings["language"],
+                )
 
 
 class WhisperSTTServiceMLX(WhisperSTTService):
@@ -402,18 +412,6 @@
 
     This service uses MLX Whisper to perform speech-to-text transcription on audio
     segments. It's optimized for Apple Silicon and supports multiple languages and quantizations.
-
-    Args:
-        model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.
-        no_speech_prob: Probability threshold for filtering out non-speech segments.
-        language: The default language for transcription.
-        temperature: Temperature for sampling. Can be a float or tuple of floats.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
-
-    Attributes:
-        _no_speech_threshold: Threshold for non-speech filtering.
-        _temperature: Temperature for sampling.
-        _settings: Dictionary containing service settings.
     """
 
     def __init__(
@@ -425,6 +423,15 @@
         temperature: float = 0.0,
         **kwargs,
     ):
+        """Initialize the MLX Whisper STT service.
+
+        Args:
+            model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.
+            no_speech_prob: Probability threshold for filtering out non-speech segments.
+            language: The default language for transcription.
+            temperature: Temperature for sampling. Can be a float or tuple of floats.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         # Skip WhisperSTTService.__init__ and call its parent directly
         SegmentedSTTService.__init__(self, **kwargs)
 
@@ -455,7 +462,10 @@
 
     @override
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """
+        """Transcribe audio data using MLX Whisper.
+
+        The audio is expected to be 16-bit signed PCM data.
+        MLX Whisper will handle the conversion internally.
 
         Args:
             audio: Raw audio bytes in 16-bit PCM format.
@@ -463,10 +473,6 @@
         Yields:
             Frame: Either a TranscriptionFrame containing the transcribed text
                 or an ErrorFrame if transcription fails.
-
-        Note:
-            The audio is expected to be 16-bit signed PCM data.
-            MLX Whisper will handle the conversion internally.
         """
         try:
             import mlx_whisper
@@ -503,7 +509,12 @@
             if text:
                 await self._handle_transcription(text, True, self._settings["language"])
                 logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    self._settings["language"],
+                )
 
         except Exception as e:
             logger.exception(f"MLX Whisper transcription error: {e}")
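Besides the docstring work, this release adds `Model.SMALL` and `Model.LARGE_V3_TURBO` to the Faster Whisper options and passes the configured language through `TranscriptionFrame`. A construction sketch using the parameters documented above; the argument values are illustrative, and the model weights download on first load:

```python
# Construction sketch; parameter names come from the docstring added above,
# the chosen values are illustrative.
from pipecat.services.whisper.stt import Model, WhisperSTTService
from pipecat.transcriptions.language import Language

stt = WhisperSTTService(
    model=Model.LARGE_V3_TURBO,  # new member: deepdml/faster-whisper-large-v3-turbo-ct2
    device="auto",               # 'cpu', 'cuda', or 'auto'
    compute_type="default",
    language=Language.EN,
)
```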