dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/ultravox/stt.py
CHANGED
@@ -44,13 +44,12 @@ except ModuleNotFoundError as e:
 class AudioBuffer:
     """Buffer to collect audio frames before processing.
 
-
-
-        started_at: Timestamp when speech started
-        is_processing: Flag to prevent concurrent processing
+    Manages the collection and state of audio frames during speech
+    recording sessions, including timing and processing flags.
     """
 
     def __init__(self):
+        """Initialize the audio buffer."""
         self.frames: List[AudioRawFrame] = []
         self.started_at: Optional[float] = None
         self.is_processing: bool = False
@@ -59,19 +58,17 @@
 class UltravoxModel:
     """Model wrapper for the Ultravox multimodal model.
 
-    This class handles loading and running the Ultravox model for speech-to-text
-
-    Args:
-        model_name: The name or path of the Ultravox model to load
-
-    Attributes:
-        model_name: The name of the loaded model
-        engine: The vLLM engine for model inference
-        tokenizer: The tokenizer for the model
-        stop_token_ids: Optional token IDs to stop generation
+    This class handles loading and running the Ultravox model for speech-to-text
+    transcription using vLLM for efficient inference.
     """
 
     def __init__(self, model_name: str = "fixie-ai/ultravox-v0_5-llama-3_1-8b"):
+        """Initialize the Ultravox model.
+
+        Args:
+            model_name: The name or path of the Ultravox model to load.
+                Defaults to "fixie-ai/ultravox-v0_5-llama-3_1-8b".
+        """
         self.model_name = model_name
         self._initialize_engine()
         self._initialize_tokenizer()
@@ -95,10 +92,10 @@
         """Format chat messages into a prompt for the model.
 
         Args:
-            messages: List of message dictionaries with 'role' and 'content'
+            messages: List of message dictionaries with 'role' and 'content'.
 
         Returns:
-            str: Formatted prompt string
+            str: Formatted prompt string ready for model input.
         """
         return self.tokenizer.apply_chat_template(
             messages, tokenize=False, add_generation_prompt=True
@@ -114,13 +111,13 @@
         """Generate text from audio input using the model.
 
         Args:
-            messages: List of message dictionaries
-            temperature: Sampling temperature
-            max_tokens: Maximum tokens to generate
-            audio: Audio data as numpy array
+            messages: List of message dictionaries for conversation context.
+            temperature: Sampling temperature for generation randomness.
+            max_tokens: Maximum number of tokens to generate.
+            audio: Audio data as numpy array in float32 format.
 
         Yields:
-            str: JSON chunks of the generated response
+            str: JSON chunks of the generated response in OpenAI format.
         """
         sampling_params = SamplingParams(
             temperature=temperature, max_tokens=max_tokens, stop_token_ids=self.stop_token_ids
@@ -173,22 +170,9 @@
 class UltravoxSTTService(AIService):
     """Service to transcribe audio using the Ultravox multimodal model.
 
-    This service collects audio frames and processes them with
-    to generate text transcriptions.
-
-    Args:
-        model_name: The Ultravox model to use (ModelSize enum or string)
-        hf_token: Hugging Face token for model access
-        temperature: Sampling temperature for generation
-        max_tokens: Maximum tokens to generate
-        **kwargs: Additional arguments passed to AIService
-
-    Attributes:
-        model: The UltravoxModel instance
-        buffer: Buffer to collect audio frames
-        temperature: Temperature for text generation
-        max_tokens: Maximum tokens to generate
-        _connection_active: Flag indicating if service is active
+    This service collects audio frames during speech and processes them with
+    Ultravox to generate text transcriptions. It handles real-time audio
+    buffering, model warm-up, and streaming text generation.
     """
 
     def __init__(
@@ -200,6 +184,17 @@
         max_tokens: int = 100,
         **kwargs,
     ):
+        """Initialize the UltravoxSTTService.
+
+        Args:
+            model_name: The Ultravox model to use. Defaults to
+                "fixie-ai/ultravox-v0_5-llama-3_1-8b".
+            hf_token: Hugging Face token for model access. If None, will try
+                to use HF_TOKEN environment variable.
+            temperature: Sampling temperature for generation. Defaults to 0.7.
+            max_tokens: Maximum tokens to generate. Defaults to 100.
+            **kwargs: Additional arguments passed to AIService.
+        """
         super().__init__(**kwargs)
 
         # Authenticate with Hugging Face if token provided
@@ -283,8 +278,11 @@
     async def start(self, frame: StartFrame):
         """Handle service start.
 
+        Starts the service, marks it as active, and performs model warm-up
+        to ensure optimal performance for the first inference.
+
         Args:
-            frame: StartFrame that triggered this method
+            frame: StartFrame that triggered this method.
         """
         await super().start(frame)
         self._connection_active = True
@@ -296,8 +294,10 @@
     async def stop(self, frame: EndFrame):
         """Handle service stop.
 
+        Stops the service and marks it as inactive.
+
         Args:
-            frame: EndFrame that triggered this method
+            frame: EndFrame that triggered this method.
         """
         await super().stop(frame)
         self._connection_active = False
@@ -306,8 +306,10 @@
    async def cancel(self, frame: CancelFrame):
         """Handle service cancellation.
 
+        Cancels the service, clears any buffered audio, and marks it as inactive.
+
         Args:
-            frame: CancelFrame that triggered this method
+            frame: CancelFrame that triggered this method.
         """
         await super().cancel(frame)
         self._connection_active = False
@@ -317,11 +319,12 @@
     async def process_frame(self, frame: Frame, direction: FrameDirection):
         """Process incoming frames.
 
-        This method collects audio frames and processes them
+        This method collects audio frames during speech and processes them
+        when speech ends to generate text transcriptions.
 
         Args:
-            frame: The frame to process
-            direction: Direction of the frame (input/output)
+            frame: The frame to process.
+            direction: Direction of the frame (input/output).
         """
         await super().process_frame(frame, direction)
 
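Taken together, the Ultravox changes move parameter documentation out of the class docstrings and into `__init__`, Google-style. A minimal construction sketch, assuming pipecat is installed with its Ultravox/vLLM dependencies; the argument names and defaults come from the docstrings added above, everything else is illustrative:

```python
# Minimal sketch; constructor arguments and defaults are taken from the
# docstrings added in this diff, the rest is illustrative.
import os

from pipecat.services.ultravox.stt import UltravoxSTTService

stt = UltravoxSTTService(
    model_name="fixie-ai/ultravox-v0_5-llama-3_1-8b",  # default per the new docstring
    hf_token=os.environ.get("HF_TOKEN"),  # falls back to the HF_TOKEN env var if None
    temperature=0.7,  # sampling temperature, default per the new docstring
    max_tokens=100,   # generation cap, default per the new docstring
)
```

Note that constructing the service loads the vLLM engine, so this is not a lightweight call.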
pipecat/services/vision_service.py
CHANGED

@@ -25,12 +25,14 @@ class VisionService(AIService):
     Provides common functionality for vision services that process images and
     generate textual responses. Handles image frame processing and integrates
     with the AI service infrastructure for metrics and lifecycle management.
-
-    Args:
-        **kwargs: Additional arguments passed to the parent AIService.
     """
 
     def __init__(self, **kwargs):
+        """Initialize the vision service.
+
+        Args:
+            **kwargs: Additional arguments passed to the parent AIService.
+        """
         super().__init__(**kwargs)
         self._describe_text = None
 
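This hunk only relocates the `**kwargs` documentation into `__init__`. For orientation, a toy subclass sketch; the abstract `run_vision` hook, `VisionImageRawFrame`, and its `size` field are assumptions based on other pipecat modules, not shown in this hunk:

```python
# Hypothetical subclass sketch; `run_vision`, `VisionImageRawFrame`, and
# `frame.size` are assumptions not confirmed by this diff.
from typing import AsyncGenerator

from pipecat.frames.frames import Frame, TextFrame, VisionImageRawFrame
from pipecat.services.vision_service import VisionService


class EchoVisionService(VisionService):
    """Toy vision service that "describes" an image by reporting its size."""

    async def run_vision(self, frame: VisionImageRawFrame) -> AsyncGenerator[Frame, None]:
        width, height = frame.size
        yield TextFrame(f"Received a {width}x{height} image")
```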
pipecat/services/websocket_service.py
CHANGED

@@ -12,6 +12,7 @@ from typing import Awaitable, Callable, Optional
 
 import websockets
 from loguru import logger
+from websockets.exceptions import ConnectionClosedOK
 from websockets.protocol import State
 
 from pipecat.frames.frames import ErrorFrame
@@ -24,13 +25,15 @@ class WebsocketService(ABC):
     Provides websocket connection management, automatic reconnection with
     exponential backoff, connection verification, and error handling.
     Subclasses implement service-specific connection and message handling logic.
-
-    Args:
-        reconnect_on_error: Whether to automatically reconnect on connection errors.
-        **kwargs: Additional arguments (unused, for compatibility).
     """
 
     def __init__(self, *, reconnect_on_error: bool = True, **kwargs):
+        """Initialize the websocket service.
+
+        Args:
+            reconnect_on_error: Whether to automatically reconnect on connection errors.
+            **kwargs: Additional arguments (unused, for compatibility).
+        """
         self._websocket: Optional[websockets.WebSocketClientProtocol] = None
         self._reconnect_on_error = reconnect_on_error
 
@@ -41,7 +44,7 @@
             True if connection is verified working, False otherwise.
         """
         try:
-            if not self._websocket or self._websocket.
+            if not self._websocket or self._websocket.state is State.CLOSED:
                 return False
             await self._websocket.ping()
             return True
@@ -80,12 +83,10 @@
             try:
                 await self._receive_messages()
                 retry_count = 0  # Reset counter on successful message receive
-
-
-
-
-                    self._websocket.close_rcvd_then_sent,
-            )
+            except ConnectionClosedOK as e:
+                # Normal closure, don't retry
+                logger.debug(f"{self} connection closed normally: {e}")
+                break
             except Exception as e:
                 message = f"{self} error receiving messages: {e}"
                 logger.error(message)
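The behavioral fix here: a websocket that closes with a normal close code (`ConnectionClosedOK`) now exits the receive loop cleanly instead of falling into the error/retry path. A standalone sketch of the reconnect-with-exponential-backoff pattern the class implements; `receive` and `connect` are illustrative callables, not pipecat's internal API:

```python
# Standalone sketch of the pattern; not pipecat's internal implementation.
import asyncio

from websockets.exceptions import ConnectionClosedOK


async def receive_loop(receive, connect, max_retries: int = 5):
    retry_count = 0
    while retry_count < max_retries:
        try:
            await receive()
            retry_count = 0  # reset after a successful receive, as in the diff
        except ConnectionClosedOK:
            break  # normal closure: stop without retrying (the new behavior)
        except Exception:
            retry_count += 1
            await asyncio.sleep(min(2**retry_count, 30))  # exponential backoff
            await connect()  # try to re-establish the connection
```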
pipecat/services/whisper/base_stt.py
CHANGED

@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Base class for Whisper-based speech-to-text services.
+
+This module provides common functionality for services implementing the Whisper API
+interface, including language mapping, metrics generation, and error handling.
+"""
+
 from typing import AsyncGenerator, Optional
 
 from loguru import logger
@@ -18,9 +24,16 @@ from pipecat.utils.tracing.service_decorators import traced_stt
 
 
 def language_to_whisper_language(language: Language) -> Optional[str]:
-    """Language
+    """Maps pipecat Language enum to Whisper API language codes.
 
+    Language support for Whisper API.
     Docs: https://platform.openai.com/docs/guides/speech-to-text#supported-languages
+
+    Args:
+        language: A Language enum value representing the input language.
+
+    Returns:
+        str or None: The corresponding Whisper language code, or None if not supported.
     """
     BASE_LANGUAGES = {
         Language.AF: "af",
@@ -98,15 +111,6 @@ class BaseWhisperSTTService(SegmentedSTTService):
 
     Provides common functionality for services implementing the Whisper API interface,
     including metrics generation and error handling.
-
-    Args:
-        model: Name of the Whisper model to use.
-        api_key: Service API key. Defaults to None.
-        base_url: Service API base URL. Defaults to None.
-        language: Language of the audio input. Defaults to English.
-        prompt: Optional text to guide the model's style or continue a previous segment.
-        temperature: Sampling temperature between 0 and 1. Defaults to 0.0.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
     """
 
     def __init__(
@@ -120,6 +124,17 @@
         temperature: Optional[float] = None,
         **kwargs,
     ):
+        """Initialize the Whisper STT service.
+
+        Args:
+            model: Name of the Whisper model to use.
+            api_key: Service API key. Defaults to None.
+            base_url: Service API base URL. Defaults to None.
+            language: Language of the audio input. Defaults to English.
+            prompt: Optional text to guide the model's style or continue a previous segment.
+            temperature: Sampling temperature between 0 and 1. Defaults to 0.0.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         super().__init__(**kwargs)
         self.set_model_name(model)
         self._client = self._create_client(api_key, base_url)
@@ -138,12 +153,30 @@
         return AsyncOpenAI(api_key=api_key, base_url=base_url)
 
     async def set_model(self, model: str):
+        """Set the model name for transcription.
+
+        Args:
+            model: The name of the model to use.
+        """
         self.set_model_name(model)
 
     def can_generate_metrics(self) -> bool:
+        """Indicates whether this service can generate metrics.
+
+        Returns:
+            bool: True, as this service supports metric generation.
+        """
         return True
 
     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert from pipecat Language to service language code.
+
+        Args:
+            language: The Language enum value to convert.
+
+        Returns:
+            str or None: The corresponding service language code, or None if not supported.
+        """
         return language_to_whisper_language(language)
 
     async def set_language(self, language: Language):
@@ -153,7 +186,7 @@
             language: The Language enum value to use for transcription.
         """
         logger.info(f"Switching STT language to: [{language}]")
-        self._language = language
+        self._language = self.language_to_service_language(language)
 
     @traced_stt
     async def _handle_transcription(
@@ -163,6 +196,15 @@
         pass
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Transcribe audio data to text.
+
+        Args:
+            audio: Raw audio data to transcribe.
+
+        Yields:
+            Frame: Either a TranscriptionFrame containing the transcribed text
+                or an ErrorFrame if transcription fails.
+        """
         try:
             await self.start_processing_metrics()
             await self.start_ttfb_metrics()
@@ -177,7 +219,11 @@
             if text:
                 await self._handle_transcription(text, True, self._language)
                 logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                )
             else:
                 logger.warning("Received empty transcription from API")
 
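The substantive fix is in `set_language`: it previously stored the `Language` enum itself in `self._language`, while the Whisper API expects a language code string; it now converts via `language_to_service_language`. A quick illustration of the mapping; `Language.AF -> "af"` appears in the hunk above, while `Language.EN -> "en"` is an assumption consistent with Whisper's documented codes:

```python
# Illustrative only; the AF mapping is shown in the diff, the EN mapping is
# an assumption based on Whisper's documented language codes.
from pipecat.services.whisper.base_stt import language_to_whisper_language
from pipecat.transcriptions.language import Language

assert language_to_whisper_language(Language.AF) == "af"
assert language_to_whisper_language(Language.EN) == "en"
```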
pipecat/services/whisper/stt.py
CHANGED
@@ -4,7 +4,11 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""
+"""Whisper speech-to-text services with locally-downloaded models.
+
+This module implements Whisper transcription using locally-downloaded models,
+supporting both Faster Whisper and MLX Whisper backends for efficient inference.
+"""
 
 import asyncio
 from enum import Enum
@@ -37,25 +41,29 @@ if TYPE_CHECKING:
 
 
 class Model(Enum):
-    """
-
-
-
-
-
-
-
-
-
-
+    """Whisper model selection options for Faster Whisper.
+
+    Provides various model sizes and specializations for speech recognition,
+    balancing quality and performance based on use case requirements.
+
+    Parameters:
+        TINY: Smallest multilingual model, fastest inference.
+        BASE: Basic multilingual model, good speed/quality balance.
+        SMALL: Small multilingual model, better speed/quality balance than BASE.
+        MEDIUM: Medium-sized multilingual model, better quality.
+        LARGE: Best quality multilingual model, slower inference.
+        LARGE_V3_TURBO: Fast multilingual model, slightly lower quality than LARGE.
+        DISTIL_LARGE_V2: Fast multilingual distilled model.
+        DISTIL_MEDIUM_EN: Fast English-only distilled model.
     """
 
     # Multilingual models
     TINY = "tiny"
     BASE = "base"
+    SMALL = "small"
     MEDIUM = "medium"
     LARGE = "large-v3"
+    LARGE_V3_TURBO = "deepdml/faster-whisper-large-v3-turbo-ct2"
     DISTIL_LARGE_V2 = "Systran/faster-distil-whisper-large-v2"
 
     # English-only models
@@ -63,16 +71,18 @@ class Model(Enum):
 
 
 class MLXModel(Enum):
-    """
-
-
-
-
-
-
-
-
+    """MLX Whisper model selection options for Apple Silicon.
+
+    Provides various model sizes optimized for Apple Silicon hardware,
+    including quantized variants for improved performance.
+
+    Parameters:
+        TINY: Smallest multilingual model for MLX.
+        MEDIUM: Medium-sized multilingual model for MLX.
+        LARGE_V3: Best quality multilingual model for MLX.
+        LARGE_V3_TURBO: Finetuned, pruned Whisper large-v3, much faster with slightly lower quality.
+        DISTIL_LARGE_V3: Fast multilingual distilled model for MLX.
+        LARGE_V3_TURBO_Q4: LARGE_V3_TURBO quantized to Q4 for reduced memory usage.
     """
 
     # Multilingual models
@@ -256,21 +266,6 @@ class WhisperSTTService(SegmentedSTTService):
 
     This service uses Faster Whisper to perform speech-to-text transcription on audio
     segments. It supports multiple languages and various model sizes.
-
-    Args:
-        model: The Whisper model to use for transcription. Can be a Model enum or string.
-        device: The device to run inference on ('cpu', 'cuda', or 'auto').
-        compute_type: The compute type for inference ('default', 'int8', 'int8_float16', etc.).
-        no_speech_prob: Probability threshold for filtering out non-speech segments.
-        language: The default language for transcription.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
-
-    Attributes:
-        _device: The device used for inference.
-        _compute_type: The compute type for inference.
-        _no_speech_prob: Threshold for non-speech filtering.
-        _model: The loaded Whisper model instance.
-        _settings: Dictionary containing service settings.
     """
 
     def __init__(
@@ -283,6 +278,16 @@
         language: Language = Language.EN,
         **kwargs,
     ):
+        """Initialize the Whisper STT service.
+
+        Args:
+            model: The Whisper model to use for transcription. Can be a Model enum or string.
+            device: The device to run inference on ('cpu', 'cuda', or 'auto').
+            compute_type: The compute type for inference ('default', 'int8', 'int8_float16', etc.).
+            no_speech_prob: Probability threshold for filtering out non-speech segments.
+            language: The default language for transcription.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         super().__init__(**kwargs)
         self._device: str = device
         self._compute_type = compute_type
@@ -355,7 +360,7 @@
         pass
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """
+        """Transcribe audio data using Whisper.
 
         Args:
             audio: Raw audio bytes in 16-bit PCM format.
@@ -394,7 +399,12 @@
             if text:
                 await self._handle_transcription(text, True, self._settings["language"])
                 logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    self._settings["language"],
+                )
 
 
 class WhisperSTTServiceMLX(WhisperSTTService):
@@ -402,18 +412,6 @@
 
     This service uses MLX Whisper to perform speech-to-text transcription on audio
     segments. It's optimized for Apple Silicon and supports multiple languages and quantizations.
-
-    Args:
-        model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.
-        no_speech_prob: Probability threshold for filtering out non-speech segments.
-        language: The default language for transcription.
-        temperature: Temperature for sampling. Can be a float or tuple of floats.
-        **kwargs: Additional arguments passed to SegmentedSTTService.
-
-    Attributes:
-        _no_speech_threshold: Threshold for non-speech filtering.
-        _temperature: Temperature for sampling.
-        _settings: Dictionary containing service settings.
     """
 
     def __init__(
@@ -425,6 +423,15 @@
         temperature: float = 0.0,
         **kwargs,
     ):
+        """Initialize the MLX Whisper STT service.
+
+        Args:
+            model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.
+            no_speech_prob: Probability threshold for filtering out non-speech segments.
+            language: The default language for transcription.
+            temperature: Temperature for sampling. Can be a float or tuple of floats.
+            **kwargs: Additional arguments passed to SegmentedSTTService.
+        """
         # Skip WhisperSTTService.__init__ and call its parent directly
         SegmentedSTTService.__init__(self, **kwargs)
 
@@ -455,7 +462,10 @@
 
     @override
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        """
+        """Transcribe audio data using MLX Whisper.
+
+        The audio is expected to be 16-bit signed PCM data.
+        MLX Whisper will handle the conversion internally.
 
         Args:
             audio: Raw audio bytes in 16-bit PCM format.
@@ -463,10 +473,6 @@
         Yields:
             Frame: Either a TranscriptionFrame containing the transcribed text
                 or an ErrorFrame if transcription fails.
-
-        Note:
-            The audio is expected to be 16-bit signed PCM data.
-            MLX Whisper will handle the conversion internally.
         """
         try:
             import mlx_whisper
@@ -503,7 +509,12 @@
             if text:
                 await self._handle_transcription(text, True, self._settings["language"])
                 logger.debug(f"Transcription: [{text}]")
-                yield TranscriptionFrame(
+                yield TranscriptionFrame(
+                    text,
+                    self._user_id,
+                    time_now_iso8601(),
+                    self._settings["language"],
+                )
 
         except Exception as e:
             logger.exception(f"MLX Whisper transcription error: {e}")
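Besides the docstring work, this release adds `Model.SMALL` and `Model.LARGE_V3_TURBO` to the Faster Whisper options and passes the configured language through `TranscriptionFrame`. A construction sketch using the parameters documented above; the argument values are illustrative, and the model weights download on first load:

```python
# Construction sketch; parameter names come from the docstring added above,
# the chosen values are illustrative.
from pipecat.services.whisper.stt import Model, WhisperSTTService
from pipecat.transcriptions.language import Language

stt = WhisperSTTService(
    model=Model.LARGE_V3_TURBO,  # new member: deepdml/faster-whisper-large-v3-turbo-ct2
    device="auto",               # 'cpu', 'cuda', or 'auto'
    compute_type="default",
    language=Language.EN,
)
```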