dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

Files changed (244)
  1. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
  2. dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
  3. pipecat/__init__.py +17 -0
  4. pipecat/adapters/base_llm_adapter.py +36 -1
  5. pipecat/adapters/schemas/direct_function.py +296 -0
  6. pipecat/adapters/schemas/function_schema.py +15 -6
  7. pipecat/adapters/schemas/tools_schema.py +55 -7
  8. pipecat/adapters/services/anthropic_adapter.py +22 -3
  9. pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
  10. pipecat/adapters/services/bedrock_adapter.py +22 -3
  11. pipecat/adapters/services/gemini_adapter.py +16 -3
  12. pipecat/adapters/services/open_ai_adapter.py +17 -2
  13. pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
  14. pipecat/audio/filters/base_audio_filter.py +30 -6
  15. pipecat/audio/filters/koala_filter.py +37 -2
  16. pipecat/audio/filters/krisp_filter.py +59 -6
  17. pipecat/audio/filters/noisereduce_filter.py +37 -0
  18. pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
  19. pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
  20. pipecat/audio/mixers/base_audio_mixer.py +30 -7
  21. pipecat/audio/mixers/soundfile_mixer.py +53 -6
  22. pipecat/audio/resamplers/base_audio_resampler.py +17 -9
  23. pipecat/audio/resamplers/resampy_resampler.py +26 -1
  24. pipecat/audio/resamplers/soxr_resampler.py +32 -1
  25. pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
  26. pipecat/audio/utils.py +194 -1
  27. pipecat/audio/vad/silero.py +60 -3
  28. pipecat/audio/vad/vad_analyzer.py +114 -30
  29. pipecat/clocks/base_clock.py +19 -0
  30. pipecat/clocks/system_clock.py +25 -0
  31. pipecat/extensions/voicemail/__init__.py +0 -0
  32. pipecat/extensions/voicemail/voicemail_detector.py +707 -0
  33. pipecat/frames/frames.py +590 -156
  34. pipecat/metrics/metrics.py +64 -1
  35. pipecat/observers/base_observer.py +58 -19
  36. pipecat/observers/loggers/debug_log_observer.py +56 -64
  37. pipecat/observers/loggers/llm_log_observer.py +8 -1
  38. pipecat/observers/loggers/transcription_log_observer.py +19 -7
  39. pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
  40. pipecat/observers/turn_tracking_observer.py +26 -1
  41. pipecat/pipeline/base_pipeline.py +5 -7
  42. pipecat/pipeline/base_task.py +52 -9
  43. pipecat/pipeline/parallel_pipeline.py +121 -177
  44. pipecat/pipeline/pipeline.py +129 -20
  45. pipecat/pipeline/runner.py +50 -1
  46. pipecat/pipeline/sync_parallel_pipeline.py +132 -32
  47. pipecat/pipeline/task.py +263 -280
  48. pipecat/pipeline/task_observer.py +85 -34
  49. pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
  50. pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
  51. pipecat/processors/aggregators/gated.py +25 -24
  52. pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
  53. pipecat/processors/aggregators/llm_response.py +398 -89
  54. pipecat/processors/aggregators/openai_llm_context.py +161 -13
  55. pipecat/processors/aggregators/sentence.py +25 -14
  56. pipecat/processors/aggregators/user_response.py +28 -3
  57. pipecat/processors/aggregators/vision_image_frame.py +24 -14
  58. pipecat/processors/async_generator.py +28 -0
  59. pipecat/processors/audio/audio_buffer_processor.py +78 -37
  60. pipecat/processors/consumer_processor.py +25 -6
  61. pipecat/processors/filters/frame_filter.py +23 -0
  62. pipecat/processors/filters/function_filter.py +30 -0
  63. pipecat/processors/filters/identity_filter.py +17 -2
  64. pipecat/processors/filters/null_filter.py +24 -1
  65. pipecat/processors/filters/stt_mute_filter.py +56 -21
  66. pipecat/processors/filters/wake_check_filter.py +46 -3
  67. pipecat/processors/filters/wake_notifier_filter.py +21 -3
  68. pipecat/processors/frame_processor.py +488 -131
  69. pipecat/processors/frameworks/langchain.py +38 -3
  70. pipecat/processors/frameworks/rtvi.py +719 -34
  71. pipecat/processors/gstreamer/pipeline_source.py +41 -0
  72. pipecat/processors/idle_frame_processor.py +26 -3
  73. pipecat/processors/logger.py +23 -0
  74. pipecat/processors/metrics/frame_processor_metrics.py +77 -4
  75. pipecat/processors/metrics/sentry.py +42 -4
  76. pipecat/processors/producer_processor.py +34 -14
  77. pipecat/processors/text_transformer.py +22 -10
  78. pipecat/processors/transcript_processor.py +48 -29
  79. pipecat/processors/user_idle_processor.py +31 -21
  80. pipecat/runner/__init__.py +1 -0
  81. pipecat/runner/daily.py +132 -0
  82. pipecat/runner/livekit.py +148 -0
  83. pipecat/runner/run.py +543 -0
  84. pipecat/runner/types.py +67 -0
  85. pipecat/runner/utils.py +515 -0
  86. pipecat/serializers/base_serializer.py +42 -0
  87. pipecat/serializers/exotel.py +17 -6
  88. pipecat/serializers/genesys.py +95 -0
  89. pipecat/serializers/livekit.py +33 -0
  90. pipecat/serializers/plivo.py +16 -15
  91. pipecat/serializers/protobuf.py +37 -1
  92. pipecat/serializers/telnyx.py +18 -17
  93. pipecat/serializers/twilio.py +32 -16
  94. pipecat/services/ai_service.py +5 -3
  95. pipecat/services/anthropic/llm.py +113 -43
  96. pipecat/services/assemblyai/models.py +63 -5
  97. pipecat/services/assemblyai/stt.py +64 -11
  98. pipecat/services/asyncai/__init__.py +0 -0
  99. pipecat/services/asyncai/tts.py +501 -0
  100. pipecat/services/aws/llm.py +185 -111
  101. pipecat/services/aws/stt.py +217 -23
  102. pipecat/services/aws/tts.py +118 -52
  103. pipecat/services/aws/utils.py +101 -5
  104. pipecat/services/aws_nova_sonic/aws.py +82 -64
  105. pipecat/services/aws_nova_sonic/context.py +15 -6
  106. pipecat/services/azure/common.py +10 -2
  107. pipecat/services/azure/image.py +32 -0
  108. pipecat/services/azure/llm.py +9 -7
  109. pipecat/services/azure/stt.py +65 -2
  110. pipecat/services/azure/tts.py +154 -23
  111. pipecat/services/cartesia/stt.py +125 -8
  112. pipecat/services/cartesia/tts.py +102 -38
  113. pipecat/services/cerebras/llm.py +15 -23
  114. pipecat/services/deepgram/stt.py +19 -11
  115. pipecat/services/deepgram/tts.py +36 -0
  116. pipecat/services/deepseek/llm.py +14 -23
  117. pipecat/services/elevenlabs/tts.py +330 -64
  118. pipecat/services/fal/image.py +43 -0
  119. pipecat/services/fal/stt.py +48 -10
  120. pipecat/services/fireworks/llm.py +14 -21
  121. pipecat/services/fish/tts.py +109 -9
  122. pipecat/services/gemini_multimodal_live/__init__.py +1 -0
  123. pipecat/services/gemini_multimodal_live/events.py +83 -2
  124. pipecat/services/gemini_multimodal_live/file_api.py +189 -0
  125. pipecat/services/gemini_multimodal_live/gemini.py +218 -21
  126. pipecat/services/gladia/config.py +17 -10
  127. pipecat/services/gladia/stt.py +82 -36
  128. pipecat/services/google/frames.py +40 -0
  129. pipecat/services/google/google.py +2 -0
  130. pipecat/services/google/image.py +39 -2
  131. pipecat/services/google/llm.py +176 -58
  132. pipecat/services/google/llm_openai.py +26 -4
  133. pipecat/services/google/llm_vertex.py +37 -15
  134. pipecat/services/google/rtvi.py +41 -0
  135. pipecat/services/google/stt.py +65 -17
  136. pipecat/services/google/test-google-chirp.py +45 -0
  137. pipecat/services/google/tts.py +390 -19
  138. pipecat/services/grok/llm.py +8 -6
  139. pipecat/services/groq/llm.py +8 -6
  140. pipecat/services/groq/stt.py +13 -9
  141. pipecat/services/groq/tts.py +40 -0
  142. pipecat/services/hamsa/__init__.py +9 -0
  143. pipecat/services/hamsa/stt.py +241 -0
  144. pipecat/services/heygen/__init__.py +5 -0
  145. pipecat/services/heygen/api.py +281 -0
  146. pipecat/services/heygen/client.py +620 -0
  147. pipecat/services/heygen/video.py +338 -0
  148. pipecat/services/image_service.py +5 -3
  149. pipecat/services/inworld/__init__.py +1 -0
  150. pipecat/services/inworld/tts.py +592 -0
  151. pipecat/services/llm_service.py +127 -45
  152. pipecat/services/lmnt/tts.py +80 -7
  153. pipecat/services/mcp_service.py +85 -44
  154. pipecat/services/mem0/memory.py +42 -13
  155. pipecat/services/minimax/tts.py +74 -15
  156. pipecat/services/mistral/__init__.py +0 -0
  157. pipecat/services/mistral/llm.py +185 -0
  158. pipecat/services/moondream/vision.py +55 -10
  159. pipecat/services/neuphonic/tts.py +275 -48
  160. pipecat/services/nim/llm.py +8 -6
  161. pipecat/services/ollama/llm.py +27 -7
  162. pipecat/services/openai/base_llm.py +54 -16
  163. pipecat/services/openai/image.py +30 -0
  164. pipecat/services/openai/llm.py +7 -5
  165. pipecat/services/openai/stt.py +13 -9
  166. pipecat/services/openai/tts.py +42 -10
  167. pipecat/services/openai_realtime_beta/azure.py +11 -9
  168. pipecat/services/openai_realtime_beta/context.py +7 -5
  169. pipecat/services/openai_realtime_beta/events.py +10 -7
  170. pipecat/services/openai_realtime_beta/openai.py +37 -18
  171. pipecat/services/openpipe/llm.py +30 -24
  172. pipecat/services/openrouter/llm.py +9 -7
  173. pipecat/services/perplexity/llm.py +15 -19
  174. pipecat/services/piper/tts.py +26 -12
  175. pipecat/services/playht/tts.py +227 -65
  176. pipecat/services/qwen/llm.py +8 -6
  177. pipecat/services/rime/tts.py +128 -17
  178. pipecat/services/riva/stt.py +160 -22
  179. pipecat/services/riva/tts.py +67 -2
  180. pipecat/services/sambanova/llm.py +19 -17
  181. pipecat/services/sambanova/stt.py +14 -8
  182. pipecat/services/sarvam/tts.py +60 -13
  183. pipecat/services/simli/video.py +82 -21
  184. pipecat/services/soniox/__init__.py +0 -0
  185. pipecat/services/soniox/stt.py +398 -0
  186. pipecat/services/speechmatics/stt.py +29 -17
  187. pipecat/services/stt_service.py +47 -11
  188. pipecat/services/tavus/video.py +94 -25
  189. pipecat/services/together/llm.py +8 -6
  190. pipecat/services/tts_service.py +77 -53
  191. pipecat/services/ultravox/stt.py +46 -43
  192. pipecat/services/vision_service.py +5 -3
  193. pipecat/services/websocket_service.py +12 -11
  194. pipecat/services/whisper/base_stt.py +58 -12
  195. pipecat/services/whisper/stt.py +69 -58
  196. pipecat/services/xtts/tts.py +59 -2
  197. pipecat/sync/base_notifier.py +19 -0
  198. pipecat/sync/event_notifier.py +24 -0
  199. pipecat/tests/utils.py +73 -5
  200. pipecat/transcriptions/language.py +24 -0
  201. pipecat/transports/base_input.py +112 -8
  202. pipecat/transports/base_output.py +235 -13
  203. pipecat/transports/base_transport.py +119 -0
  204. pipecat/transports/local/audio.py +76 -0
  205. pipecat/transports/local/tk.py +84 -0
  206. pipecat/transports/network/fastapi_websocket.py +174 -15
  207. pipecat/transports/network/small_webrtc.py +383 -39
  208. pipecat/transports/network/webrtc_connection.py +214 -8
  209. pipecat/transports/network/websocket_client.py +171 -1
  210. pipecat/transports/network/websocket_server.py +147 -9
  211. pipecat/transports/services/daily.py +792 -70
  212. pipecat/transports/services/helpers/daily_rest.py +122 -129
  213. pipecat/transports/services/livekit.py +339 -4
  214. pipecat/transports/services/tavus.py +273 -38
  215. pipecat/utils/asyncio/task_manager.py +92 -186
  216. pipecat/utils/base_object.py +83 -1
  217. pipecat/utils/network.py +2 -0
  218. pipecat/utils/string.py +114 -58
  219. pipecat/utils/text/base_text_aggregator.py +44 -13
  220. pipecat/utils/text/base_text_filter.py +46 -0
  221. pipecat/utils/text/markdown_text_filter.py +70 -14
  222. pipecat/utils/text/pattern_pair_aggregator.py +18 -14
  223. pipecat/utils/text/simple_text_aggregator.py +43 -2
  224. pipecat/utils/text/skip_tags_aggregator.py +21 -13
  225. pipecat/utils/time.py +36 -0
  226. pipecat/utils/tracing/class_decorators.py +32 -7
  227. pipecat/utils/tracing/conversation_context_provider.py +12 -2
  228. pipecat/utils/tracing/service_attributes.py +80 -64
  229. pipecat/utils/tracing/service_decorators.py +48 -21
  230. pipecat/utils/tracing/setup.py +13 -7
  231. pipecat/utils/tracing/turn_context_provider.py +12 -2
  232. pipecat/utils/tracing/turn_trace_observer.py +27 -0
  233. pipecat/utils/utils.py +14 -14
  234. dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
  235. pipecat/examples/daily_runner.py +0 -64
  236. pipecat/examples/run.py +0 -265
  237. pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
  238. pipecat/utils/asyncio/watchdog_event.py +0 -42
  239. pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
  240. pipecat/utils/asyncio/watchdog_queue.py +0 -48
  241. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
  242. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
  243. {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
  244. /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/aws/utils.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""AWS Transcribe utility functions and classes for WebSocket streaming.
+
+This module provides utilities for creating presigned URLs, building event messages,
+and handling AWS event stream protocol for real-time transcription services.
+"""
+
 import binascii
 import datetime
 import hashlib
@@ -29,7 +35,29 @@ def get_presigned_url(
     show_speaker_label: bool = False,
     enable_channel_identification: bool = False,
 ) -> str:
-    """Create a presigned URL for AWS Transcribe streaming."""
+    """Create a presigned URL for AWS Transcribe streaming.
+
+    Args:
+        region: AWS region for the service.
+        credentials: Dictionary containing AWS credentials. Must include
+            'access_key' and 'secret_key', with optional 'session_token'.
+        language_code: Language code for transcription (e.g., "en-US").
+        media_encoding: Audio encoding format. Defaults to "pcm".
+        sample_rate: Audio sample rate in Hz. Defaults to 16000.
+        number_of_channels: Number of audio channels. Defaults to 1.
+        enable_partial_results_stabilization: Whether to enable partial result stabilization.
+        partial_results_stability: Stability level for partial results.
+        vocabulary_name: Custom vocabulary name to use.
+        vocabulary_filter_name: Vocabulary filter name to apply.
+        show_speaker_label: Whether to include speaker labels.
+        enable_channel_identification: Whether to enable channel identification.
+
+    Returns:
+        Presigned WebSocket URL for AWS Transcribe streaming.
+
+    Raises:
+        ValueError: If required AWS credentials are missing.
+    """
     access_key = credentials.get("access_key")
     secret_key = credentials.get("secret_key")
     session_token = credentials.get("session_token")
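Taken together, the documented parameters make the call contract clear; a minimal usage sketch follows (hedged: the module path is inferred from pipecat/services/aws/utils.py in the file list above, and all credential values are placeholders):

from pipecat.services.aws.utils import get_presigned_url

url = get_presigned_url(
    region="us-east-1",
    credentials={
        "access_key": "AKIA...",  # required; a missing key raises ValueError
        "secret_key": "...",      # required
        "session_token": None,    # optional, for temporary credentials
    },
    language_code="en-US",
    sample_rate=16000,  # default per the docstring
)
# url is a presigned wss:// URL that a WebSocket client can open directly.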
@@ -58,9 +86,23 @@ def get_presigned_url(
 
 
 class AWSTranscribePresignedURL:
+    """Generator for AWS Transcribe presigned WebSocket URLs.
+
+    Handles AWS Signature Version 4 signing process to create authenticated
+    WebSocket URLs for streaming transcription requests.
+    """
+
     def __init__(
         self, access_key: str, secret_key: str, session_token: str, region: str = "us-east-1"
     ):
+        """Initialize the presigned URL generator.
+
+        Args:
+            access_key: AWS access key ID.
+            secret_key: AWS secret access key.
+            session_token: AWS session token for temporary credentials.
+            region: AWS region for the service. Defaults to "us-east-1".
+        """
         self.access_key = access_key
         self.secret_key = secret_key
         self.session_token = session_token
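The class form wraps the same Signature Version 4 flow for reuse across requests; a construction sketch using only the arguments shown above (values are placeholders):

signer = AWSTranscribePresignedURL(
    access_key="AKIA...",
    secret_key="...",
    session_token="",    # supply an STS token when using temporary credentials
    region="us-east-1",  # matches the documented default
)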
@@ -96,6 +138,23 @@ class AWSTranscribePresignedURL:
         enable_partial_results_stabilization: bool = False,
         partial_results_stability: str = "",
     ) -> str:
+        """Generate a presigned WebSocket URL for AWS Transcribe.
+
+        Args:
+            sample_rate: Audio sample rate in Hz.
+            language_code: Language code for transcription.
+            media_encoding: Audio encoding format.
+            vocabulary_name: Custom vocabulary name.
+            vocabulary_filter_name: Vocabulary filter name.
+            show_speaker_label: Whether to include speaker labels.
+            enable_channel_identification: Whether to enable channel identification.
+            number_of_channels: Number of audio channels.
+            enable_partial_results_stabilization: Whether to enable partial result stabilization.
+            partial_results_stability: Stability level for partial results.
+
+        Returns:
+            Presigned WebSocket URL with authentication parameters.
+        """
         self.endpoint = f"wss://transcribestreaming.{self.region}.amazonaws.com:8443"
         self.host = f"transcribestreaming.{self.region}.amazonaws.com:8443"
 
@@ -172,7 +231,15 @@ class AWSTranscribePresignedURL:
 
 
 def get_headers(header_name: str, header_value: str) -> bytearray:
-    """Build a header following AWS event stream format."""
+    """Build a header following AWS event stream format.
+
+    Args:
+        header_name: Name of the header.
+        header_value: Value of the header.
+
+    Returns:
+        Encoded header as a bytearray following AWS event stream protocol.
+    """
     name = header_name.encode("utf-8")
     name_byte_length = bytes([len(name)])
     value_type = bytes([7])  # 7 represents a string
@@ -190,9 +257,21 @@ def get_headers(header_name: str, header_value: str) -> bytearray:
 
 
 def build_event_message(payload: bytes) -> bytes:
-    """
-    Build an event message for AWS Transcribe streaming.
-    Matches AWS sample: https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
+    """Build an event message for AWS Transcribe streaming.
+
+    Creates a properly formatted AWS event stream message containing audio data
+    for real-time transcription. Follows the AWS event stream protocol with
+    prelude, headers, payload, and CRC checksums.
+
+    Args:
+        payload: Raw audio bytes to include in the event message.
+
+    Returns:
+        Complete event message as bytes, ready to send via WebSocket.
+
+    Note:
+        Implementation matches AWS sample:
+        https://github.com/aws-samples/amazon-transcribe-streaming-python-websockets/blob/main/eventstream.py
     """
     # Build headers
     content_type_header = get_headers(":content-type", "application/octet-stream")
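For orientation, the AWS event stream framing that get_headers and build_event_message implement can be sketched independently. Only the :content-type header appears in the hunk above; the :event-type and :message-type headers come from the linked AWS sample and should be treated as assumptions here:

import binascii
import struct


def encode_header(name: str, value: str) -> bytes:
    # [name length: 1 byte][name][value type 7 = string][value length: 2 bytes][value]
    n = name.encode("utf-8")
    v = value.encode("utf-8")
    return bytes([len(n)]) + n + bytes([7]) + struct.pack(">H", len(v)) + v


def encode_event(payload: bytes) -> bytes:
    headers = (
        encode_header(":content-type", "application/octet-stream")
        + encode_header(":event-type", "AudioEvent")  # assumed, per AWS sample
        + encode_header(":message-type", "event")  # assumed, per AWS sample
    )
    # Prelude: total message length and headers length, both big-endian uint32.
    # Total = prelude (8) + prelude CRC (4) + headers + payload + message CRC (4).
    total_length = 16 + len(headers) + len(payload)
    prelude = struct.pack(">II", total_length, len(headers))
    prelude_crc = struct.pack(">I", binascii.crc32(prelude) & 0xFFFFFFFF)
    message = prelude + prelude_crc + headers + payload
    return message + struct.pack(">I", binascii.crc32(message) & 0xFFFFFFFF)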
@@ -235,6 +314,23 @@ def build_event_message(payload: bytes) -> bytes:
 
 
 def decode_event(message):
+    """Decode an AWS event stream message.
+
+    Parses an AWS event stream message to extract headers and payload,
+    verifying CRC checksums for data integrity.
+
+    Args:
+        message: Raw event stream message bytes received from AWS.
+
+    Returns:
+        A tuple of (headers, payload) where:
+
+        - headers: Dictionary of parsed headers
+        - payload: Dictionary of parsed JSON payload
+
+    Raises:
+        AssertionError: If CRC checksum verification fails.
+    """
     # Extract the prelude, headers, payload and CRC
     prelude = message[:8]
     total_length, headers_length = struct.unpack(">II", prelude)
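The decode path reverses that framing. A simplified sketch consistent with the docstring above (it assumes all header values are string-typed, which matches what get_headers produces):

import binascii
import json
import struct


def decode_event_sketch(message: bytes):
    total_length, headers_length = struct.unpack(">II", message[:8])
    (prelude_crc,) = struct.unpack(">I", message[8:12])
    assert prelude_crc == binascii.crc32(message[:8]) & 0xFFFFFFFF

    headers_bytes = message[12 : 12 + headers_length]
    payload = message[12 + headers_length : total_length - 4]  # trailing 4 bytes are the message CRC

    headers, pos = {}, 0
    while pos < len(headers_bytes):
        name_len = headers_bytes[pos]
        pos += 1
        name = headers_bytes[pos : pos + name_len].decode("utf-8")
        pos += name_len + 1  # skip the value-type byte (7 = string)
        (value_len,) = struct.unpack(">H", headers_bytes[pos : pos + 2])
        pos += 2
        headers[name] = headers_bytes[pos : pos + value_len].decode("utf-8")
        pos += value_len

    return headers, json.loads(payload)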
pipecat/services/aws_nova_sonic/aws.py
@@ -95,7 +95,13 @@ class AWSNovaSonicUnhandledFunctionException(Exception):
 
 
 class ContentType(Enum):
-    """Content types supported by AWS Nova Sonic."""
+    """Content types supported by AWS Nova Sonic.
+
+    Parameters:
+        AUDIO: Audio content type.
+        TEXT: Text content type.
+        TOOL: Tool content type.
+    """
 
     AUDIO = "AUDIO"
     TEXT = "TEXT"
@@ -103,7 +109,12 @@ class ContentType(Enum):
 
 
 class TextStage(Enum):
-    """Text generation stages in AWS Nova Sonic responses."""
+    """Text generation stages in AWS Nova Sonic responses.
+
+    Parameters:
+        FINAL: Final text that has been fully generated.
+        SPECULATIVE: Speculative text that is still being generated.
+    """
 
     FINAL = "FINAL"  # what has been said
     SPECULATIVE = "SPECULATIVE"  # what's planned to be said
@@ -126,6 +137,7 @@ class CurrentContent:
     text_content: str  # starts as None, then fills in if text
 
     def __str__(self):
+        """String representation of the current content."""
         return (
             f"CurrentContent(\n"
             f"  type={self.type.name},\n"
@@ -138,7 +150,7 @@ class CurrentContent:
 class Params(BaseModel):
     """Configuration parameters for AWS Nova Sonic.
 
-    Attributes:
+    Parameters:
         input_sample_rate: Audio input sample rate in Hz.
         input_sample_size: Audio input sample size in bits.
         input_channel_count: Number of input audio channels.
@@ -171,18 +183,6 @@ class AWSNovaSonicLLMService(LLMService):
 
     Provides bidirectional audio streaming, real-time transcription, text generation,
     and function calling capabilities using AWS Nova Sonic model.
-
-    Args:
-        secret_access_key: AWS secret access key for authentication.
-        access_key_id: AWS access key ID for authentication.
-        region: AWS region where the service is hosted.
-        model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
-        voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
-        params: Model parameters for audio configuration and inference.
-        system_instruction: System-level instruction for the model.
-        tools: Available tools/functions for the model to use.
-        send_transcription_frames: Whether to emit transcription frames.
-        **kwargs: Additional arguments passed to the parent LLMService.
     """
 
     # Override the default adapter to use the AWSNovaSonicLLMAdapter one
@@ -193,6 +193,7 @@ class AWSNovaSonicLLMService(LLMService):
         *,
         secret_access_key: str,
         access_key_id: str,
+        session_token: Optional[str] = None,
         region: str,
         model: str = "amazon.nova-sonic-v1:0",
         voice_id: str = "matthew",  # matthew, tiffany, amy
@@ -202,9 +203,25 @@ class AWSNovaSonicLLMService(LLMService):
         send_transcription_frames: bool = True,
         **kwargs,
     ):
+        """Initializes the AWS Nova Sonic LLM service.
+
+        Args:
+            secret_access_key: AWS secret access key for authentication.
+            access_key_id: AWS access key ID for authentication.
+            session_token: AWS session token for authentication.
+            region: AWS region where the service is hosted.
+            model: Model identifier. Defaults to "amazon.nova-sonic-v1:0".
+            voice_id: Voice ID for speech synthesis. Options: matthew, tiffany, amy.
+            params: Model parameters for audio configuration and inference.
+            system_instruction: System-level instruction for the model.
+            tools: Available tools/functions for the model to use.
+            send_transcription_frames: Whether to emit transcription frames.
+            **kwargs: Additional arguments passed to the parent LLMService.
+        """
         super().__init__(**kwargs)
         self._secret_access_key = secret_access_key
         self._access_key_id = access_key_id
+        self._session_token = session_token
         self._region = region
         self._model = model
         self._client: Optional[BedrockRuntimeClient] = None
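With session_token now threaded through to the credentials identity, the service can run on temporary STS credentials; a construction sketch limited to the arguments documented above (all values are placeholders):

from pipecat.services.aws_nova_sonic.aws import AWSNovaSonicLLMService

llm = AWSNovaSonicLLMService(
    secret_access_key="...",
    access_key_id="AKIA...",
    session_token="...",  # optional; omit for long-lived credentials
    region="us-east-1",
    voice_id="matthew",  # matthew, tiffany, amy
)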
@@ -456,7 +473,6 @@ class AWSNovaSonicLLMService(LLMService):
         # If we need to, send assistant response trigger (depends on self._connected_time)
         if self._triggering_assistant_response:
             await self._send_assistant_response_trigger()
-            self._triggering_assistant_response = False
 
     async def _disconnect(self):
         try:
@@ -508,7 +524,9 @@ class AWSNovaSonicLLMService(LLMService):
             region=self._region,
             aws_credentials_identity_resolver=StaticCredentialsResolver(
                 credentials=AWSCredentialsIdentity(
-                    access_key_id=self._access_key_id, secret_access_key=self._secret_access_key
+                    access_key_id=self._access_key_id,
+                    secret_access_key=self._secret_access_key,
+                    session_token=self._session_token,
                 )
             ),
             http_auth_scheme_resolver=HTTPAuthSchemeResolver(),
@@ -776,9 +794,7 @@ class AWSNovaSonicLLMService(LLMService):
         try:
             while self._stream and not self._disconnecting:
                 output = await self._stream.await_output()
-                result = await asyncio.wait_for(output[1].receive(), timeout=1.0)
-
-                self.reset_watchdog()
+                result = await output[1].receive()
 
                 if result.value and result.value.bytes_:
                     response_data = result.value.bytes_.decode("utf-8")
@@ -807,8 +823,6 @@ class AWSNovaSonicLLMService(LLMService):
                 elif "completionEnd" in event_json:
                     # Handle the LLM completion ending
                     await self._handle_completion_end_event(event_json)
-        except asyncio.TimeoutError:
-            self.reset_watchdog()
         except Exception as e:
             logger.error(f"{self} error processing responses: {e}")
             if self._wants_connection:
@@ -1089,7 +1103,6 @@ class AWSNovaSonicLLMService(LLMService):
         # Send the trigger audio, if we're fully connected and set up
         if self._connected_time is not None:
             await self._send_assistant_response_trigger()
-            self._triggering_assistant_response = False
 
     async def _send_assistant_response_trigger(self):
         if (
@@ -1097,46 +1110,51 @@ class AWSNovaSonicLLMService(LLMService):
         ):  # should never happen
             return
 
-        logger.debug("Sending assistant response trigger...")
-
-        chunk_duration = 0.02  # what we might get from InputAudioRawFrame
-        chunk_size = int(
-            chunk_duration
-            * self._params.input_sample_rate
-            * self._params.input_channel_count
-            * (self._params.input_sample_size / 8)
-        )  # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
-
-        # Lead with a bit of blank audio, if needed.
-        # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
-        # connection.
-        current_time = time.time()
-        max_blank_audio_duration = 0.5
-        blank_audio_duration = (
-            max_blank_audio_duration - (current_time - self._connected_time)
-            if self._connected_time is not None
-            and (current_time - self._connected_time) < max_blank_audio_duration
-            else None
-        )
-        if blank_audio_duration:
-            logger.debug(
-                f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
+        try:
+            logger.debug("Sending assistant response trigger...")
+
+            chunk_duration = 0.02  # what we might get from InputAudioRawFrame
+            chunk_size = int(
+                chunk_duration
+                * self._params.input_sample_rate
+                * self._params.input_channel_count
+                * (self._params.input_sample_size / 8)
+            )  # e.g. 0.02 seconds of 16-bit (2-byte) PCM mono audio at 16kHz is 640 bytes
+
+            # Lead with a bit of blank audio, if needed.
+            # It seems like the LLM can't quite "hear" the first little bit of audio sent on a
+            # connection.
+            current_time = time.time()
+            max_blank_audio_duration = 0.5
+            blank_audio_duration = (
+                max_blank_audio_duration - (current_time - self._connected_time)
+                if self._connected_time is not None
+                and (current_time - self._connected_time) < max_blank_audio_duration
+                else None
             )
-            blank_audio_chunk = b"\x00" * chunk_size
-            num_chunks = int(blank_audio_duration / chunk_duration)
-            for _ in range(num_chunks):
-                await self._send_user_audio_event(blank_audio_chunk)
+            if blank_audio_duration:
+                logger.debug(
+                    f"Leading assistant response trigger with {blank_audio_duration}s of blank audio"
+                )
+                blank_audio_chunk = b"\x00" * chunk_size
+                num_chunks = int(blank_audio_duration / chunk_duration)
+                for _ in range(num_chunks):
+                    await self._send_user_audio_event(blank_audio_chunk)
+                    await asyncio.sleep(chunk_duration)
+
+            # Send trigger audio
+            # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
+            # if we ever need to seed this service again with context it would make sense to include it
+            # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
+            # context as well.
+            audio_chunks = [
+                self._assistant_response_trigger_audio[i : i + chunk_size]
+                for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
+            ]
+            for chunk in audio_chunks:
+                await self._send_user_audio_event(chunk)
                 await asyncio.sleep(chunk_duration)
-
-        # Send trigger audio
-        # NOTE: this audio *will* be transcribed and eventually make it into the context. That's OK:
-        # if we ever need to seed this service again with context it would make sense to include it
-        # since the instruction (i.e. the "wait for the trigger" instruction) will be part of the
-        # context as well.
-        audio_chunks = [
-            self._assistant_response_trigger_audio[i : i + chunk_size]
-            for i in range(0, len(self._assistant_response_trigger_audio), chunk_size)
-        ]
-        for chunk in audio_chunks:
-            await self._send_user_audio_event(chunk)
-            await asyncio.sleep(chunk_duration)
+        finally:
+            # We need to clean up in case sending the trigger was cancelled, e.g. in the case of a user interruption.
+            # (An asyncio.CancelledError would be raised in that case.)
+            self._triggering_assistant_response = False
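The try/finally wrapper matters because every await in the trigger send is a cancellation point: on a user interruption the task is cancelled and asyncio raises CancelledError at the pending await, so only a finally block reliably resets the flag. A generic sketch of the pattern (names are illustrative, not from the source):

import asyncio


async def send_with_cleanup(state: dict):
    state["sending"] = True
    try:
        for _ in range(100):
            await asyncio.sleep(0.02)  # cancellation can land on any await
    finally:
        state["sending"] = False  # runs on success, on error, and on CancelledError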
pipecat/services/aws_nova_sonic/context.py
@@ -41,7 +41,14 @@ from pipecat.services.openai.llm import (
 
 
 class Role(Enum):
-    """Roles supported in AWS Nova Sonic conversations."""
+    """Roles supported in AWS Nova Sonic conversations.
+
+    Parameters:
+        SYSTEM: System-level messages (not used in conversation history).
+        USER: Messages sent by the user.
+        ASSISTANT: Messages sent by the assistant.
+        TOOL: Messages sent by tools (not used in conversation history).
+    """
 
     SYSTEM = "SYSTEM"
     USER = "USER"
@@ -80,14 +87,16 @@ class AWSNovaSonicLLMContext(OpenAILLMContext):
 
     Extends OpenAI context with Nova Sonic-specific message handling,
     conversation history management, and text buffering capabilities.
-
-    Args:
-        messages: Initial messages for the context.
-        tools: Available tools for the context.
-        **kwargs: Additional arguments passed to parent class.
     """
 
     def __init__(self, messages=None, tools=None, **kwargs):
+        """Initialize AWS Nova Sonic LLM context.
+
+        Args:
+            messages: Initial messages for the context.
+            tools: Available tools for the context.
+            **kwargs: Additional arguments passed to parent class.
+        """
         super().__init__(messages=messages, tools=tools, **kwargs)
         self.__setup_local()
 
pipecat/services/azure/common.py
@@ -4,14 +4,22 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-from typing import Optional
+"""Language conversion utilities for Azure services."""
 
-from loguru import logger
+from typing import Optional
 
 from pipecat.transcriptions.language import Language
 
 
 def language_to_azure_language(language: Language) -> Optional[str]:
+    """Convert a Language enum to Azure language code.
+
+    Args:
+        language: The Language enum value to convert.
+
+    Returns:
+        The corresponding Azure language code, or None if not supported.
+    """
     language_map = {
        # Afrikaans
        Language.AF: "af-ZA",
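Usage is a plain dictionary lookup; using the first mapping entry shown above:

from pipecat.services.azure.common import language_to_azure_language
from pipecat.transcriptions.language import Language

assert language_to_azure_language(Language.AF) == "af-ZA"
# Unsupported languages return None rather than raising.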
pipecat/services/azure/image.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Azure OpenAI image generation service implementation.
+
+This module provides integration with Azure's OpenAI image generation API
+using REST endpoints for creating images from text prompts.
+"""
+
 import asyncio
 import io
 from typing import AsyncGenerator
@@ -17,6 +23,13 @@ from pipecat.services.image_service import ImageGenService
 
 
 class AzureImageGenServiceREST(ImageGenService):
+    """Azure OpenAI REST-based image generation service.
+
+    Provides image generation using Azure's OpenAI service via REST API.
+    Supports asynchronous image generation with polling for completion
+    and automatic image download and processing.
+    """
+
     def __init__(
         self,
         *,
@@ -27,6 +40,16 @@ class AzureImageGenServiceREST(ImageGenService):
         aiohttp_session: aiohttp.ClientSession,
         api_version="2023-06-01-preview",
     ):
+        """Initialize the AzureImageGenServiceREST.
+
+        Args:
+            image_size: Size specification for generated images (e.g., "1024x1024").
+            api_key: Azure OpenAI API key for authentication.
+            endpoint: Azure OpenAI endpoint URL.
+            model: The image generation model to use.
+            aiohttp_session: Shared aiohttp session for HTTP requests.
+            api_version: Azure API version string. Defaults to "2023-06-01-preview".
+        """
         super().__init__()
 
         self._api_key = api_key
@@ -37,6 +60,15 @@ class AzureImageGenServiceREST(ImageGenService):
         self._aiohttp_session = aiohttp_session
 
     async def run_image_gen(self, prompt: str) -> AsyncGenerator[Frame, None]:
+        """Generate an image from a text prompt using Azure OpenAI.
+
+        Args:
+            prompt: The text prompt describing the desired image.
+
+        Yields:
+            URLImageRawFrame containing the generated image data, or
+            ErrorFrame if generation fails.
+        """
         url = f"{self._azure_endpoint}openai/images/generations:submit?api-version={self._api_version}"
 
         headers = {"api-key": self._api_key, "Content-Type": "application/json"}
pipecat/services/azure/llm.py
@@ -17,13 +17,6 @@ class AzureLLMService(OpenAILLMService):
     This service extends OpenAILLMService to connect to Azure's OpenAI endpoint while
     maintaining full compatibility with OpenAI's interface and functionality.
-
-    Args:
-        api_key: The API key for accessing Azure OpenAI.
-        endpoint: The Azure endpoint URL.
-        model: The model identifier to use.
-        api_version: Azure API version. Defaults to "2024-09-01-preview".
-        **kwargs: Additional keyword arguments passed to OpenAILLMService.
     """
 
     def __init__(
         self,
@@ -35,6 +28,15 @@ class AzureLLMService(OpenAILLMService):
         api_version: str = "2024-09-01-preview",
         **kwargs,
     ):
+        """Initialize the Azure LLM service.
+
+        Args:
+            api_key: The API key for accessing Azure OpenAI.
+            endpoint: The Azure endpoint URL.
+            model: The model identifier to use.
+            api_version: Azure API version. Defaults to "2024-09-01-preview".
+            **kwargs: Additional keyword arguments passed to OpenAILLMService.
+        """
         # Initialize variables before calling parent __init__() because that
         # will call create_client() and we need those values there.
         self._endpoint = endpoint
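Since the constructor only adds Azure-specific wiring on top of OpenAILLMService, basic usage reduces to the documented arguments (values are placeholders):

from pipecat.services.azure.llm import AzureLLMService

llm = AzureLLMService(
    api_key="...",
    endpoint="https://my-resource.openai.azure.com/",
    model="gpt-4o",  # placeholder deployment name
    # api_version defaults to "2024-09-01-preview"
)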
pipecat/services/azure/stt.py
@@ -4,6 +4,12 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
+"""Azure Speech-to-Text service implementation for Pipecat.
+
+This module provides speech-to-text functionality using Azure Cognitive Services
+Speech SDK for real-time audio transcription.
+"""
+
 import asyncio
 from typing import AsyncGenerator, List, Optional  # Add List
 
@@ -42,6 +48,13 @@ except ModuleNotFoundError as e:
 
 
 class AzureSTTService(STTService):
+    """Azure Speech-to-Text service for real-time audio transcription.
+
+    This service uses Azure Cognitive Services Speech SDK to convert speech
+    audio into text transcriptions. It supports continuous recognition and
+    provides real-time transcription results with timing information.
+    """
+
     def __init__(
         self,
         *,
@@ -50,8 +63,19 @@ class AzureSTTService(STTService):
         language: Language = Language.EN_US,
         additional_languages: list[Language] = None,
         sample_rate: Optional[int] = None,
+        endpoint_id: Optional[str] = None,
         **kwargs,
     ):
+        """Initialize the Azure STT service.
+
+        Args:
+            api_key: Azure Cognitive Services subscription key.
+            region: Azure region for the Speech service (e.g., 'eastus').
+            language: Language for speech recognition. Defaults to English (US).
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            endpoint_id: Custom model endpoint id.
+            **kwargs: Additional arguments passed to parent STTService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         self._vocab: Optional[List[str]] = kwargs.pop("vocab", None)  # Get vocab from kwargs
 
@@ -65,6 +89,8 @@ class AzureSTTService(STTService):
         self._speech_config.set_property(PropertyId.Speech_SegmentationSilenceTimeoutMs, "400")
         self._primary_language = language
         self._additional_languages = additional_languages
+        if endpoint_id:
+            self._speech_config.endpoint_id = endpoint_id
 
         self._audio_stream = None
         self._speech_recognizer = None
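The new endpoint_id parameter routes recognition through a custom speech model; a construction sketch with placeholder values:

from pipecat.services.azure.stt import AzureSTTService

stt = AzureSTTService(
    api_key="...",
    region="eastus",
    endpoint_id="my-custom-model-endpoint",  # optional; omit to use the base model
)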
@@ -75,10 +101,25 @@ class AzureSTTService(STTService):
         }
 
     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate performance metrics.
+
+        Returns:
+            True as this service supports metrics generation.
+        """
         return True
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
-        # Entry point for streaming audio to Azure STT and yielding transcription frames
+        """Process audio data for speech-to-text conversion.
+
+        Feeds audio data to the Azure speech recognizer for processing.
+        Recognition results are handled asynchronously through callbacks.
+
+        Args:
+            audio: Raw audio bytes to process.
+
+        Yields:
+            None - actual transcription frames are pushed via callbacks.
+        """
         await self.start_processing_metrics()
         await self.start_ttfb_metrics()
         if self._audio_stream:
@@ -87,6 +128,14 @@ class AzureSTTService(STTService):
         yield None
 
     async def start(self, frame: StartFrame):
+        """Start the speech recognition service.
+
+        Initializes the Azure speech recognizer with audio stream configuration
+        and begins continuous speech recognition.
+
+        Args:
+            frame: Frame indicating the start of processing.
+        """
         await super().start(frame)
 
         if self._audio_stream:
@@ -139,6 +188,13 @@ class AzureSTTService(STTService):
         self._speech_recognizer.start_continuous_recognition_async()
 
     async def stop(self, frame: EndFrame):
+        """Stop the speech recognition service.
+
+        Cleanly shuts down the Azure speech recognizer and closes audio streams.
+
+        Args:
+            frame: Frame indicating the end of processing.
+        """
         await super().stop(frame)
 
         if self._speech_recognizer:
@@ -150,6 +206,13 @@ class AzureSTTService(STTService):
         self._audio_stream.close()
 
     async def cancel(self, frame: CancelFrame):
+        """Cancel the speech recognition service.
+
+        Immediately stops recognition and closes resources.
+
+        Args:
+            frame: Frame indicating cancellation.
+        """
         await super().cancel(frame)
 
         if self._speech_recognizer:
@@ -175,7 +238,7 @@ class AzureSTTService(STTService):
         language = getattr(event.result, "language", None) or self._settings.get("language")
         frame = TranscriptionFrame(
             event.result.text,
-            "",
+            self._user_id,
             time_now_iso8601(),
             language,
             result=event,