dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/azure/tts.py
CHANGED
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Azure Cognitive Services Text-to-Speech service implementations."""
+
 import asyncio
 from typing import AsyncGenerator, Optional

@@ -21,8 +23,8 @@ from pipecat.frames.frames import (
 from pipecat.services.azure.common import language_to_azure_language
 from pipecat.services.tts_service import TTSService
 from pipecat.transcriptions.language import Language
-from pipecat.utils.utils import detect_language_from_script
 from pipecat.utils.tracing.service_decorators import traced_tts
+from pipecat.utils.utils import detect_language_from_script

 try:
     from azure.cognitiveservices.speech import (
@@ -40,6 +42,15 @@ except ModuleNotFoundError as e:


 def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:
+    """Convert sample rate to Azure speech synthesis output format.
+
+    Args:
+        sample_rate: Sample rate in Hz.
+
+    Returns:
+        Corresponding Azure SpeechSynthesisOutputFormat enum value.
+        Defaults to Raw24Khz16BitMonoPcm if sample rate not found.
+    """
     sample_rate_map = {
         8000: SpeechSynthesisOutputFormat.Raw8Khz16BitMonoPcm,
         16000: SpeechSynthesisOutputFormat.Raw16Khz16BitMonoPcm,
@@ -52,7 +63,36 @@ def sample_rate_to_output_format(sample_rate: int) -> SpeechSynthesisOutputFormat:


 class AzureBaseTTSService(TTSService):
+    """Base class for Azure Cognitive Services text-to-speech implementations.
+
+    Provides common functionality for Azure TTS services including SSML
+    construction, voice configuration, and parameter management.
+    """
+
+    # Define SSML escape mappings based on SSML reserved characters
+    # See - https://learn.microsoft.com/en-us/azure/ai-services/speech-service/speech-synthesis-markup-structure
+    SSML_ESCAPE_CHARS = {
+        "&": "&amp;",
+        "<": "&lt;",
+        ">": "&gt;",
+        '"': "&quot;",
+        "'": "&apos;",
+    }
+
     class InputParams(BaseModel):
+        """Input parameters for Azure TTS voice configuration.
+
+        Parameters:
+            emphasis: Emphasis level for speech ("strong", "moderate", "reduced").
+            language: Language for synthesis. Defaults to English (US).
+            pitch: Voice pitch adjustment (e.g., "+10%", "-5Hz", "high").
+            rate: Speech rate multiplier. Defaults to "1.05".
+            role: Voice role for expression (e.g., "YoungAdultFemale").
+            style: Speaking style (e.g., "cheerful", "sad", "excited").
+            style_degree: Intensity of the speaking style (0.01 to 2.0).
+            volume: Volume level (e.g., "+20%", "loud", "x-soft").
+        """
+
         emphasis: Optional[str] = None
         language: Optional[Language] = Language.EN_US
         pitch: Optional[str] = None
@@ -75,6 +115,16 @@ class AzureBaseTTSService(TTSService):
         params: Optional[InputParams] = None,
         **kwargs,
     ):
+        """Initialize the Azure TTS service with configuration parameters.
+
+        Args:
+            api_key: Azure Cognitive Services subscription key.
+            region: Azure region identifier (e.g., "eastus", "westus2").
+            voice: Voice name to use for synthesis. Defaults to "en-US-SaraNeural".
+            sample_rate: Audio sample rate in Hz. If None, uses service default.
+            params: Voice and synthesis parameters configuration.
+            **kwargs: Additional arguments passed to parent TTSService.
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)

         params = params or AzureBaseTTSService.InputParams()
@@ -138,9 +188,22 @@ class AzureBaseTTSService(TTSService):
         logger.debug(f"Final additional language map: {self._additional_lang_map}")

     def can_generate_metrics(self) -> bool:
+        """Check if this service can generate processing metrics.
+
+        Returns:
+            True, as Azure TTS service supports metrics generation.
+        """
         return True

     def language_to_service_language(self, language: Language) -> Optional[str]:
+        """Convert a Language enum to Azure language format.
+
+        Args:
+            language: The language to convert.
+
+        Returns:
+            The Azure-specific language code, or None if not supported.
+        """
         return language_to_azure_language(language)

     def _construct_ssml(self, text: str) -> str:
@@ -162,6 +225,10 @@ class AzureBaseTTSService(TTSService):
         )

         # 3. Construct SSML with the selected language and voice
+
+        # Escape special characters
+        escaped_text = self._escape_text(text)
+
         ssml = (
             f"<speak version='1.0' xml:lang='{target_language}' "
             "xmlns='http://www.w3.org/2001/10/synthesis' "
@@ -193,10 +260,10 @@ class AzureBaseTTSService(TTSService):

         if "Multilingual" in target_voice:
             ssml += f"<lang xml:lang='{target_language}'>"
-            ssml += text
+            ssml += escaped_text
             ssml += "</lang>"
         else:
-            ssml += text
+            ssml += escaped_text

         if self._settings["emphasis"]:
             ssml += "</emphasis>"
@@ -210,9 +277,42 @@ class AzureBaseTTSService(TTSService):

         return ssml

+    def _escape_text(self, text: str) -> str:
+        """Escapes XML/SSML reserved characters according to Microsoft documentation.
+
+        This method escapes the following characters:
+        - & becomes &amp;
+        - < becomes &lt;
+        - > becomes &gt;
+        - " becomes &quot;
+        - ' becomes &apos;
+
+        Args:
+            text: The text to escape.
+
+        Returns:
+            The escaped text.
+        """
+        escaped_text = text
+        for char, escape_code in AzureBaseTTSService.SSML_ESCAPE_CHARS.items():
+            escaped_text = escaped_text.replace(char, escape_code)
+        return escaped_text
+

 class AzureTTSService(AzureBaseTTSService):
+    """Azure Cognitive Services streaming TTS service.
+
+    Provides real-time text-to-speech synthesis using Azure's WebSocket-based
+    streaming API. Audio chunks are streamed as they become available for
+    lower latency playback.
+    """
+
     def __init__(self, **kwargs):
+        """Initialize the Azure streaming TTS service.
+
+        Args:
+            **kwargs: All arguments passed to AzureBaseTTSService parent class.
+        """
         super().__init__(**kwargs)
         self._speech_config = None
         self._speech_synthesizer = None
@@ -220,6 +320,11 @@ class AzureTTSService(AzureBaseTTSService):
         self._clear_audio = False

     async def start(self, frame: StartFrame):
+        """Start the Azure TTS service and initialize speech synthesizer.
+
+        Args:
+            frame: Start frame containing initialization parameters.
+        """
         await super().start(frame)

         if self._speech_config:
@@ -250,12 +355,12 @@ class AzureTTSService(AzureBaseTTSService):
         self._speech_synthesizer.synthesis_canceled.connect(self._handle_canceled)

     def _handle_synthesizing(self, evt):
-        """Handle audio chunks as they arrive"""
+        """Handle audio chunks as they arrive."""
         if evt.result and evt.result.audio_data:
             self._audio_queue.put_nowait(evt.result.audio_data)

     def _handle_completed(self, evt):
-        """Handle synthesis completion"""
+        """Handle synthesis completion."""
         self._audio_queue.put_nowait(None)  # Signal completion

     def _handle_canceled(self, evt):
@@ -263,29 +368,30 @@ class AzureTTSService(AzureBaseTTSService):
         self.logger.error(f"Speech synthesis canceled: {evt.result.cancellation_details.reason}")
         self._audio_queue.put_nowait(None)

-    async def flush_audio(self):
-        """Flush any pending audio data."""
-        self._clear_audio = True
-        if self._speech_synthesizer is not None:
-            future = self._speech_synthesizer.stop_speaking_async()
-
-            async def wait_for_future_completion():
-                loop = self.get_event_loop()
-                await loop.run_in_executor(None, future.get)
-
-            task = self.create_task(wait_for_future_completion())
-            await self.wait_for_task(task)
-            while not self._audio_queue.empty():
-                try:
-                    self._audio_queue.get_nowait()
-                except asyncio.QueueEmpty:
-                    break
-            self._clear_audio = False
+    async def flush_audio(self):
+        """Flush any pending audio data."""
+        logger.trace(f"{self}: flushing audio")

     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Azure's streaming synthesis.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing synthesized speech data.
+        """
         text = text.lstrip()
         self.logger.debug(f"{self}: Generating TTS [{text}]")
+
+        # Clear the audio queue in case there's still audio in it, causing the next audio response
+        # to be cut off by the 'None' element returned at the end of the previous audio synthesis.
+        # Empty the audio queue before processing the new text
+        while not self._audio_queue.empty():
+            self._audio_queue.get_nowait()
+            self._audio_queue.task_done()
+
         try:
             if self._speech_synthesizer is None:
                 error_msg = "Speech synthesizer not initialized."
@@ -324,12 +430,29 @@ class AzureTTSService(AzureBaseTTSService):


 class AzureHttpTTSService(AzureBaseTTSService):
+    """Azure Cognitive Services HTTP-based TTS service.
+
+    Provides text-to-speech synthesis using Azure's HTTP API for simpler,
+    non-streaming synthesis. Suitable for use cases where streaming is not
+    required and simpler integration is preferred.
+    """
+
     def __init__(self, **kwargs):
+        """Initialize the Azure HTTP TTS service.
+
+        Args:
+            **kwargs: All arguments passed to AzureBaseTTSService parent class.
+        """
         super().__init__(**kwargs)
         self._speech_config = None
         self._speech_synthesizer = None

     async def start(self, frame: StartFrame):
+        """Start the Azure HTTP TTS service and initialize speech synthesizer.
+
+        Args:
+            frame: Start frame containing initialization parameters.
+        """
         await super().start(frame)

         if self._speech_config:
@@ -349,6 +472,14 @@ class AzureHttpTTSService(AzureBaseTTSService):

     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
+        """Generate speech from text using Azure's HTTP synthesis API.
+
+        Args:
+            text: The text to synthesize into speech.
+
+        Yields:
+            Frame: Audio frames containing the complete synthesized speech.
+        """
         logger.debug(f"{self}: Generating TTS [{text}]")

         await self.start_ttfb_metrics()
pipecat/services/cartesia/stt.py
CHANGED
@@ -4,12 +4,17 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

+"""Cartesia Speech-to-Text service implementation.
+
+This module provides a WebSocket-based STT service that integrates with
+the Cartesia Live transcription API for real-time speech recognition.
+"""
+
 import asyncio
 import json
 import urllib.parse
 from typing import AsyncGenerator, Optional

-import websockets
 from loguru import logger

 from pipecat.frames.frames import (
@@ -28,8 +33,23 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

+try:
+    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Cartesia, you need to `pip install pipecat-ai[cartesia]`.")
+    raise Exception(f"Missing module: {e}")
+

 class CartesiaLiveOptions:
+    """Configuration options for Cartesia Live STT service.
+
+    Manages transcription parameters including model selection, language,
+    audio encoding format, and sample rate settings.
+    """
+
     def __init__(
         self,
         *,
@@ -39,6 +59,15 @@ class CartesiaLiveOptions:
         sample_rate: int = 16000,
         **kwargs,
     ):
+        """Initialize CartesiaLiveOptions with default or provided parameters.
+
+        Args:
+            model: The transcription model to use. Defaults to "ink-whisper".
+            language: Target language for transcription. Defaults to English.
+            encoding: Audio encoding format. Defaults to "pcm_s16le".
+            sample_rate: Audio sample rate in Hz. Defaults to 16000.
+            **kwargs: Additional parameters for the transcription service.
+        """
         self.model = model
         self.language = language
         self.encoding = encoding
@@ -46,6 +75,11 @@ class CartesiaLiveOptions:
         self.additional_params = kwargs

     def to_dict(self):
+        """Convert options to dictionary format.
+
+        Returns:
+            Dictionary containing all configuration parameters.
+        """
         params = {
             "model": self.model,
             "language": self.language if isinstance(self.language, str) else self.language.value,
@@ -56,19 +90,48 @@ class CartesiaLiveOptions:
         return params

     def items(self):
+        """Get configuration items as key-value pairs.
+
+        Returns:
+            Iterator of (key, value) tuples for all configuration parameters.
+        """
         return self.to_dict().items()

     def get(self, key, default=None):
+        """Get a configuration value by key.
+
+        Args:
+            key: The configuration parameter name to retrieve.
+            default: Default value if key is not found.
+
+        Returns:
+            The configuration value or default if not found.
+        """
         if hasattr(self, key):
             return getattr(self, key)
         return self.additional_params.get(key, default)

     @classmethod
     def from_json(cls, json_str: str) -> "CartesiaLiveOptions":
+        """Create options from JSON string.
+
+        Args:
+            json_str: JSON string containing configuration parameters.
+
+        Returns:
+            New CartesiaLiveOptions instance with parsed parameters.
+        """
         return cls(**json.loads(json_str))


 class CartesiaSTTService(STTService):
+    """Speech-to-text service using Cartesia Live API.
+
+    Provides real-time speech transcription through WebSocket connection
+    to Cartesia's Live transcription service. Supports both interim and
+    final transcriptions with configurable models and languages.
+    """
+
     def __init__(
         self,
         *,
@@ -78,6 +141,15 @@ class CartesiaSTTService(STTService):
         live_options: Optional[CartesiaLiveOptions] = None,
         **kwargs,
     ):
+        """Initialize CartesiaSTTService with API key and options.
+
+        Args:
+            api_key: Authentication key for Cartesia API.
+            base_url: Custom API endpoint URL. If empty, uses default.
+            sample_rate: Audio sample rate in Hz. Defaults to 16000.
+            live_options: Configuration options for transcription service.
+            **kwargs: Additional arguments passed to parent STTService.
+        """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
         super().__init__(sample_rate=sample_rate, **kwargs)

@@ -108,23 +180,51 @@ class CartesiaSTTService(STTService):
         self._receiver_task = None

     def can_generate_metrics(self) -> bool:
+        """Check if the service can generate processing metrics.
+
+        Returns:
+            True, indicating metrics are supported.
+        """
         return True

     async def start(self, frame: StartFrame):
+        """Start the STT service and establish connection.
+
+        Args:
+            frame: Frame indicating service should start.
+        """
         await super().start(frame)
         await self._connect()

     async def stop(self, frame: EndFrame):
+        """Stop the STT service and close connection.
+
+        Args:
+            frame: Frame indicating service should stop.
+        """
         await super().stop(frame)
         await self._disconnect()

     async def cancel(self, frame: CancelFrame):
+        """Cancel the STT service and close connection.
+
+        Args:
+            frame: Frame indicating service should be cancelled.
+        """
         await super().cancel(frame)
         await self._disconnect()

     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Process audio data for speech-to-text transcription.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            None - transcription results are handled via WebSocket responses.
+        """
         # If the connection is closed, due to timeout, we need to reconnect when the user starts speaking again
-        if not self._connection or self._connection.closed:
+        if not self._connection or self._connection.state is State.CLOSED:
             await self._connect()

         await self._connection.send(audio)
@@ -137,7 +237,7 @@ class CartesiaSTTService(STTService):
         headers = {"Cartesia-Version": "2025-04-16", "X-API-Key": self._api_key}

         try:
-            self._connection = await websockets.connect(ws_url, extra_headers=headers)
+            self._connection = await websocket_connect(ws_url, additional_headers=headers)
             # Setup the receiver task to handle the incoming messages from the Cartesia server
             if self._receiver_task is None or self._receiver_task.done():
                 self._receiver_task = asyncio.create_task(self._receive_messages())
@@ -148,7 +248,7 @@ class CartesiaSTTService(STTService):
     async def _receive_messages(self):
         try:
             while True:
-                if not self._connection or self._connection.closed:
+                if not self._connection or self._connection.state is State.CLOSED:
                     break

                 message = await self._connection.recv()
@@ -197,14 +297,24 @@ class CartesiaSTTService(STTService):
                 await self.stop_ttfb_metrics()
                 if is_final:
                     await self.push_frame(
-                        TranscriptionFrame(transcript, self._user_id, time_now_iso8601(), language)
+                        TranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                        )
                     )
                     await self._handle_transcription(transcript, is_final, language)
                     await self.stop_processing_metrics()
                 else:
                     # For interim transcriptions, just push the frame without tracing
                     await self.push_frame(
-                        InterimTranscriptionFrame(transcript, self._user_id, time_now_iso8601(), language)
+                        InterimTranscriptionFrame(
+                            transcript,
+                            self._user_id,
+                            time_now_iso8601(),
+                            language,
+                        )
                     )

     async def _disconnect(self):
@@ -218,22 +328,29 @@ class CartesiaSTTService(STTService):
                 logger.exception(f"Unexpected exception while cancelling task: {e}")
             self._receiver_task = None

-        if self._connection and self._connection.open:
+        if self._connection and self._connection.state is State.OPEN:
             logger.debug("Disconnecting from Cartesia")

             await self._connection.close()
             self._connection = None

     async def start_metrics(self):
+        """Start performance metrics collection for transcription processing."""
         await self.start_ttfb_metrics()
         await self.start_processing_metrics()

     async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process incoming frames and handle speech events.
+
+        Args:
+            frame: The frame to process.
+            direction: Direction of frame flow in the pipeline.
+        """
         await super().process_frame(frame, direction)

         if isinstance(frame, UserStartedSpeakingFrame):
             await self.start_metrics()
         elif isinstance(frame, UserStoppedSpeakingFrame):
             # Send finalize command to flush the transcription session
-            if self._connection and self._connection.open:
+            if self._connection and self._connection.state is State.OPEN:
                 await self._connection.send("finalize")