dv-pipecat-ai 0.0.74.dev770__py3-none-any.whl → 0.0.82.dev776__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/METADATA +137 -93
- dv_pipecat_ai-0.0.82.dev776.dist-info/RECORD +340 -0
- pipecat/__init__.py +17 -0
- pipecat/adapters/base_llm_adapter.py +36 -1
- pipecat/adapters/schemas/direct_function.py +296 -0
- pipecat/adapters/schemas/function_schema.py +15 -6
- pipecat/adapters/schemas/tools_schema.py +55 -7
- pipecat/adapters/services/anthropic_adapter.py +22 -3
- pipecat/adapters/services/aws_nova_sonic_adapter.py +23 -3
- pipecat/adapters/services/bedrock_adapter.py +22 -3
- pipecat/adapters/services/gemini_adapter.py +16 -3
- pipecat/adapters/services/open_ai_adapter.py +17 -2
- pipecat/adapters/services/open_ai_realtime_adapter.py +23 -3
- pipecat/audio/filters/base_audio_filter.py +30 -6
- pipecat/audio/filters/koala_filter.py +37 -2
- pipecat/audio/filters/krisp_filter.py +59 -6
- pipecat/audio/filters/noisereduce_filter.py +37 -0
- pipecat/audio/interruptions/base_interruption_strategy.py +25 -5
- pipecat/audio/interruptions/min_words_interruption_strategy.py +21 -4
- pipecat/audio/mixers/base_audio_mixer.py +30 -7
- pipecat/audio/mixers/soundfile_mixer.py +53 -6
- pipecat/audio/resamplers/base_audio_resampler.py +17 -9
- pipecat/audio/resamplers/resampy_resampler.py +26 -1
- pipecat/audio/resamplers/soxr_resampler.py +32 -1
- pipecat/audio/resamplers/soxr_stream_resampler.py +101 -0
- pipecat/audio/utils.py +194 -1
- pipecat/audio/vad/silero.py +60 -3
- pipecat/audio/vad/vad_analyzer.py +114 -30
- pipecat/clocks/base_clock.py +19 -0
- pipecat/clocks/system_clock.py +25 -0
- pipecat/extensions/voicemail/__init__.py +0 -0
- pipecat/extensions/voicemail/voicemail_detector.py +707 -0
- pipecat/frames/frames.py +590 -156
- pipecat/metrics/metrics.py +64 -1
- pipecat/observers/base_observer.py +58 -19
- pipecat/observers/loggers/debug_log_observer.py +56 -64
- pipecat/observers/loggers/llm_log_observer.py +8 -1
- pipecat/observers/loggers/transcription_log_observer.py +19 -7
- pipecat/observers/loggers/user_bot_latency_log_observer.py +32 -5
- pipecat/observers/turn_tracking_observer.py +26 -1
- pipecat/pipeline/base_pipeline.py +5 -7
- pipecat/pipeline/base_task.py +52 -9
- pipecat/pipeline/parallel_pipeline.py +121 -177
- pipecat/pipeline/pipeline.py +129 -20
- pipecat/pipeline/runner.py +50 -1
- pipecat/pipeline/sync_parallel_pipeline.py +132 -32
- pipecat/pipeline/task.py +263 -280
- pipecat/pipeline/task_observer.py +85 -34
- pipecat/pipeline/to_be_updated/merge_pipeline.py +32 -2
- pipecat/processors/aggregators/dtmf_aggregator.py +29 -22
- pipecat/processors/aggregators/gated.py +25 -24
- pipecat/processors/aggregators/gated_openai_llm_context.py +22 -2
- pipecat/processors/aggregators/llm_response.py +398 -89
- pipecat/processors/aggregators/openai_llm_context.py +161 -13
- pipecat/processors/aggregators/sentence.py +25 -14
- pipecat/processors/aggregators/user_response.py +28 -3
- pipecat/processors/aggregators/vision_image_frame.py +24 -14
- pipecat/processors/async_generator.py +28 -0
- pipecat/processors/audio/audio_buffer_processor.py +78 -37
- pipecat/processors/consumer_processor.py +25 -6
- pipecat/processors/filters/frame_filter.py +23 -0
- pipecat/processors/filters/function_filter.py +30 -0
- pipecat/processors/filters/identity_filter.py +17 -2
- pipecat/processors/filters/null_filter.py +24 -1
- pipecat/processors/filters/stt_mute_filter.py +56 -21
- pipecat/processors/filters/wake_check_filter.py +46 -3
- pipecat/processors/filters/wake_notifier_filter.py +21 -3
- pipecat/processors/frame_processor.py +488 -131
- pipecat/processors/frameworks/langchain.py +38 -3
- pipecat/processors/frameworks/rtvi.py +719 -34
- pipecat/processors/gstreamer/pipeline_source.py +41 -0
- pipecat/processors/idle_frame_processor.py +26 -3
- pipecat/processors/logger.py +23 -0
- pipecat/processors/metrics/frame_processor_metrics.py +77 -4
- pipecat/processors/metrics/sentry.py +42 -4
- pipecat/processors/producer_processor.py +34 -14
- pipecat/processors/text_transformer.py +22 -10
- pipecat/processors/transcript_processor.py +48 -29
- pipecat/processors/user_idle_processor.py +31 -21
- pipecat/runner/__init__.py +1 -0
- pipecat/runner/daily.py +132 -0
- pipecat/runner/livekit.py +148 -0
- pipecat/runner/run.py +543 -0
- pipecat/runner/types.py +67 -0
- pipecat/runner/utils.py +515 -0
- pipecat/serializers/base_serializer.py +42 -0
- pipecat/serializers/exotel.py +17 -6
- pipecat/serializers/genesys.py +95 -0
- pipecat/serializers/livekit.py +33 -0
- pipecat/serializers/plivo.py +16 -15
- pipecat/serializers/protobuf.py +37 -1
- pipecat/serializers/telnyx.py +18 -17
- pipecat/serializers/twilio.py +32 -16
- pipecat/services/ai_service.py +5 -3
- pipecat/services/anthropic/llm.py +113 -43
- pipecat/services/assemblyai/models.py +63 -5
- pipecat/services/assemblyai/stt.py +64 -11
- pipecat/services/asyncai/__init__.py +0 -0
- pipecat/services/asyncai/tts.py +501 -0
- pipecat/services/aws/llm.py +185 -111
- pipecat/services/aws/stt.py +217 -23
- pipecat/services/aws/tts.py +118 -52
- pipecat/services/aws/utils.py +101 -5
- pipecat/services/aws_nova_sonic/aws.py +82 -64
- pipecat/services/aws_nova_sonic/context.py +15 -6
- pipecat/services/azure/common.py +10 -2
- pipecat/services/azure/image.py +32 -0
- pipecat/services/azure/llm.py +9 -7
- pipecat/services/azure/stt.py +65 -2
- pipecat/services/azure/tts.py +154 -23
- pipecat/services/cartesia/stt.py +125 -8
- pipecat/services/cartesia/tts.py +102 -38
- pipecat/services/cerebras/llm.py +15 -23
- pipecat/services/deepgram/stt.py +19 -11
- pipecat/services/deepgram/tts.py +36 -0
- pipecat/services/deepseek/llm.py +14 -23
- pipecat/services/elevenlabs/tts.py +330 -64
- pipecat/services/fal/image.py +43 -0
- pipecat/services/fal/stt.py +48 -10
- pipecat/services/fireworks/llm.py +14 -21
- pipecat/services/fish/tts.py +109 -9
- pipecat/services/gemini_multimodal_live/__init__.py +1 -0
- pipecat/services/gemini_multimodal_live/events.py +83 -2
- pipecat/services/gemini_multimodal_live/file_api.py +189 -0
- pipecat/services/gemini_multimodal_live/gemini.py +218 -21
- pipecat/services/gladia/config.py +17 -10
- pipecat/services/gladia/stt.py +82 -36
- pipecat/services/google/frames.py +40 -0
- pipecat/services/google/google.py +2 -0
- pipecat/services/google/image.py +39 -2
- pipecat/services/google/llm.py +176 -58
- pipecat/services/google/llm_openai.py +26 -4
- pipecat/services/google/llm_vertex.py +37 -15
- pipecat/services/google/rtvi.py +41 -0
- pipecat/services/google/stt.py +65 -17
- pipecat/services/google/test-google-chirp.py +45 -0
- pipecat/services/google/tts.py +390 -19
- pipecat/services/grok/llm.py +8 -6
- pipecat/services/groq/llm.py +8 -6
- pipecat/services/groq/stt.py +13 -9
- pipecat/services/groq/tts.py +40 -0
- pipecat/services/hamsa/__init__.py +9 -0
- pipecat/services/hamsa/stt.py +241 -0
- pipecat/services/heygen/__init__.py +5 -0
- pipecat/services/heygen/api.py +281 -0
- pipecat/services/heygen/client.py +620 -0
- pipecat/services/heygen/video.py +338 -0
- pipecat/services/image_service.py +5 -3
- pipecat/services/inworld/__init__.py +1 -0
- pipecat/services/inworld/tts.py +592 -0
- pipecat/services/llm_service.py +127 -45
- pipecat/services/lmnt/tts.py +80 -7
- pipecat/services/mcp_service.py +85 -44
- pipecat/services/mem0/memory.py +42 -13
- pipecat/services/minimax/tts.py +74 -15
- pipecat/services/mistral/__init__.py +0 -0
- pipecat/services/mistral/llm.py +185 -0
- pipecat/services/moondream/vision.py +55 -10
- pipecat/services/neuphonic/tts.py +275 -48
- pipecat/services/nim/llm.py +8 -6
- pipecat/services/ollama/llm.py +27 -7
- pipecat/services/openai/base_llm.py +54 -16
- pipecat/services/openai/image.py +30 -0
- pipecat/services/openai/llm.py +7 -5
- pipecat/services/openai/stt.py +13 -9
- pipecat/services/openai/tts.py +42 -10
- pipecat/services/openai_realtime_beta/azure.py +11 -9
- pipecat/services/openai_realtime_beta/context.py +7 -5
- pipecat/services/openai_realtime_beta/events.py +10 -7
- pipecat/services/openai_realtime_beta/openai.py +37 -18
- pipecat/services/openpipe/llm.py +30 -24
- pipecat/services/openrouter/llm.py +9 -7
- pipecat/services/perplexity/llm.py +15 -19
- pipecat/services/piper/tts.py +26 -12
- pipecat/services/playht/tts.py +227 -65
- pipecat/services/qwen/llm.py +8 -6
- pipecat/services/rime/tts.py +128 -17
- pipecat/services/riva/stt.py +160 -22
- pipecat/services/riva/tts.py +67 -2
- pipecat/services/sambanova/llm.py +19 -17
- pipecat/services/sambanova/stt.py +14 -8
- pipecat/services/sarvam/tts.py +60 -13
- pipecat/services/simli/video.py +82 -21
- pipecat/services/soniox/__init__.py +0 -0
- pipecat/services/soniox/stt.py +398 -0
- pipecat/services/speechmatics/stt.py +29 -17
- pipecat/services/stt_service.py +47 -11
- pipecat/services/tavus/video.py +94 -25
- pipecat/services/together/llm.py +8 -6
- pipecat/services/tts_service.py +77 -53
- pipecat/services/ultravox/stt.py +46 -43
- pipecat/services/vision_service.py +5 -3
- pipecat/services/websocket_service.py +12 -11
- pipecat/services/whisper/base_stt.py +58 -12
- pipecat/services/whisper/stt.py +69 -58
- pipecat/services/xtts/tts.py +59 -2
- pipecat/sync/base_notifier.py +19 -0
- pipecat/sync/event_notifier.py +24 -0
- pipecat/tests/utils.py +73 -5
- pipecat/transcriptions/language.py +24 -0
- pipecat/transports/base_input.py +112 -8
- pipecat/transports/base_output.py +235 -13
- pipecat/transports/base_transport.py +119 -0
- pipecat/transports/local/audio.py +76 -0
- pipecat/transports/local/tk.py +84 -0
- pipecat/transports/network/fastapi_websocket.py +174 -15
- pipecat/transports/network/small_webrtc.py +383 -39
- pipecat/transports/network/webrtc_connection.py +214 -8
- pipecat/transports/network/websocket_client.py +171 -1
- pipecat/transports/network/websocket_server.py +147 -9
- pipecat/transports/services/daily.py +792 -70
- pipecat/transports/services/helpers/daily_rest.py +122 -129
- pipecat/transports/services/livekit.py +339 -4
- pipecat/transports/services/tavus.py +273 -38
- pipecat/utils/asyncio/task_manager.py +92 -186
- pipecat/utils/base_object.py +83 -1
- pipecat/utils/network.py +2 -0
- pipecat/utils/string.py +114 -58
- pipecat/utils/text/base_text_aggregator.py +44 -13
- pipecat/utils/text/base_text_filter.py +46 -0
- pipecat/utils/text/markdown_text_filter.py +70 -14
- pipecat/utils/text/pattern_pair_aggregator.py +18 -14
- pipecat/utils/text/simple_text_aggregator.py +43 -2
- pipecat/utils/text/skip_tags_aggregator.py +21 -13
- pipecat/utils/time.py +36 -0
- pipecat/utils/tracing/class_decorators.py +32 -7
- pipecat/utils/tracing/conversation_context_provider.py +12 -2
- pipecat/utils/tracing/service_attributes.py +80 -64
- pipecat/utils/tracing/service_decorators.py +48 -21
- pipecat/utils/tracing/setup.py +13 -7
- pipecat/utils/tracing/turn_context_provider.py +12 -2
- pipecat/utils/tracing/turn_trace_observer.py +27 -0
- pipecat/utils/utils.py +14 -14
- dv_pipecat_ai-0.0.74.dev770.dist-info/RECORD +0 -319
- pipecat/examples/daily_runner.py +0 -64
- pipecat/examples/run.py +0 -265
- pipecat/utils/asyncio/watchdog_async_iterator.py +0 -72
- pipecat/utils/asyncio/watchdog_event.py +0 -42
- pipecat/utils/asyncio/watchdog_priority_queue.py +0 -48
- pipecat/utils/asyncio/watchdog_queue.py +0 -48
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.74.dev770.dist-info → dv_pipecat_ai-0.0.82.dev776.dist-info}/top_level.txt +0 -0
- /pipecat/{examples → extensions}/__init__.py +0 -0
pipecat/services/soniox/stt.py
ADDED
@@ -0,0 +1,398 @@
+#
+# Copyright (c) 2024–2025, Daily
+#
+# SPDX-License-Identifier: BSD 2-Clause License
+#
+
+"""Soniox speech-to-text service implementation."""
+
+import asyncio
+import json
+import time
+from typing import AsyncGenerator, List, Optional
+
+from loguru import logger
+from pydantic import BaseModel
+
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import STTService
+from pipecat.transcriptions.language import Language
+from pipecat.utils.time import time_now_iso8601
+from pipecat.utils.tracing.service_decorators import traced_stt
+
+try:
+    import websockets
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError as e:
+    logger.error(f"Exception: {e}")
+    logger.error("In order to use Soniox, you need to `pip install pipecat-ai[soniox]`.")
+    raise Exception(f"Missing module: {e}")
+
+
+KEEPALIVE_MESSAGE = '{"type": "keepalive"}'
+
+FINALIZE_MESSAGE = '{"type": "finalize"}'
+
+END_TOKEN = "<end>"
+
+FINALIZED_TOKEN = "<fin>"
+
+
+class SonioxInputParams(BaseModel):
+    """Real-time transcription settings.
+
+    See Soniox WebSocket API documentation for more details:
+    https://soniox.com/docs/speech-to-text/api-reference/websocket-api#configuration-parameters
+
+    Parameters:
+        model: Model to use for transcription.
+        audio_format: Audio format to use for transcription.
+        num_channels: Number of channels to use for transcription.
+        language_hints: List of language hints to use for transcription.
+        context: Customization for transcription.
+        enable_non_final_tokens: Whether to enable non-final tokens. If false, only final tokens will be returned.
+        max_non_final_tokens_duration_ms: Maximum duration of non-final tokens.
+        client_reference_id: Client reference ID to use for transcription.
+    """
+
+    model: str = "stt-rt-preview"
+
+    audio_format: Optional[str] = "pcm_s16le"
+    num_channels: Optional[int] = 1
+
+    language_hints: Optional[List[Language]] = None
+    context: Optional[str] = None
+
+    enable_non_final_tokens: Optional[bool] = True
+    max_non_final_tokens_duration_ms: Optional[int] = None
+
+    client_reference_id: Optional[str] = None
+
+
+def is_end_token(token: dict) -> bool:
+    """Determine if a token is an end token."""
+    return token["text"] == END_TOKEN or token["text"] == FINALIZED_TOKEN
+
+
+def language_to_soniox_language(language: Language) -> str:
+    """Pipecat Language enum uses the same ISO 2-letter codes as Soniox, except with added regional variants.
+
+    For a list of all supported languages, see: https://soniox.com/docs/speech-to-text/core-concepts/supported-languages
+    """
+    lang_str = str(language.value).lower()
+    if "-" in lang_str:
+        return lang_str.split("-")[0]
+    return lang_str
+
+
+def _prepare_language_hints(
+    language_hints: Optional[List[Language]],
+) -> Optional[List[str]]:
+    if language_hints is None:
+        return None
+
+    prepared_languages = [language_to_soniox_language(lang) for lang in language_hints]
+    # Remove duplicates (in case of language_hints with multiple regions).
+    return list(set(prepared_languages))
+
+
+class SonioxSTTService(STTService):
+    """Speech-to-Text service using Soniox's WebSocket API.
+
+    This service connects to Soniox's WebSocket API for real-time transcription
+    with support for multiple languages, custom context, speaker diarization,
+    and more.
+
+    For complete API documentation, see: https://soniox.com/docs/speech-to-text/api-reference/websocket-api
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        url: str = "wss://stt-rt.soniox.com/transcribe-websocket",
+        sample_rate: Optional[int] = None,
+        params: Optional[SonioxInputParams] = None,
+        vad_force_turn_endpoint: bool = False,
+        **kwargs,
+    ):
+        """Initialize the Soniox STT service.
+
+        Args:
+            api_key: Soniox API key.
+            url: Soniox WebSocket API URL.
+            sample_rate: Audio sample rate.
+            params: Additional configuration parameters, such as language hints, context and
+                speaker diarization.
+            vad_force_turn_endpoint: Listen to `UserStoppedSpeakingFrame` to send a finalize
+                message to Soniox. If disabled, Soniox will detect the end of speech.
+            **kwargs: Additional arguments passed to the STTService.
+        """
+        super().__init__(sample_rate=sample_rate, **kwargs)
+        params = params or SonioxInputParams()
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(params.model)
+        self._params = params
+        self._vad_force_turn_endpoint = vad_force_turn_endpoint
+        self._websocket = None
+
+        self._final_transcription_buffer = []
+        self._last_tokens_received: Optional[float] = None
+
+        self._receive_task = None
+        self._keepalive_task = None
+
+    async def start(self, frame: StartFrame):
+        """Start the Soniox STT websocket connection.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        if self._websocket:
+            return
+
+        self._websocket = await websocket_connect(self._url)
+
+        if not self._websocket:
+            logger.error(f"Unable to connect to Soniox API at {self._url}")
+
+        # If vad_force_turn_endpoint is not enabled, we need to enable endpoint detection.
+        # Either one or the other is required.
+        enable_endpoint_detection = not self._vad_force_turn_endpoint
+
+        # Send the initial configuration message.
+        config = {
+            "api_key": self._api_key,
+            "model": self._model_name,
+            "audio_format": self._params.audio_format,
+            "num_channels": self._params.num_channels or 1,
+            "enable_endpoint_detection": enable_endpoint_detection,
+            "sample_rate": self.sample_rate,
+            "language_hints": _prepare_language_hints(self._params.language_hints),
+            "context": self._params.context,
+            "enable_non_final_tokens": self._params.enable_non_final_tokens,
+            "max_non_final_tokens_duration_ms": self._params.max_non_final_tokens_duration_ms,
+            "client_reference_id": self._params.client_reference_id,
+        }
+
+        # Send the configuration message.
+        await self._websocket.send(json.dumps(config))
+
+        if self._websocket and not self._receive_task:
+            self._receive_task = self.create_task(self._receive_task_handler())
+        if self._websocket and not self._keepalive_task:
+            self._keepalive_task = self.create_task(self._keepalive_task_handler())
+
+    async def _cleanup(self):
+        if self._keepalive_task:
+            await self.cancel_task(self._keepalive_task)
+            self._keepalive_task = None
+
+        if self._websocket:
+            await self._websocket.close()
+            self._websocket = None
+
+        if self._receive_task:
+            # Task cannot cancel itself. If the task called _cleanup() we expect it to cancel itself.
+            if self._receive_task != asyncio.current_task():
+                await self._receive_task
+            self._receive_task = None
+
+    async def stop(self, frame: EndFrame):
+        """Stop the Soniox STT websocket connection.
+
+        Stopping waits for the server to close the connection as we might receive
+        additional final tokens after sending the stop recording message.
+
+        Args:
+            frame: The end frame.
+        """
+        await super().stop(frame)
+        await self._send_stop_recording()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the Soniox STT websocket connection.
+
+        Compared to stop, this method closes the connection immediately without waiting
+        for the server to close it. This is useful when we want to stop the connection
+        immediately without waiting for the server to send any final tokens.
+
+        Args:
+            frame: The cancel frame.
+        """
+        await super().cancel(frame)
+        await self._cleanup()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Send audio data to the Soniox STT service.
+
+        Args:
+            audio: Raw audio bytes to transcribe.
+
+        Yields:
+            Frame: None (transcription results come via WebSocket callbacks).
+        """
+        await self.start_processing_metrics()
+        if self._websocket and self._websocket.state is State.OPEN:
+            await self._websocket.send(audio)
+        await self.stop_processing_metrics()
+
+        yield None
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        pass
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process a frame of audio data, either buffering or transcribing it.
+
+        Args:
+            frame: The frame to process.
+            direction: The direction of frame processing.
+        """
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStoppedSpeakingFrame) and self._vad_force_turn_endpoint:
+            # Send a finalize message to Soniox so we get the final tokens ASAP.
+            if self._websocket and self._websocket.state is State.OPEN:
+                await self._websocket.send(FINALIZE_MESSAGE)
+                logger.debug(f"Triggered finalize event on: {frame.name=}, {direction=}")
+
+    async def _send_stop_recording(self):
+        """Send a stop recording message to Soniox."""
+        if self._websocket and self._websocket.state is State.OPEN:
+            # An empty message tells Soniox to stop recording.
+            await self._websocket.send("")
+
+    async def _keepalive_task_handler(self):
+        """Connection has to be open all the time."""
+        try:
+            while True:
+                logger.trace("Sending keepalive message")
+                if self._websocket and self._websocket.state is State.OPEN:
+                    await self._websocket.send(KEEPALIVE_MESSAGE)
+                else:
+                    logger.debug("WebSocket connection closed.")
+                    break
+                await asyncio.sleep(5)
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection.
+            logger.debug("WebSocket connection closed, keepalive task stopped.")
+        except Exception as e:
+            logger.error(f"{self} error (_keepalive_task_handler): {e}")
+            await self.push_error(ErrorFrame(f"{self} error (_keepalive_task_handler): {e}"))
+
+    async def _receive_task_handler(self):
+        if not self._websocket:
+            return
+
+        # A transcription frame will only be sent after we get the "endpoint" event.
+        self._final_transcription_buffer = []
+
+        async def send_endpoint_transcript():
+            if self._final_transcription_buffer:
+                text = "".join(map(lambda token: token["text"], self._final_transcription_buffer))
+                await self.push_frame(
+                    TranscriptionFrame(
+                        text=text,
+                        user_id=self._user_id,
+                        timestamp=time_now_iso8601(),
+                        result=self._final_transcription_buffer,
+                    )
+                )
+                await self._handle_transcription(text, is_final=True)
+                await self.stop_processing_metrics()
+                self._final_transcription_buffer = []
+
+        try:
+            async for message in self._websocket:
+                content = json.loads(message)
+
+                tokens = content["tokens"]
+
+                if tokens:
+                    if len(tokens) == 1 and tokens[0]["text"] == FINALIZED_TOKEN:
+                        # Ignore the finalized token to prevent auto-finalize cycling.
+                        pass
+                    else:
+                        # Got at least one token, so we can reset the auto-finalize delay.
+                        self._last_tokens_received = time.time()
+
+                # We will only send the final tokens after we get the "endpoint" event.
+                non_final_transcription = []
+
+                for token in tokens:
+                    if token["is_final"]:
+                        if is_end_token(token):
+                            # Found an endpoint: tokens until here will be sent as a transcript,
+                            # the rest will be sent as interim tokens (even final tokens).
+                            await send_endpoint_transcript()
+                        else:
+                            self._final_transcription_buffer.append(token)
+                    else:
+                        non_final_transcription.append(token)
+
+                if self._final_transcription_buffer or non_final_transcription:
+                    final_text = "".join(
+                        map(lambda token: token["text"], self._final_transcription_buffer)
+                    )
+                    non_final_text = "".join(
+                        map(lambda token: token["text"], non_final_transcription)
+                    )
+
+                    await self.push_frame(
+                        InterimTranscriptionFrame(
+                            # Even final tokens are sent as interim tokens as we want to send
+                            # nicely formatted messages - therefore waiting for the endpoint.
+                            text=final_text + non_final_text,
+                            user_id=self._user_id,
+                            timestamp=time_now_iso8601(),
+                            result=self._final_transcription_buffer + non_final_transcription,
+                        )
+                    )
+
+                error_code = content.get("error_code")
+                error_message = content.get("error_message")
+                if error_code or error_message:
+                    # In case of an error, still send the final transcript (if any remains in the buffer).
+                    await send_endpoint_transcript()
+                    logger.error(
+                        f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                    )
+                    await self.push_error(
+                        ErrorFrame(
+                            f"{self} error: {error_code} (_receive_task_handler) - {error_message}"
+                        )
+                    )
+
+                finished = content.get("finished")
+                if finished:
+                    # When finished, still send the final transcript (if any remains in the buffer).
+                    await send_endpoint_transcript()
+                    logger.debug("Transcription finished.")
+                    await self._cleanup()
+                    return
+
+        except websockets.exceptions.ConnectionClosed:
+            # Expected when closing the connection.
+            pass
+        except Exception as e:
+            logger.error(f"{self} error: {e}")
+            await self.push_error(ErrorFrame(f"{self} error: {e}"))
pipecat/services/speechmatics/stt.py
CHANGED
@@ -23,6 +23,7 @@ from pipecat.frames.frames import (
     BotInterruptionFrame,
     CancelFrame,
     EndFrame,
+    ErrorFrame,
     Frame,
     InterimTranscriptionFrame,
     StartFrame,
@@ -463,8 +464,14 @@ class SpeechmaticsSTTService(STTService):
 
     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
         """Adds audio to the audio buffer and yields None."""
-
-
+        try:
+            if self._client:
+                await self._client.send_audio(audio)
+            yield None
+        except Exception as e:
+            logger.error(f"Speechmatics error: {e}")
+            yield ErrorFrame(f"Speechmatics error: {e}", fatal=False)
+            await self._disconnect()
 
     def update_params(
         self,
@@ -520,7 +527,7 @@ class SpeechmaticsSTTService(STTService):
         )
 
         # Log the event
-        logger.debug("
+        logger.debug(f"{self} Connecting to Speechmatics STT service")
 
         # Recognition started event
         @self._client.on(ServerMessageType.RECOGNITION_STARTED)
@@ -562,31 +569,36 @@ class SpeechmaticsSTTService(STTService):
         )
 
         # Start session
-
-
-
-
-
-
-
-
+        try:
+            await self._client.start_session(
+                transcription_config=self._transcription_config,
+                audio_format=AudioFormat(
+                    encoding=self._params.audio_encoding,
+                    sample_rate=self.sample_rate,
+                    chunk_size=self._params.chunk_size,
+                ),
+            )
+            logger.debug(f"{self} Connected to Speechmatics STT service")
+        except Exception as e:
+            logger.error(f"{self} Error connecting to Speechmatics: {e}")
+        finally:
+            self._client = None
 
     async def _disconnect(self) -> None:
         """Disconnect from the STT service."""
         # Disconnect the client
+        self.logger.debug(f"{self} Disconnecting from Speechmatics STT service")
         try:
             if self._client:
-                await asyncio.wait_for(self._client.close(), timeout=
+                await asyncio.wait_for(self._client.close(), timeout=5.0)
+                self.logger.debug(f"{self} Disconnected from Speechmatics STT service")
         except asyncio.TimeoutError:
-            logger.warning("Timeout while closing Speechmatics client connection")
+            logger.warning(f"{self} Timeout while closing Speechmatics client connection")
         except Exception as e:
-            logger.error(f"Error closing Speechmatics client: {e}")
+            logger.error(f"{self} Error closing Speechmatics client: {e}")
         finally:
             self._client = None
 
-        # Log the event
-        logger.debug("Disconnected from Speechmatics STT service")
-
     def _process_config(self) -> None:
         """Create a formatted STT transcription config.
 
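The run_stt change above reflects a convention worth noting: STT services report send failures by yielding a non-fatal ErrorFrame from the generator instead of raising, so the pipeline keeps running while the service tears down its connection. A self-contained toy model of that contract (the frame class here is a stand-in, not Pipecat's):

import asyncio
from dataclasses import dataclass
from typing import AsyncGenerator, Optional

@dataclass
class ErrorFrame:  # stand-in for pipecat.frames.frames.ErrorFrame
    error: str
    fatal: bool = False

async def run_stt(audio: bytes, fail: bool) -> AsyncGenerator[Optional[ErrorFrame], None]:
    # Mirror the pattern above: catch the failure and yield a non-fatal
    # error frame downstream, letting the caller keep consuming frames.
    try:
        if fail:
            raise ConnectionError("socket closed")
        yield None
    except Exception as e:
        yield ErrorFrame(f"Speechmatics error: {e}", fatal=False)

async def main():
    async for frame in run_stt(b"\x00\x01", fail=True):
        if isinstance(frame, ErrorFrame):
            print("pushed downstream:", frame)  # the pipeline loop continues

asyncio.run(main())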
pipecat/services/stt_service.py
CHANGED
@@ -34,13 +34,6 @@ class STTService(AIService):
     Provides common functionality for STT services including audio passthrough,
     muting, settings management, and audio processing. Subclasses must implement
     the run_stt method to provide actual speech recognition.
-
-    Args:
-        audio_passthrough: Whether to pass audio frames downstream after processing.
-            Defaults to True.
-        sample_rate: The sample rate for audio input. If None, will be determined
-            from the start frame.
-        **kwargs: Additional arguments passed to the parent AIService.
     """
 
     def __init__(
@@ -50,15 +43,26 @@ class STTService(AIService):
         sample_rate: Optional[int] = None,
         **kwargs,
     ):
+        """Initialize the STT service.
+
+        Args:
+            audio_passthrough: Whether to pass audio frames downstream after processing.
+                Defaults to True.
+            sample_rate: The sample rate for audio input. If None, will be determined
+                from the start frame.
+            **kwargs: Additional arguments passed to the parent AIService.
+        """
         super().__init__(**kwargs)
         self._audio_passthrough = audio_passthrough
         self._init_sample_rate = sample_rate
         self._sample_rate = 0
         self._settings: Dict[str, Any] = {}
+        self._tracing_enabled: bool = False
         self._muted: bool = False
         # Custom fields from ai_services.py for voicemail and first speech handling
         self._first_speech_handled: bool = False
         self._voicemail_detect: bool = False
+        self._user_id: str = ""
 
     @property
     def is_muted(self) -> bool:
@@ -119,6 +123,7 @@ class STTService(AIService):
         self._sample_rate = self._init_sample_rate or frame.audio_in_sample_rate
         if hasattr(frame, "metadata") and "voicemail_detect" in frame.metadata:
             self._voicemail_detect = frame.metadata["voicemail_detect"]
+        self._tracing_enabled = frame.enable_tracing
 
     async def _update_settings(self, settings: Mapping[str, Any]):
         self.logger.info(f"Updating STT settings: {self._settings}")
@@ -138,6 +143,11 @@ class STTService(AIService):
     async def process_audio_frame(self, frame: AudioRawFrame, direction: FrameDirection):
         """Process an audio frame for speech recognition.
 
+        If the service is muted, this method does nothing. Otherwise, it
+        processes the audio frame and runs speech-to-text on it, yielding
+        transcription results. If the frame has a user_id, it is stored
+        for later use in transcription.
+
         Args:
             frame: The audio frame to process.
             direction: The direction of frame processing.
@@ -146,6 +156,21 @@ class STTService(AIService):
         # If first speech is handled, we dont need to worry anymore.
         if self._muted and ((not self._voicemail_detect) or self._first_speech_handled):
             return
+
+        # UserAudioRawFrame contains a user_id (e.g. Daily, Livekit)
+        if hasattr(frame, "user_id"):
+            self._user_id = frame.user_id
+        # AudioRawFrame does not have a user_id (e.g. SmallWebRTCTransport, websockets)
+        else:
+            self._user_id = ""
+
+        if not frame.audio:
+            # Ignoring in case we don't have audio to transcribe.
+            logger.warning(
+                f"Empty audio frame received for STT service: {self.name} {frame.num_frames}"
+            )
+            return
+
         await self.process_generator(self.run_stt(frame.audio))
 
     async def process_frame(self, frame: Frame, direction: FrameDirection):
@@ -187,14 +212,16 @@ class SegmentedSTTService(STTService):
     Requires VAD to be enabled in the pipeline to function properly. Maintains a
     small audio buffer to account for the delay between actual speech start and
     VAD detection.
+    """
+
+    def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
+        """Initialize the segmented STT service.
 
-
+        Args:
             sample_rate: The sample rate for audio input. If None, will be determined
                 from the start frame.
             **kwargs: Additional arguments passed to the parent STTService.
-
-
-    def __init__(self, *, sample_rate: Optional[int] = None, **kwargs):
+        """
         super().__init__(sample_rate=sample_rate, **kwargs)
         self._content = None
         self._wave = None
@@ -251,10 +278,19 @@ class SegmentedSTTService(STTService):
         Continuously buffers audio, growing the buffer while user is speaking and
         maintaining a small buffer when not speaking to account for VAD delay.
 
+        If the frame has a user_id, it is stored for later use in transcription.
+
         Args:
             frame: The audio frame to process.
             direction: The direction of frame processing.
         """
+        # UserAudioRawFrame contains a user_id (e.g. Daily, Livekit)
+        if hasattr(frame, "user_id"):
+            self._user_id = frame.user_id
+        # AudioRawFrame does not have a user_id (e.g. SmallWebRTCTransport, websockets)
+        else:
+            self._user_id = ""
+
         # If the user is speaking the audio buffer will keep growing.
         self._audio_buffer += frame.audio
 
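The _user_id plumbing added above is what lets services such as the Soniox implementation earlier in this diff stamp TranscriptionFrame(user_id=...) with the right participant on multi-user transports. A self-contained toy model of the dispatch rule (the frame classes here are stand-ins, not Pipecat's):

# Toy model of the user_id rule added above: UserAudioRawFrame-style frames
# carry a user_id attribute, plain AudioRawFrame-style frames do not.
class AudioRawFrame:
    def __init__(self, audio: bytes):
        self.audio = audio

class UserAudioRawFrame(AudioRawFrame):
    def __init__(self, audio: bytes, user_id: str):
        super().__init__(audio)
        self.user_id = user_id

def resolve_user_id(frame: AudioRawFrame) -> str:
    # Same fallback as the diff: transports with no notion of users
    # (e.g. SmallWebRTCTransport, plain websockets) yield an empty id.
    return frame.user_id if hasattr(frame, "user_id") else ""

assert resolve_user_id(UserAudioRawFrame(b"\x00", "daily-participant-42")) == "daily-participant-42"
assert resolve_user_id(AudioRawFrame(b"\x00")) == ""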