dv-pipecat-ai 0.0.85.dev847__py3-none-any.whl → 0.0.85.dev850__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.85.dev847
+Version: 0.0.85.dev850
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev847.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev850.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -79,7 +79,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=JxmU2752iWP_1_GmzZReNESUTFAeyEa4XBPL20_C208,30004
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=vuYtmyK1QSU2AWx2c_pFQhcmpXqSTnfqAXF6DXKzTG8,49605
+pipecat/frames/frames.py,sha256=248d54lNOyO04dq9ni51yUTWUItmGw8b9QKarrDGNeo,50354
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -210,14 +210,14 @@ pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_o
 pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
 pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
 pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
-pipecat/services/deepgram/stt.py,sha256=fzKirTjTopwXNQEEPuUOIgk4AMvTJQcrh6H11w13q2c,16185
+pipecat/services/deepgram/stt.py,sha256=t7P0zWLBitSF_KQqHr5aYjKdJZRnC36styl_eL86R88,24752
 pipecat/services/deepgram/tts.py,sha256=H_2WCJEx3_L4ytrHHRNkA-6GKTd1coou_vvTfiEodpQ,3745
 pipecat/services/deepgram/flux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41gHMXVxEM,25887
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=F3xD82eOIy5OyyE-5StdoFFvKjIXlos2yyP0cyNQj6Y,12214
+pipecat/services/elevenlabs/stt.py,sha256=_RhBKpUYEGKMpcO7y4RLxmEOMK11LZFdZqDFIA-DZXk,27303
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -280,7 +280,7 @@ pipecat/services/nim/llm.py,sha256=o4WPGI6kOmSiMV7WwOZ0cNEAoq9hW4Aqs2R8X7c9i94,4
 pipecat/services/ollama/__init__.py,sha256=aw-25zYsR8LR74OFFlMKMTnJjaKwOzdPWVsClueNRkI,255
 pipecat/services/ollama/llm.py,sha256=rfpG92LRHGJlpENKhF6ld8CLVS9DxlKW-WRVNldOIGs,1605
 pipecat/services/openai/__init__.py,sha256=V0ZVa8PzEm3hmcStYICbAsYwfgk4ytZ6kiQoq9UZPmI,354
-pipecat/services/openai/base_llm.py,sha256=J4Ltg1KOXciiUIMBFLn0SmDTZereEE-1LKrPfBsLzFw,19127
+pipecat/services/openai/base_llm.py,sha256=jOiWacimREywCMZZwAwH8RAHCbwnnXvbqAjWQUYA0yM,20727
 pipecat/services/openai/image.py,sha256=3e3h-dVQ6DQuQE7fp8akXwRMd-oYOdGuZg7RCOjHu9A,2994
 pipecat/services/openai/llm.py,sha256=_aKtz1VebSFUUenT3tH6mBW9pSCm65_u45cDu_dkTzs,7396
 pipecat/services/openai/stt.py,sha256=Idf0k73kxFyDgNRBt62MFpoKKNsBV9bwvJteJ6MGWzQ,2419
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev847.dist-info/METADATA,sha256=UDO7OKPaUOT1xGmii54o0h6Ee5jwWVp2ztvSbSKK-KU,32955
-dv_pipecat_ai-0.0.85.dev847.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev847.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev847.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA,sha256=rqzfsDkrkClO-BvwwJr5_b2ggADWXFKhgzPgToBwDm0,32955
+dv_pipecat_ai-0.0.85.dev850.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev850.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD,,
pipecat/frames/frames.py CHANGED
@@ -586,6 +586,27 @@ class LLMRunFrame(DataFrame):
     pass
 
 
+@dataclass
+class WarmupLLMFrame(DataFrame):
+    """Frame to trigger prompt caching/warmup in supported LLM providers.
+
+    This frame instructs the LLM service to cache the provided messages
+    without generating a visible response. Primarily used for warming up provider
+    caches (e.g., Claude's prompt caching, OpenAI's prompt caching) to improve
+    latency for subsequent requests.
+
+    The LLM service should:
+    1. Send the messages to the provider to trigger caching
+    2. Generate a minimal response (e.g., a single word)
+    3. Discard the response without emitting LLM output frames
+
+    Parameters:
+        messages: List of messages to send for cache warming (should match conversation structure).
+    """
+
+    messages: List[dict]
+
+
 @dataclass
 class LLMMessagesAppendFrame(DataFrame):
     """Frame containing LLM messages to append to current context.
pipecat/services/deepgram/stt.py CHANGED
@@ -8,7 +8,11 @@
 
 import asyncio
 import logging
-from typing import AsyncGenerator, Dict, Optional
+import os
+import socket
+import time
+from typing import AsyncGenerator, Callable, Dict, Optional
+from urllib.parse import urlparse
 
 from loguru import logger
 
@@ -29,6 +33,155 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+_PROCESS_START_MONOTONIC = time.monotonic()
+
+
+def _read_first_numeric_file(paths):
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as file:
+                value = file.read().strip()
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+
+        if not value or value == "max":
+            return None
+
+        try:
+            return int(value)
+        except ValueError:
+            continue
+    return None
+
+
+def _read_proc_status_value(key):
+    try:
+        with open("/proc/self/status", "r", encoding="utf-8") as status_file:
+            for line in status_file:
+                if line.startswith(key):
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        return int(parts[1]) * 1024  # kB -> bytes
+    except FileNotFoundError:
+        return None
+    except OSError:
+        return None
+    return None
+
+
+def _read_cpu_throttling():
+    paths = ["/sys/fs/cgroup/cpu.stat", "/sys/fs/cgroup/cpu/cpu.stat"]
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as cpu_file:
+                for line in cpu_file:
+                    if line.startswith("nr_throttled"):
+                        parts = line.split()
+                        if len(parts) >= 2:
+                            return int(parts[1])
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+    return None
+
+
+def _collect_runtime_diagnostics(
+    loop: Optional[asyncio.AbstractEventLoop] = None,
+    extra_context: Optional[Dict] = None,
+    context_provider: Optional[Callable[[], Dict]] = None,
+):
+    if loop is None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+    uptime_s = round(time.monotonic() - _PROCESS_START_MONOTONIC, 1)
+    rss_bytes = _read_proc_status_value("VmRSS:")
+    rss_mb = round(rss_bytes / (1024**2), 2) if rss_bytes else None
+
+    cgroup_usage_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.current", "/sys/fs/cgroup/memory/memory.usage_in_bytes"]
+    )
+    cgroup_limit_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory/memory.limit_in_bytes"]
+    )
+    cgroup_usage_mb = (
+        round(cgroup_usage_bytes / (1024**2), 2) if cgroup_usage_bytes is not None else None
+    )
+    cgroup_limit_mb = (
+        round(cgroup_limit_bytes / (1024**2), 2) if cgroup_limit_bytes not in (None, 0) else None
+    )
+    cgroup_pct = (
+        round(cgroup_usage_bytes / cgroup_limit_bytes * 100, 2)
+        if cgroup_usage_bytes is not None and cgroup_limit_bytes not in (None, 0)
+        else None
+    )
+
+    try:
+        open_fds = len(os.listdir("/proc/self/fd"))
+    except Exception:
+        open_fds = None
+
+    pending_tasks = None
+    if loop:
+        try:
+            pending_tasks = len(asyncio.all_tasks(loop))
+        except Exception:
+            pending_tasks = None
+
+    suspected_cause = "unknown"
+    if cgroup_pct and cgroup_pct >= 90:
+        suspected_cause = "memory_pressure"
+    elif uptime_s < 180:
+        suspected_cause = "pod_cold_start"
+
+    diagnostics = {
+        "uptime_s": uptime_s,
+        "rss_mb": rss_mb,
+        "cgroup_usage_mb": cgroup_usage_mb,
+        "cgroup_limit_mb": cgroup_limit_mb,
+        "cgroup_usage_pct": cgroup_pct,
+        "open_fds": open_fds,
+        "pending_tasks": pending_tasks,
+        "suspected_cause": suspected_cause,
+    }
+    cpu_throttled = _read_cpu_throttling()
+    if cpu_throttled is not None:
+        diagnostics["cpu_nr_throttled"] = cpu_throttled
+
+    if context_provider:
+        try:
+            ctx = context_provider() or {}
+            if isinstance(ctx, dict):
+                diagnostics.update({k: v for k, v in ctx.items() if v is not None})
+        except Exception as exc:
+            diagnostics["context_provider_error"] = str(exc)
+
+    if extra_context:
+        diagnostics.update({k: v for k, v in extra_context.items() if v is not None})
+
+    return {k: v for k, v in diagnostics.items() if v is not None}
+
+
+def _derive_connect_endpoint(base_url: str):
+    if not base_url:
+        return "api.deepgram.com", 443
+
+    parsed = urlparse(base_url)
+    host = parsed.hostname or "api.deepgram.com"
+    if parsed.port:
+        port = parsed.port
+    elif parsed.scheme in ("https", "wss"):
+        port = 443
+    else:
+        port = 80
+    return host, port
+
+
 try:
     from deepgram import (
         AsyncListenWebSocketClient,
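
The collector is best-effort by design: every probe that fails (non-Linux host, missing cgroup files, no running event loop) yields None and is filtered out of the returned dict, so callers never see partial-failure noise. A hedged sketch of a direct call from inside this module, with a purely illustrative result:

diag = _collect_runtime_diagnostics(
    extra_context={"dns_ms": 12.5},
    context_provider=lambda: {"active_calls": 3},  # hypothetical app-level provider
)
# Illustrative output on a cgroup v2 pod (all values made up):
# {"uptime_s": 42.3, "rss_mb": 512.4, "cgroup_usage_mb": 900.0,
#  "cgroup_limit_mb": 1024.0, "cgroup_usage_pct": 87.9, "open_fds": 87,
#  "pending_tasks": 14, "suspected_cause": "pod_cold_start",
#  "dns_ms": 12.5, "active_calls": 3}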
@@ -64,6 +217,7 @@ class DeepgramSTTService(STTService):
         addons: Optional[Dict] = None,
         max_connect_retries: int = 3,
         connect_timeout_s: float = 2.5,
+        diagnostics_context_provider: Optional[Callable[[], Dict]] = None,
         **kwargs,
     ):
         """Initialize the Deepgram STT service.
@@ -82,6 +236,9 @@ class DeepgramSTTService(STTService):
             max_connect_retries: Maximum number of connection attempts before giving up.
             connect_timeout_s: Maximum time in seconds to wait for a connection attempt.
                 Connection retries wait 100ms between attempts.
+            diagnostics_context_provider: Optional callable returning a dict with
+                additional runtime diagnostics (e.g., active call counts) to append
+                to warning logs.
             **kwargs: Additional arguments passed to the parent STTService.
         """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
@@ -125,6 +282,7 @@
         self.set_model_name(merged_options["model"])
         self._settings = merged_options
         self._addons = addons
+        self._diagnostics_context_provider = diagnostics_context_provider
 
         # Connection retry settings (100ms delay between retries)
         self._max_connect_retries = max_connect_retries
@@ -142,6 +300,7 @@
                 verbose=logging.ERROR,  # Enable error level and above logging
             ),
         )
+        self._connect_host, self._connect_port = _derive_connect_endpoint(base_url)
 
         if self.vad_enabled:
             self._register_event_handler("on_speech_started")
@@ -230,7 +389,10 @@
     async def _connect(self):
         self.logger.debug("Attempting to connect to Deepgram...")
 
+        loop = asyncio.get_running_loop()
         for attempt in range(self._max_connect_retries):
+            attempt_started = time.perf_counter()
+            dns_ms = await self._measure_dns_resolution(loop)
             try:
                 # Clean up any previous connection attempt in background (non-blocking)
                 if hasattr(self, "_connection") and self._connection is not None:
@@ -266,18 +428,67 @@
                         timeout=self._connect_timeout_s,
                     )
                 except asyncio.TimeoutError:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
                     self.logger.warning(
-                        f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} timed out after {self._connect_timeout_s} second(s)."
+                        (
+                            "Deepgram connection attempt {}/{} timed out after {:.2f} second(s). "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        self._connect_timeout_s,
+                        diagnostics,
                     )
                     start_result = False
                 except Exception as start_error:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
                     self.logger.warning(
-                        f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} failed with an exception: {start_error}"
+                        (
+                            "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        start_error,
+                        diagnostics,
                     )
                     start_result = False
                 else:
                     if start_result:
-                        self.logger.info("Successfully connected to Deepgram.")
+                        elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                        diagnostics = _collect_runtime_diagnostics(
+                            loop,
+                            extra_context={
+                                "dns_ms": dns_ms,
+                                "connect_duration_ms": elapsed_ms,
+                            },
+                            context_provider=self._diagnostics_context_provider,
+                        )
+                        self.logger.info(
+                            (
+                                "Successfully connected to Deepgram on attempt {} in {:.2f} ms. "
+                                "runtime_diagnostics={}"
+                            ),
+                            attempt + 1,
+                            elapsed_ms,
+                            diagnostics,
+                        )
                         return  # Exit the method on success
 
                     self.logger.warning(
@@ -285,8 +496,24 @@
                     )
 
             except Exception as e:
+                elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                diagnostics = _collect_runtime_diagnostics(
+                    loop,
+                    extra_context={
+                        "dns_ms": dns_ms,
+                        "connect_duration_ms": elapsed_ms,
+                    },
+                    context_provider=self._diagnostics_context_provider,
+                )
                 self.logger.warning(
-                    f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} failed with an exception: {e}"
+                    (
+                        "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                        "runtime_diagnostics={}"
+                    ),
+                    attempt + 1,
+                    self._max_connect_retries,
+                    e,
+                    diagnostics,
                 )
 
             # If this is not the last attempt, wait 100ms before retrying
@@ -300,6 +527,22 @@
         self.logger.error(error_msg)
         await self.push_error(ErrorFrame(error_msg, fatal=True))
 
+    async def _measure_dns_resolution(self, loop: Optional[asyncio.AbstractEventLoop]):
+        if not loop or not self._connect_host:
+            return None
+        try:
+            dns_task = loop.getaddrinfo(
+                self._connect_host,
+                self._connect_port,
+                type=socket.SOCK_STREAM,
+                proto=socket.IPPROTO_TCP,
+            )
+            start = time.perf_counter()
+            await asyncio.wait_for(dns_task, timeout=1.0)
+            return round((time.perf_counter() - start) * 1000, 2)
+        except Exception:
+            return None
+
     async def _disconnect(self):
         # Guard against missing connection instance and ensure proper async check
         connection: AsyncListenWebSocketClient = getattr(self, "_connection", None)
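
`_derive_connect_endpoint` feeds `_measure_dns_resolution`, which times a getaddrinfo() against the same host the SDK will dial, so a slow lookup shows up as `dns_ms` in the diagnostics. The fallback behavior implied by the urlparse logic above:

assert _derive_connect_endpoint("") == ("api.deepgram.com", 443)
assert _derive_connect_endpoint("wss://api.deepgram.com") == ("api.deepgram.com", 443)
assert _derive_connect_endpoint("http://localhost:8080") == ("localhost", 8080)
assert _derive_connect_endpoint("http://proxy.internal") == ("proxy.internal", 80)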
pipecat/services/elevenlabs/stt.py CHANGED
@@ -4,26 +4,43 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service implementation.
-
-This module provides integration with ElevenLabs' Speech-to-Text API for transcription
-using segmented audio processing. The service uploads audio files and receives
-transcription results directly.
-"""
+"""ElevenLabs speech-to-text service implementations."""
 
+import asyncio
+import base64
 import io
-from typing import AsyncGenerator, Optional
+import json
+import urllib.parse
+from typing import Any, AsyncGenerator, Dict, Literal, Optional
 
 import aiohttp
 from loguru import logger
 from pydantic import BaseModel
 
-from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
-from pipecat.services.stt_service import SegmentedSTTService
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError:
+    websocket_connect = None  # type: ignore[assignment]
+    State = None  # type: ignore[assignment]
+
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -150,6 +167,19 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result
 
 
+def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
+    """Convert an ElevenLabs language code back to a Language enum value."""
+    if not language_code:
+        return None
+
+    normalized = language_code.lower()
+    for language in Language:
+        code = language_to_elevenlabs_language(language)
+        if code and code.lower() == normalized:
+            return language
+    return None
+
+
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
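
The reverse lookup scans the Language enum through the existing forward mapping, so it is case-insensitive and returns None for unknown codes. A small sketch of the expected behavior, assuming the forward mapping emits lowercase codes such as "en":

lang = elevenlabs_language_code_to_language("EN")  # normalized to "en" before matching
if lang is not None:
    # By construction, the result's forward mapping lowercases back to "en".
    assert language_to_elevenlabs_language(lang).lower() == "en"
assert elevenlabs_language_code_to_language(None) is None
assert elevenlabs_language_code_to_language("xx-not-a-code") is None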
@@ -337,3 +367,376 @@
         except Exception as e:
             self.logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+
+    class InputParams(BaseModel):
+        """Realtime connection parameters derived from ElevenLabs documentation."""
+
+        language: Optional[Language] = None
+        commit_strategy: Literal["manual", "vad"] = "manual"
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        sample_rate: Optional[int] = None,
+        model: str = "scribe_v2_realtime",
+        url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+        params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
+        reconnect_on_error: bool = True,
+        **kwargs,
+    ):
+        """Initialize the realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
+            model: Scribe realtime model identifier.
+            url: WebSocket endpoint for realtime transcription.
+            params: Optional realtime configuration options.
+            reconnect_on_error: Whether to auto-reconnect on transient failures.
+            **kwargs: Additional arguments forwarded to WebsocketSTTService.
+        """
+        if websocket_connect is None or State is None:
+            logger.error(
+                "In order to use ElevenLabsRealtimeSTTService, you need to "
+                "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
+            )
+            raise ModuleNotFoundError("Missing optional dependency: websockets")
+
+        super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(model)
+        self._model = model
+        self._params = params or ElevenLabsRealtimeSTTService.InputParams()
+        self._language_override = self._params.language
+        self._encoding = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._pending_final_message: Optional[Dict[str, Any]] = None
+        self._pending_final_task: Optional[asyncio.Task] = None
+        self._timestamp_merge_delay_s = 0.25
+        self._ttfb_started = False
+
+    @property
+    def commit_strategy(self) -> str:
+        """Return the configured commit strategy (manual or vad)."""
+        return (self._params.commit_strategy or "manual").lower()
+
+    def can_generate_metrics(self) -> bool:
+        """Realtime ElevenLabs service supports latency metrics."""
+        return True
+
+    async def start(self, frame: StartFrame):
+        """Start the realtime STT service and establish WebSocket connection."""
+        await super().start(frame)
+        self._encoding = self._determine_encoding(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the realtime STT service and close WebSocket connection."""
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the realtime STT service and close WebSocket connection."""
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def set_language(self, language: Language):
+        """Update preferred transcription language (requires reconnect)."""
+        self._language_override = language
+        self._params.language = language
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model and reconnect the WebSocket."""
+        await super().set_model(model)
+        self._model = model
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process frames and handle VAD events for commit strategy."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            if frame.emulated:
+                return
+            self._ttfb_started = False
+            await self.start_processing_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            if frame.emulated:
+                return
+            if self.commit_strategy == "manual":
+                await self._send_commit()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Stream audio chunks over the ElevenLabs realtime WebSocket."""
+        if not audio:
+            yield None
+            return
+
+        await self._ensure_connection()
+        await self._send_audio_chunk(audio)
+        yield None
+
+    async def _ensure_connection(self):
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+    async def _connect(self):
+        await self._connect_websocket()
+        if self._websocket and not self._receive_task:
+            self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._clear_pending_final()
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            ws_url = self._build_websocket_url()
+            headers = {"xi-api-key": self._api_key}
+            self.logger.debug(f"Connecting to ElevenLabs realtime STT at {ws_url}")
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+        except Exception as e:
+            self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
+            self._websocket = None
+            await self._call_event_handler("on_connection_error", f"{e}")
+
+    async def _disconnect_websocket(self):
+        try:
+            await self.stop_all_metrics()
+            if self._websocket and self._websocket.state is State.OPEN:
+                self.logger.debug("Disconnecting from ElevenLabs realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    async def _receive_messages(self):
+        async for message in self._get_websocket():
+            await self._process_event(message)
+
+    def _get_websocket(self):
+        if not self._websocket:
+            raise RuntimeError("ElevenLabs realtime websocket not connected")
+        return self._websocket
+
+    async def _process_event(self, message: Any):
+        try:
+            data = json.loads(message)
+        except json.JSONDecodeError:
+            self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
+            return
+
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            self.logger.debug("ElevenLabs realtime session started")
+            return
+
+        if message_type == "partial_transcript":
+            await self._emit_partial_transcript(data)
+        elif message_type == "committed_transcript":
+            await self._handle_committed_transcript(data)
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._handle_committed_transcript_with_timestamps(data)
+        elif message_type in {
+            "auth_error",
+            "quota_exceeded",
+            "transcriber_error",
+            "input_error",
+            "error",
+        }:
+            fatal = message_type in {"auth_error", "quota_exceeded", "error"}
+            description = data.get("error", data)
+            await self.push_error(
+                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
+            )
+        else:
+            self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+
+    async def _emit_partial_transcript(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _handle_committed_transcript(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            await self._emit_transcription(self._pending_final_message)
+            self._pending_final_message = None
+
+        self._pending_final_message = data
+        await self._schedule_pending_final_emit()
+
+    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            merged = {**self._pending_final_message, **data}
+            await self._emit_transcription(merged)
+            await self._clear_pending_final()
+        else:
+            await self._emit_transcription(data)
+
+    async def _schedule_pending_final_emit(self):
+        await self._clear_pending_final(timer_only=True)
+        self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
+
+    async def _emit_pending_after_delay(self):
+        try:
+            await asyncio.sleep(self._timestamp_merge_delay_s)
+            if self._pending_final_message:
+                await self._emit_transcription(self._pending_final_message)
+                self._pending_final_message = None
+        except asyncio.CancelledError:
+            pass
+        finally:
+            self._pending_final_task = None
+
+    async def _clear_pending_final(self, timer_only: bool = False):
+        if self._pending_final_task:
+            await self.cancel_task(self._pending_final_task)
+            self._pending_final_task = None
+
+        if not timer_only:
+            self._pending_final_message = None
+
+    async def _emit_transcription(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        frame = TranscriptionFrame(
+            text,
+            self._user_id,
+            time_now_iso8601(),
+            language,
+            result=data,
+        )
+
+        await self.push_frame(frame)
+        await self._handle_transcription(text, True, language)
+        await self.stop_processing_metrics()
+
+    async def _send_audio_chunk(self, audio: bytes):
+        if not audio or not self._websocket:
+            return
+
+        if not self._ttfb_started:
+            await self.start_ttfb_metrics()
+            self._ttfb_started = True
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio).decode("ascii"),
+            "commit": False,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    async def _send_commit(self):
+        if not self._websocket:
+            return
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": "",
+            "commit": True,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    def _build_websocket_url(self) -> str:
+        if not self.sample_rate:
+            raise ValueError(
+                "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+            )
+
+        params = {
+            "model_id": self._model,
+            "encoding": self._encoding or "pcm_16000",
+            "sample_rate": str(self.sample_rate),
+            "commit_strategy": self.commit_strategy,
+        }
+
+        language_code = (
+            language_to_elevenlabs_language(self._language_override)
+            if self._language_override
+            else None
+        )
+        if language_code:
+            params["language_code"] = language_code
+
+        if self._params.vad_silence_threshold_secs is not None:
+            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
+        if self._params.vad_threshold is not None:
+            params["vad_threshold"] = str(self._params.vad_threshold)
+        if self._params.min_speech_duration_ms is not None:
+            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
+        if self._params.min_silence_duration_ms is not None:
+            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
+
+        return f"{self._url}?{urllib.parse.urlencode(params)}"
+
+    def _determine_encoding(self, sample_rate: int) -> str:
+        if not sample_rate:
+            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
+
+        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
+        if sample_rate not in supported_rates:
+            raise ValueError(
+                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
+                f"Received {sample_rate} Hz."
+            )
+        return f"pcm_{sample_rate}"
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        # Metrics are stopped by the caller when needed.
+        return
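
A hedged usage sketch for the new service; `transport` is assumed to be created elsewhere (e.g., a Daily or WebSocket transport), and with commit_strategy="manual" the commit is driven by UserStoppedSpeakingFrame from the pipeline's VAD:

import os

from pipecat.pipeline.pipeline import Pipeline
from pipecat.services.elevenlabs.stt import ElevenLabsRealtimeSTTService
from pipecat.transcriptions.language import Language

stt = ElevenLabsRealtimeSTTService(
    api_key=os.getenv("ELEVENLABS_API_KEY", ""),
    params=ElevenLabsRealtimeSTTService.InputParams(
        language=Language.EN,
        commit_strategy="manual",
    ),
)

pipeline = Pipeline([transport.input(), stt])  # downstream processors omitted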
pipecat/services/openai/base_llm.py CHANGED
@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
+    WarmupLLMFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -438,14 +439,19 @@ class BaseOpenAILLMService(LLMService):
         completions and manage settings.
         >>>>>>> dv-stage
 
-         Args:
+        Args:
             frame: The frame to process.
             direction: The direction of frame processing.
         """
         await super().process_frame(frame, direction)
 
         context = None
-        if isinstance(frame, OpenAILLMContextFrame):
+        if isinstance(frame, WarmupLLMFrame):
+            # Handle warmup frame - prime cache without emitting response
+            # Run in background to avoid blocking the pipeline
+            asyncio.create_task(self._handle_warmup_frame(frame))
+            return  # Don't process further, warmup is silent
+        elif isinstance(frame, OpenAILLMContextFrame):
             # Handle OpenAI-specific context frames
             context = frame.context
         elif isinstance(frame, LLMContextFrame):
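
One implementation note on the dispatch above, hedged rather than definitive: asyncio.create_task() stores only a weak reference to the task on the event loop, so a warmup task with no strong reference can in principle be garbage-collected before it finishes. A common hardening step (not part of this diff) is to keep the task on the instance:

# Hypothetical variant that retains a strong reference to the in-flight task.
self._warmup_task = asyncio.create_task(self._handle_warmup_frame(frame))
self._warmup_task.add_done_callback(lambda _t: setattr(self, "_warmup_task", None))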
@@ -470,3 +476,32 @@
         finally:
             await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
+
+    async def _handle_warmup_frame(self, frame: WarmupLLMFrame):
+        """Handle WarmupLLMFrame to prime the LLM cache without emitting responses.
+
+        This method sends a minimal request to the LLM to warm up any provider-side
+        caches (like prompt caching). The response is discarded and no frames are emitted.
+
+        Args:
+            frame: WarmupLLMFrame containing the messages to cache.
+        """
+        try:
+            # Use the provided messages for warmup
+            messages: List[ChatCompletionMessageParam] = frame.messages  # type: ignore
+
+            # Make a non-streaming call to warm the cache
+            # We use a minimal max_tokens to reduce latency and cost
+            await self._client.chat.completions.create(
+                model=self.model_name,  # Use the property, not self._model
+                messages=messages,
+                max_tokens=10,  # Minimal response
+                stream=False,
+            )
+
+            self.logger.info("LLM cache warmed successfully")
+            # Intentionally don't emit any frames - this is a silent warmup
+
+        except Exception as e:
+            self.logger.error(f"Failed to warm LLM cache: {e}")
+            # Don't propagate error - warmup failure shouldn't break the bot
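
An end-to-end sketch of the intended flow, hedged: queue the frame once at startup so the first real turn hits a warm provider cache. `task` is a PipelineTask created elsewhere, and SYSTEM_PROMPT is an assumed application constant matching the real conversation's system prompt:

from pipecat.frames.frames import WarmupLLMFrame

await task.queue_frame(
    WarmupLLMFrame(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "ping"},  # minimal turn to complete the request
        ]
    )
)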