dv-pipecat-ai 0.0.85.dev844__py3-none-any.whl → 0.0.85.dev848__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

dv_pipecat_ai-0.0.85.dev848.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.85.dev844
+Version: 0.0.85.dev848
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
dv_pipecat_ai-0.0.85.dev848.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev844.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev848.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -79,7 +79,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=JxmU2752iWP_1_GmzZReNESUTFAeyEa4XBPL20_C208,30004
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=vuYtmyK1QSU2AWx2c_pFQhcmpXqSTnfqAXF6DXKzTG8,49605
+pipecat/frames/frames.py,sha256=248d54lNOyO04dq9ni51yUTWUItmGw8b9QKarrDGNeo,50354
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -217,8 +217,8 @@ pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=F3xD82eOIy5OyyE-5StdoFFvKjIXlos2yyP0cyNQj6Y,12214
-pipecat/services/elevenlabs/tts.py,sha256=Okctydqoz2HG9B69l-bDunNHcqoul-kUxhEZjYt539U,45188
+pipecat/services/elevenlabs/stt.py,sha256=_RhBKpUYEGKMpcO7y4RLxmEOMK11LZFdZqDFIA-DZXk,27303
+pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
 pipecat/services/fal/stt.py,sha256=-5tw7N8srBJTS0Q65SN4csmLkIB6cLHR9pXKimxg55o,9678
@@ -280,7 +280,7 @@ pipecat/services/nim/llm.py,sha256=o4WPGI6kOmSiMV7WwOZ0cNEAoq9hW4Aqs2R8X7c9i94,4
 pipecat/services/ollama/__init__.py,sha256=aw-25zYsR8LR74OFFlMKMTnJjaKwOzdPWVsClueNRkI,255
 pipecat/services/ollama/llm.py,sha256=rfpG92LRHGJlpENKhF6ld8CLVS9DxlKW-WRVNldOIGs,1605
 pipecat/services/openai/__init__.py,sha256=V0ZVa8PzEm3hmcStYICbAsYwfgk4ytZ6kiQoq9UZPmI,354
-pipecat/services/openai/base_llm.py,sha256=J4Ltg1KOXciiUIMBFLn0SmDTZereEE-1LKrPfBsLzFw,19127
+pipecat/services/openai/base_llm.py,sha256=jOiWacimREywCMZZwAwH8RAHCbwnnXvbqAjWQUYA0yM,20727
 pipecat/services/openai/image.py,sha256=3e3h-dVQ6DQuQE7fp8akXwRMd-oYOdGuZg7RCOjHu9A,2994
 pipecat/services/openai/llm.py,sha256=_aKtz1VebSFUUenT3tH6mBW9pSCm65_u45cDu_dkTzs,7396
 pipecat/services/openai/stt.py,sha256=Idf0k73kxFyDgNRBt62MFpoKKNsBV9bwvJteJ6MGWzQ,2419
@@ -329,7 +329,7 @@ pipecat/services/sarvam/tts.py,sha256=lrwfdC53kZ7f2QPgNRxzryISNkrJCvNtlZ-19-iXg9
 pipecat/services/simli/__init__.py,sha256=cbDcqOaGsEgKbGYKpJ1Vv7LN4ZjOWA04sE84WW5vgQI,257
 pipecat/services/simli/video.py,sha256=Zu2XLvl2Y6VHaWzT9wEdzW9d0EYoZyzYLxjQFyV8vho,8320
 pipecat/services/soniox/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-pipecat/services/soniox/stt.py,sha256=Ndml6QvPQ1WZBvdGT3LSg-LLWwrZ8KlqW8wBBFsQrrM,16509
+pipecat/services/soniox/stt.py,sha256=zRp5qWU051hEAikt0vB0rbHrkQkH5sT-IOe-o5vCurQ,16650
 pipecat/services/speechmatics/__init__.py,sha256=Jgq1fqrZVkpWC21D79L1cn5Ub8PnYgnnCaqC5pOlbIc,89
 pipecat/services/speechmatics/stt.py,sha256=GLGJzlMSeZ1WzTOMjhKXDl5JYkqGhnFTbP3o0ez0hSw,44696
 pipecat/services/tavus/__init__.py,sha256=SNyyi2Xq6tXIihDG2Bwvmg6Srbd-uWd1RwG-NKWcPuI,257
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev844.dist-info/METADATA,sha256=JgW9PLS_gplsOlHfyohgocRxrsiivvsAEySMY214f4U,32955
-dv_pipecat_ai-0.0.85.dev844.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev844.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev844.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev848.dist-info/METADATA,sha256=T2IPoO2Nmt09lIxM0rKmJRa5ZIBQ-9fcbswOy90lkJg,32955
+dv_pipecat_ai-0.0.85.dev848.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev848.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev848.dist-info/RECORD,,
pipecat/frames/frames.py CHANGED
@@ -586,6 +586,27 @@ class LLMRunFrame(DataFrame):
     pass


+@dataclass
+class WarmupLLMFrame(DataFrame):
+    """Frame to trigger prompt caching/warmup in supported LLM providers.
+
+    This frame instructs the LLM service to cache the provided messages
+    without generating a visible response. Primarily used for warming up provider
+    caches (e.g., Claude's prompt caching, OpenAI's prompt caching) to improve
+    latency for subsequent requests.
+
+    The LLM service should:
+    1. Send the messages to the provider to trigger caching
+    2. Generate a minimal response (e.g., single word)
+    3. Discard the response without emitting LLM output frames
+
+    Parameters:
+        messages: List of messages to send for cache warming (should match conversation structure).
+    """
+
+    messages: List[dict]
+
+
 @dataclass
 class LLMMessagesAppendFrame(DataFrame):
     """Frame containing LLM messages to append to current context.
pipecat/services/elevenlabs/stt.py CHANGED
@@ -4,26 +4,43 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #

-"""ElevenLabs speech-to-text service implementation.
-
-This module provides integration with ElevenLabs' Speech-to-Text API for transcription
-using segmented audio processing. The service uploads audio files and receives
-transcription results directly.
-"""
+"""ElevenLabs speech-to-text service implementations."""

+import asyncio
+import base64
 import io
-from typing import AsyncGenerator, Optional
+import json
+import urllib.parse
+from typing import Any, AsyncGenerator, Dict, Literal, Optional

 import aiohttp
 from loguru import logger
 from pydantic import BaseModel

-from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
-from pipecat.services.stt_service import SegmentedSTTService
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt

+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError:
+    websocket_connect = None  # type: ignore[assignment]
+    State = None  # type: ignore[assignment]
+

 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -150,6 +167,19 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result


+def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
+    """Convert an ElevenLabs language code back to a Language enum value."""
+    if not language_code:
+        return None
+
+    normalized = language_code.lower()
+    for language in Language:
+        code = language_to_elevenlabs_language(language)
+        if code and code.lower() == normalized:
+            return language
+    return None
+
+
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.

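Note: the helper simply inverts the existing forward mapping by scanning the Language enum, so lookups are case-insensitive. An illustrative check (assuming the forward mapping sends Language.EN to "en"):

    from pipecat.transcriptions.language import Language

    assert language_to_elevenlabs_language(Language.EN) == "en"  # assumed forward mapping
    assert elevenlabs_language_code_to_language("EN") == Language.EN
    assert elevenlabs_language_code_to_language(None) is None
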
@@ -337,3 +367,376 @@ class ElevenLabsSTTService(SegmentedSTTService):
         except Exception as e:
             self.logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+
+    class InputParams(BaseModel):
+        """Realtime connection parameters derived from ElevenLabs documentation."""
+
+        language: Optional[Language] = None
+        commit_strategy: Literal["manual", "vad"] = "manual"
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        sample_rate: Optional[int] = None,
+        model: str = "scribe_v2_realtime",
+        url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+        params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
+        reconnect_on_error: bool = True,
+        **kwargs,
+    ):
+        """Initialize the realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
+            model: Scribe realtime model identifier.
+            url: WebSocket endpoint for realtime transcription.
+            params: Optional realtime configuration options.
+            reconnect_on_error: Whether to auto-reconnect on transient failures.
+            **kwargs: Additional arguments forwarded to WebsocketSTTService.
+        """
+        if websocket_connect is None or State is None:
+            logger.error(
+                "In order to use ElevenLabsRealtimeSTTService, you need to "
+                "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
+            )
+            raise ModuleNotFoundError("Missing optional dependency: websockets")
+
+        super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(model)
+        self._model = model
+        self._params = params or ElevenLabsRealtimeSTTService.InputParams()
+        self._language_override = self._params.language
+        self._encoding = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._pending_final_message: Optional[Dict[str, Any]] = None
+        self._pending_final_task: Optional[asyncio.Task] = None
+        self._timestamp_merge_delay_s = 0.25
+        self._ttfb_started = False
+
+    @property
+    def commit_strategy(self) -> str:
+        """Return the configured commit strategy (manual or vad)."""
+        return (self._params.commit_strategy or "manual").lower()
+
+    def can_generate_metrics(self) -> bool:
+        """Realtime ElevenLabs service supports latency metrics."""
+        return True
+
+    async def start(self, frame: StartFrame):
+        """Start the realtime STT service and establish WebSocket connection."""
+        await super().start(frame)
+        self._encoding = self._determine_encoding(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the realtime STT service and close WebSocket connection."""
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the realtime STT service and close WebSocket connection."""
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def set_language(self, language: Language):
+        """Update preferred transcription language (requires reconnect)."""
+        self._language_override = language
+        self._params.language = language
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model and reconnect the WebSocket."""
+        await super().set_model(model)
+        self._model = model
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process frames and handle VAD events for commit strategy."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            if frame.emulated:
+                return
+            self._ttfb_started = False
+            await self.start_processing_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            if frame.emulated:
+                return
+            if self.commit_strategy == "manual":
+                await self._send_commit()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Stream audio chunks over the ElevenLabs realtime WebSocket."""
+        if not audio:
+            yield None
+            return
+
+        await self._ensure_connection()
+        await self._send_audio_chunk(audio)
+        yield None
+
+    async def _ensure_connection(self):
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+    async def _connect(self):
+        await self._connect_websocket()
+        if self._websocket and not self._receive_task:
+            self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._clear_pending_final()
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            ws_url = self._build_websocket_url()
+            headers = {"xi-api-key": self._api_key}
+            self.logger.debug(f"Connecting to ElevenLabs realtime STT at {ws_url}")
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+        except Exception as e:
+            self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
+            self._websocket = None
+            await self._call_event_handler("on_connection_error", f"{e}")
+
+    async def _disconnect_websocket(self):
+        try:
+            await self.stop_all_metrics()
+            if self._websocket and self._websocket.state is State.OPEN:
+                self.logger.debug("Disconnecting from ElevenLabs realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    async def _receive_messages(self):
+        async for message in self._get_websocket():
+            await self._process_event(message)
+
+    def _get_websocket(self):
+        if not self._websocket:
+            raise RuntimeError("ElevenLabs realtime websocket not connected")
+        return self._websocket
+
+    async def _process_event(self, message: Any):
+        try:
+            data = json.loads(message)
+        except json.JSONDecodeError:
+            self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
+            return
+
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            self.logger.debug("ElevenLabs realtime session started")
+            return
+
+        if message_type == "partial_transcript":
+            await self._emit_partial_transcript(data)
+        elif message_type == "committed_transcript":
+            await self._handle_committed_transcript(data)
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._handle_committed_transcript_with_timestamps(data)
+        elif message_type in {
+            "auth_error",
+            "quota_exceeded",
+            "transcriber_error",
+            "input_error",
+            "error",
+        }:
+            fatal = message_type in {"auth_error", "quota_exceeded", "error"}
+            description = data.get("error", data)
+            await self.push_error(
+                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
+            )
+        else:
+            self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+
+    async def _emit_partial_transcript(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _handle_committed_transcript(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            await self._emit_transcription(self._pending_final_message)
+            self._pending_final_message = None
+
+        self._pending_final_message = data
+        await self._schedule_pending_final_emit()
+
+    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            merged = {**self._pending_final_message, **data}
+            await self._emit_transcription(merged)
+            await self._clear_pending_final()
+        else:
+            await self._emit_transcription(data)
+
+    async def _schedule_pending_final_emit(self):
+        await self._clear_pending_final(timer_only=True)
+        self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
+
+    async def _emit_pending_after_delay(self):
+        try:
+            await asyncio.sleep(self._timestamp_merge_delay_s)
+            if self._pending_final_message:
+                await self._emit_transcription(self._pending_final_message)
+                self._pending_final_message = None
+        except asyncio.CancelledError:
+            pass
+        finally:
+            self._pending_final_task = None
+
+    async def _clear_pending_final(self, timer_only: bool = False):
+        if self._pending_final_task:
+            await self.cancel_task(self._pending_final_task)
+            self._pending_final_task = None
+
+        if not timer_only:
+            self._pending_final_message = None
+
+    async def _emit_transcription(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        frame = TranscriptionFrame(
+            text,
+            self._user_id,
+            time_now_iso8601(),
+            language,
+            result=data,
+        )
+
+        await self.push_frame(frame)
+        await self._handle_transcription(text, True, language)
+        await self.stop_processing_metrics()
+
+    async def _send_audio_chunk(self, audio: bytes):
+        if not audio or not self._websocket:
+            return
+
+        if not self._ttfb_started:
+            await self.start_ttfb_metrics()
+            self._ttfb_started = True
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio).decode("ascii"),
+            "commit": False,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    async def _send_commit(self):
+        if not self._websocket:
+            return
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": "",
+            "commit": True,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    def _build_websocket_url(self) -> str:
+        if not self.sample_rate:
+            raise ValueError(
+                "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+            )
+
+        params = {
+            "model_id": self._model,
+            "encoding": self._encoding or "pcm_16000",
+            "sample_rate": str(self.sample_rate),
+            "commit_strategy": self.commit_strategy,
+        }
+
+        language_code = (
+            language_to_elevenlabs_language(self._language_override)
+            if self._language_override
+            else None
+        )
+        if language_code:
+            params["language_code"] = language_code
+
+        if self._params.vad_silence_threshold_secs is not None:
+            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
+        if self._params.vad_threshold is not None:
+            params["vad_threshold"] = str(self._params.vad_threshold)
+        if self._params.min_speech_duration_ms is not None:
+            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
+        if self._params.min_silence_duration_ms is not None:
+            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
+
+        return f"{self._url}?{urllib.parse.urlencode(params)}"
+
+    def _determine_encoding(self, sample_rate: int) -> str:
+        if not sample_rate:
+            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
+
+        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
+        if sample_rate not in supported_rates:
+            raise ValueError(
+                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
+                f"Received {sample_rate} Hz."
+            )
+        return f"pcm_{sample_rate}"
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        # Metrics are stopped by the caller when needed.
+        return
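
Note: a minimal construction sketch for the new realtime service (illustrative values, assuming the elevenlabs extra is installed and the pipeline provides VAD events):

    import os

    from pipecat.services.elevenlabs.stt import ElevenLabsRealtimeSTTService
    from pipecat.transcriptions.language import Language

    stt = ElevenLabsRealtimeSTTService(
        api_key=os.environ["ELEVENLABS_API_KEY"],
        params=ElevenLabsRealtimeSTTService.InputParams(
            language=Language.EN,
            # "manual" commits on UserStoppedSpeakingFrame from the pipeline VAD;
            # "vad" lets ElevenLabs segment server-side via the vad_* thresholds.
            commit_strategy="manual",
        ),
    )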
pipecat/services/elevenlabs/tts.py CHANGED
@@ -14,7 +14,17 @@ import asyncio
 import base64
 import json
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Literal, Mapping, Optional, Tuple, Union
+from typing import (
+    Any,
+    AsyncGenerator,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+)

 import aiohttp
 from loguru import logger
@@ -157,7 +167,13 @@ def build_elevenlabs_voice_settings(
     Returns:
         Dictionary of voice settings or None if no valid settings are provided.
     """
-    voice_setting_keys = ["stability", "similarity_boost", "style", "use_speaker_boost", "speed"]
+    voice_setting_keys = [
+        "stability",
+        "similarity_boost",
+        "style",
+        "use_speaker_boost",
+        "speed",
+    ]

     voice_settings = {}
     for key in voice_setting_keys:
@@ -530,7 +546,9 @@ class ElevenLabsTTSService(AudioContextWordTTSService):

         # Set max websocket message size to 16MB for large audio responses
         self._websocket = await websocket_connect(
-            url, max_size=16 * 1024 * 1024, additional_headers={"xi-api-key": self._api_key}
+            url,
+            max_size=16 * 1024 * 1024,
+            additional_headers={"xi-api-key": self._api_key},
         )

         await self._call_event_handler("on_connected")
@@ -549,7 +567,7 @@ class ElevenLabsTTSService(AudioContextWordTTSService):
             if self._context_id:
                 await self._websocket.send(json.dumps({"close_socket": True}))
             await self._websocket.close()
-            logger.debug("Disconnected from ElevenLabs")
+            self.logger.debug("Disconnected from ElevenLabs")
         except Exception as e:
             self.logger.error(f"{self} error closing websocket: {e}")
         finally:
pipecat/services/openai/base_llm.py CHANGED
@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
+    WarmupLLMFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -438,14 +439,19 @@ class BaseOpenAILLMService(LLMService):
         completions and manage settings.
         >>>>>>> dv-stage

-       Args:
+        Args:
             frame: The frame to process.
             direction: The direction of frame processing.
         """
         await super().process_frame(frame, direction)

         context = None
-        if isinstance(frame, OpenAILLMContextFrame):
+        if isinstance(frame, WarmupLLMFrame):
+            # Handle warmup frame - prime cache without emitting response
+            # Run in background to avoid blocking the pipeline
+            asyncio.create_task(self._handle_warmup_frame(frame))
+            return  # Don't process further, warmup is silent
+        elif isinstance(frame, OpenAILLMContextFrame):
             # Handle OpenAI-specific context frames
             context = frame.context
         elif isinstance(frame, LLMContextFrame):
@@ -470,3 +476,32 @@ class BaseOpenAILLMService(LLMService):
         finally:
             await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
+
+    async def _handle_warmup_frame(self, frame: WarmupLLMFrame):
+        """Handle WarmupLLMFrame to prime the LLM cache without emitting responses.
+
+        This method sends a minimal request to the LLM to warm up any provider-side
+        caches (like prompt caching). The response is discarded and no frames are emitted.
+
+        Args:
+            frame: WarmupLLMFrame containing the messages to cache.
+        """
+        try:
+            # Use the provided messages for warmup
+            messages: List[ChatCompletionMessageParam] = frame.messages  # type: ignore
+
+            # Make a non-streaming call to warm the cache
+            # We use a minimal max_tokens to reduce latency and cost
+            await self._client.chat.completions.create(
+                model=self.model_name,  # Use the property, not self._model
+                messages=messages,
+                max_tokens=10,  # Minimal response
+                stream=False,
+            )
+
+            self.logger.info("LLM cache warmed successfully")
+            # Intentionally don't emit any frames - this is a silent warmup
+
+        except Exception as e:
+            self.logger.error(f"Failed to warm LLM cache: {e}")
+            # Don't propagate error - warmup failure shouldn't break the bot
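
Note: process_frame fires this handler with asyncio.create_task and returns immediately, so warmup never blocks real turns; failures are logged and deliberately not propagated. The request itself is conceptually just a tiny non-streaming completion, roughly equivalent to this standalone sketch (model name illustrative, reusing warmup_messages from the earlier sketch, client assumed to read OPENAI_API_KEY from the environment):

    from openai import AsyncOpenAI

    client = AsyncOpenAI()

    # The provider caches the shared prompt prefix; the short reply is discarded.
    await client.chat.completions.create(
        model="gpt-4o",  # illustrative
        messages=warmup_messages,
        max_tokens=10,
        stream=False,
    )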
pipecat/services/soniox/stt.py CHANGED
@@ -241,6 +241,7 @@ class SonioxSTTService(STTService):
             if self._receive_task != asyncio.current_task():
                 await self._receive_task
             self._receive_task = None
+        self.logger.debug("Disconnected from Soniox STT")

     async def stop(self, frame: EndFrame):
         """Stop the Soniox STT websocket connection.
@@ -382,7 +383,10 @@ class SonioxSTTService(STTService):

         if self._final_transcription_buffer or non_final_transcription:
             final_text = "".join(
-                map(lambda token: token["text"], self._final_transcription_buffer)
+                map(
+                    lambda token: token["text"],
+                    self._final_transcription_buffer,
+                )
             )
             non_final_text = "".join(
                 map(lambda token: token["text"], non_final_transcription)