dv-pipecat-ai 0.0.85.dev847__py3-none-any.whl → 0.0.85.dev848__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/METADATA +1 -1
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/RECORD +8 -8
- pipecat/frames/frames.py +21 -0
- pipecat/services/elevenlabs/stt.py +412 -9
- pipecat/services/openai/base_llm.py +37 -2
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/top_level.txt +0 -0
{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/RECORD
CHANGED

@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.
+dv_pipecat_ai-0.0.85.dev848.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -79,7 +79,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=JxmU2752iWP_1_GmzZReNESUTFAeyEa4XBPL20_C208,30004
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=
+pipecat/frames/frames.py,sha256=248d54lNOyO04dq9ni51yUTWUItmGw8b9QKarrDGNeo,50354
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -217,7 +217,7 @@ pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=
+pipecat/services/elevenlabs/stt.py,sha256=_RhBKpUYEGKMpcO7y4RLxmEOMK11LZFdZqDFIA-DZXk,27303
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -280,7 +280,7 @@ pipecat/services/nim/llm.py,sha256=o4WPGI6kOmSiMV7WwOZ0cNEAoq9hW4Aqs2R8X7c9i94,4
 pipecat/services/ollama/__init__.py,sha256=aw-25zYsR8LR74OFFlMKMTnJjaKwOzdPWVsClueNRkI,255
 pipecat/services/ollama/llm.py,sha256=rfpG92LRHGJlpENKhF6ld8CLVS9DxlKW-WRVNldOIGs,1605
 pipecat/services/openai/__init__.py,sha256=V0ZVa8PzEm3hmcStYICbAsYwfgk4ytZ6kiQoq9UZPmI,354
-pipecat/services/openai/base_llm.py,sha256=
+pipecat/services/openai/base_llm.py,sha256=jOiWacimREywCMZZwAwH8RAHCbwnnXvbqAjWQUYA0yM,20727
 pipecat/services/openai/image.py,sha256=3e3h-dVQ6DQuQE7fp8akXwRMd-oYOdGuZg7RCOjHu9A,2994
 pipecat/services/openai/llm.py,sha256=_aKtz1VebSFUUenT3tH6mBW9pSCm65_u45cDu_dkTzs,7396
 pipecat/services/openai/stt.py,sha256=Idf0k73kxFyDgNRBt62MFpoKKNsBV9bwvJteJ6MGWzQ,2419
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
+dv_pipecat_ai-0.0.85.dev848.dist-info/METADATA,sha256=T2IPoO2Nmt09lIxM0rKmJRa5ZIBQ-9fcbswOy90lkJg,32955
+dv_pipecat_ai-0.0.85.dev848.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev848.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev848.dist-info/RECORD,,
pipecat/frames/frames.py
CHANGED
@@ -586,6 +586,27 @@ class LLMRunFrame(DataFrame):
     pass
 
 
+@dataclass
+class WarmupLLMFrame(DataFrame):
+    """Frame to trigger prompt caching/warmup in supported LLM providers.
+
+    This frame instructs the LLM service to cache the provided messages
+    without generating a visible response. Primarily used for warming up provider
+    caches (e.g., Claude's prompt caching, OpenAI's prompt caching) to improve
+    latency for subsequent requests.
+
+    The LLM service should:
+    1. Send the messages to the provider to trigger caching
+    2. Generate a minimal response (e.g., single word)
+    3. Discard the response without emitting LLM output frames
+
+    Parameters:
+        messages: List of messages to send for cache warming (should match conversation structure).
+    """
+
+    messages: List[dict]
+
+
 @dataclass
 class LLMMessagesAppendFrame(DataFrame):
     """Frame containing LLM messages to append to current context.
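The docstring above implies a straightforward usage pattern: queue the warmup frame before the first user turn, built from the same prefix the live conversation will use. A minimal sketch, assuming a running Pipecat PipelineTask named `task` and an app-level SYSTEM_PROMPT constant (both hypothetical; only WarmupLLMFrame comes from this diff):

from pipecat.frames.frames import WarmupLLMFrame

# Mirror the live conversation prefix so the provider caches the same prompt bytes.
warmup_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},  # hypothetical app-level constant
    {"role": "user", "content": "Hi"},  # tiny user turn so the request can complete
]
await task.queue_frames([WarmupLLMFrame(messages=warmup_messages)])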
pipecat/services/elevenlabs/stt.py
CHANGED

@@ -4,26 +4,43 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service
-
-This module provides integration with ElevenLabs' Speech-to-Text API for transcription
-using segmented audio processing. The service uploads audio files and receives
-transcription results directly.
-"""
+"""ElevenLabs speech-to-text service implementations."""
 
+import asyncio
+import base64
 import io
-
+import json
+import urllib.parse
+from typing import Any, AsyncGenerator, Dict, Literal, Optional
 
 import aiohttp
 from loguru import logger
 from pydantic import BaseModel
 
-from pipecat.frames.frames import
-
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError:
+    websocket_connect = None  # type: ignore[assignment]
+    State = None  # type: ignore[assignment]
+
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -150,6 +167,19 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result
 
 
+def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
+    """Convert an ElevenLabs language code back to a Language enum value."""
+    if not language_code:
+        return None
+
+    normalized = language_code.lower()
+    for language in Language:
+        code = language_to_elevenlabs_language(language)
+        if code and code.lower() == normalized:
+            return language
+    return None
+
+
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
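The reverse lookup above inverts language_to_elevenlabs_language with a linear scan over the Language enum, which is fine at per-message frequency (it could be precomputed into a dict if it ever mattered). A round-trip sketch, assuming Language.FR maps to a code such as "fr":

from pipecat.transcriptions.language import Language

code = language_to_elevenlabs_language(Language.FR)  # e.g. "fr" (assumed)
lang = elevenlabs_language_code_to_language(code)
# The scan returns the first enum member with a matching code, so the round
# trip is code-stable even if several Language values share one code.
assert language_to_elevenlabs_language(lang).lower() == code.lower()
assert elevenlabs_language_code_to_language(None) is None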
@@ -337,3 +367,376 @@ class ElevenLabsSTTService(SegmentedSTTService):
         except Exception as e:
             self.logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+
+    class InputParams(BaseModel):
+        """Realtime connection parameters derived from ElevenLabs documentation."""
+
+        language: Optional[Language] = None
+        commit_strategy: Literal["manual", "vad"] = "manual"
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        sample_rate: Optional[int] = None,
+        model: str = "scribe_v2_realtime",
+        url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+        params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
+        reconnect_on_error: bool = True,
+        **kwargs,
+    ):
+        """Initialize the realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
+            model: Scribe realtime model identifier.
+            url: WebSocket endpoint for realtime transcription.
+            params: Optional realtime configuration options.
+            reconnect_on_error: Whether to auto-reconnect on transient failures.
+            **kwargs: Additional arguments forwarded to WebsocketSTTService.
+        """
+        if websocket_connect is None or State is None:
+            logger.error(
+                "In order to use ElevenLabsRealtimeSTTService, you need to "
+                "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
+            )
+            raise ModuleNotFoundError("Missing optional dependency: websockets")
+
+        super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(model)
+        self._model = model
+        self._params = params or ElevenLabsRealtimeSTTService.InputParams()
+        self._language_override = self._params.language
+        self._encoding = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._pending_final_message: Optional[Dict[str, Any]] = None
+        self._pending_final_task: Optional[asyncio.Task] = None
+        self._timestamp_merge_delay_s = 0.25
+        self._ttfb_started = False
+
+    @property
+    def commit_strategy(self) -> str:
+        """Return the configured commit strategy (manual or vad)."""
+        return (self._params.commit_strategy or "manual").lower()
+
+    def can_generate_metrics(self) -> bool:
+        """Realtime ElevenLabs service supports latency metrics."""
+        return True
+
+    async def start(self, frame: StartFrame):
+        """Start the realtime STT service and establish WebSocket connection."""
+        await super().start(frame)
+        self._encoding = self._determine_encoding(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the realtime STT service and close WebSocket connection."""
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the realtime STT service and close WebSocket connection."""
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def set_language(self, language: Language):
+        """Update preferred transcription language (requires reconnect)."""
+        self._language_override = language
+        self._params.language = language
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model and reconnect the WebSocket."""
+        await super().set_model(model)
+        self._model = model
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process frames and handle VAD events for commit strategy."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            if frame.emulated:
+                return
+            self._ttfb_started = False
+            await self.start_processing_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            if frame.emulated:
+                return
+            if self.commit_strategy == "manual":
+                await self._send_commit()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Stream audio chunks over the ElevenLabs realtime WebSocket."""
+        if not audio:
+            yield None
+            return
+
+        await self._ensure_connection()
+        await self._send_audio_chunk(audio)
+        yield None
+
+    async def _ensure_connection(self):
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+    async def _connect(self):
+        await self._connect_websocket()
+        if self._websocket and not self._receive_task:
+            self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._clear_pending_final()
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            ws_url = self._build_websocket_url()
+            headers = {"xi-api-key": self._api_key}
+            self.logger.debug(f"Connecting to ElevenLabs realtime STT at {ws_url}")
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+        except Exception as e:
+            self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
+            self._websocket = None
+            await self._call_event_handler("on_connection_error", f"{e}")
+
+    async def _disconnect_websocket(self):
+        try:
+            await self.stop_all_metrics()
+            if self._websocket and self._websocket.state is State.OPEN:
+                self.logger.debug("Disconnecting from ElevenLabs realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    async def _receive_messages(self):
+        async for message in self._get_websocket():
+            await self._process_event(message)
+
+    def _get_websocket(self):
+        if not self._websocket:
+            raise RuntimeError("ElevenLabs realtime websocket not connected")
+        return self._websocket
+
+    async def _process_event(self, message: Any):
+        try:
+            data = json.loads(message)
+        except json.JSONDecodeError:
+            self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
+            return
+
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            self.logger.debug("ElevenLabs realtime session started")
+            return
+
+        if message_type == "partial_transcript":
+            await self._emit_partial_transcript(data)
+        elif message_type == "committed_transcript":
+            await self._handle_committed_transcript(data)
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._handle_committed_transcript_with_timestamps(data)
+        elif message_type in {
+            "auth_error",
+            "quota_exceeded",
+            "transcriber_error",
+            "input_error",
+            "error",
+        }:
+            fatal = message_type in {"auth_error", "quota_exceeded", "error"}
+            description = data.get("error", data)
+            await self.push_error(
+                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
+            )
+        else:
+            self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+
+    async def _emit_partial_transcript(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _handle_committed_transcript(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            await self._emit_transcription(self._pending_final_message)
+            self._pending_final_message = None
+
+        self._pending_final_message = data
+        await self._schedule_pending_final_emit()
+
+    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            merged = {**self._pending_final_message, **data}
+            await self._emit_transcription(merged)
+            await self._clear_pending_final()
+        else:
+            await self._emit_transcription(data)
+
+    async def _schedule_pending_final_emit(self):
+        await self._clear_pending_final(timer_only=True)
+        self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
+
+    async def _emit_pending_after_delay(self):
+        try:
+            await asyncio.sleep(self._timestamp_merge_delay_s)
+            if self._pending_final_message:
+                await self._emit_transcription(self._pending_final_message)
+                self._pending_final_message = None
+        except asyncio.CancelledError:
+            pass
+        finally:
+            self._pending_final_task = None
+
+    async def _clear_pending_final(self, timer_only: bool = False):
+        if self._pending_final_task:
+            await self.cancel_task(self._pending_final_task)
+            self._pending_final_task = None
+
+        if not timer_only:
+            self._pending_final_message = None
+
+    async def _emit_transcription(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        frame = TranscriptionFrame(
+            text,
+            self._user_id,
+            time_now_iso8601(),
+            language,
+            result=data,
+        )
+
+        await self.push_frame(frame)
+        await self._handle_transcription(text, True, language)
+        await self.stop_processing_metrics()
+
+    async def _send_audio_chunk(self, audio: bytes):
+        if not audio or not self._websocket:
+            return
+
+        if not self._ttfb_started:
+            await self.start_ttfb_metrics()
+            self._ttfb_started = True
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio).decode("ascii"),
+            "commit": False,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    async def _send_commit(self):
+        if not self._websocket:
+            return
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": "",
+            "commit": True,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    def _build_websocket_url(self) -> str:
+        if not self.sample_rate:
+            raise ValueError(
+                "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+            )
+
+        params = {
+            "model_id": self._model,
+            "encoding": self._encoding or "pcm_16000",
+            "sample_rate": str(self.sample_rate),
+            "commit_strategy": self.commit_strategy,
+        }
+
+        language_code = (
+            language_to_elevenlabs_language(self._language_override)
+            if self._language_override
+            else None
+        )
+        if language_code:
+            params["language_code"] = language_code
+
+        if self._params.vad_silence_threshold_secs is not None:
+            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
+        if self._params.vad_threshold is not None:
+            params["vad_threshold"] = str(self._params.vad_threshold)
+        if self._params.min_speech_duration_ms is not None:
+            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
+        if self._params.min_silence_duration_ms is not None:
+            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
+
+        return f"{self._url}?{urllib.parse.urlencode(params)}"
+
+    def _determine_encoding(self, sample_rate: int) -> str:
+        if not sample_rate:
+            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
+
+        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
+        if sample_rate not in supported_rates:
+            raise ValueError(
+                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
+                f"Received {sample_rate} Hz."
+            )
+        return f"pcm_{sample_rate}"
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        # Metrics are stopped by the caller when needed.
+        return
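Taken together, the new service connects on StartFrame, streams base64-encoded input_audio_chunk messages, and finalizes a segment either manually (an empty chunk with commit: true, sent when the pipeline's VAD emits UserStoppedSpeakingFrame) or by deferring to ElevenLabs' server-side VAD. A plausible construction sketch based only on the constructor shown above (the key placeholder and parameter values are illustrative, not recommendations):

from pipecat.services.elevenlabs.stt import ElevenLabsRealtimeSTTService
from pipecat.transcriptions.language import Language

stt = ElevenLabsRealtimeSTTService(
    api_key="YOUR_ELEVENLABS_API_KEY",  # placeholder
    params=ElevenLabsRealtimeSTTService.InputParams(
        language=Language.EN,
        commit_strategy="manual",  # commit when pipeline VAD reports end of speech
    ),
)

With commit_strategy="vad" the commit decision moves server-side instead, tuned through InputParams' vad_* thresholds and min_*_duration_ms fields.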
pipecat/services/openai/base_llm.py
CHANGED

@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
+    WarmupLLMFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -438,14 +439,19 @@ class BaseOpenAILLMService(LLMService):
         completions and manage settings.
         >>>>>>> dv-stage
 
-
+        Args:
             frame: The frame to process.
             direction: The direction of frame processing.
         """
         await super().process_frame(frame, direction)
 
         context = None
-        if isinstance(frame,
+        if isinstance(frame, WarmupLLMFrame):
+            # Handle warmup frame - prime cache without emitting response
+            # Run in background to avoid blocking the pipeline
+            asyncio.create_task(self._handle_warmup_frame(frame))
+            return  # Don't process further, warmup is silent
+        elif isinstance(frame, OpenAILLMContextFrame):
             # Handle OpenAI-specific context frames
             context = frame.context
         elif isinstance(frame, LLMContextFrame):
@@ -470,3 +476,32 @@ class BaseOpenAILLMService(LLMService):
         finally:
             await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
+
+    async def _handle_warmup_frame(self, frame: WarmupLLMFrame):
+        """Handle WarmupLLMFrame to prime the LLM cache without emitting responses.
+
+        This method sends a minimal request to the LLM to warm up any provider-side
+        caches (like prompt caching). The response is discarded and no frames are emitted.
+
+        Args:
+            frame: WarmupLLMFrame containing the messages to cache.
+        """
+        try:
+            # Use the provided messages for warmup
+            messages: List[ChatCompletionMessageParam] = frame.messages  # type: ignore
+
+            # Make a non-streaming call to warm the cache
+            # We use a minimal max_tokens to reduce latency and cost
+            await self._client.chat.completions.create(
+                model=self.model_name,  # Use the property, not self._model
+                messages=messages,
+                max_tokens=10,  # Minimal response
+                stream=False,
+            )
+
+            self.logger.info("LLM cache warmed successfully")
+            # Intentionally don't emit any frames - this is a silent warmup
+
+        except Exception as e:
+            self.logger.error(f"Failed to warm LLM cache: {e}")
+            # Don't propagate error - warmup failure shouldn't break the bot
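Because _handle_warmup_frame runs as a fire-and-forget asyncio task and swallows its own exceptions, a failed warmup only surfaces in logs; the benefit shows up as lower latency on the next real completion. One standalone way to sanity-check that a prefix is actually being cached, sketched against the OpenAI Python SDK (assumes a model with prompt caching, which generally requires a prompt of roughly 1024+ tokens; the usage fields may be absent on other providers):

from openai import AsyncOpenAI

async def cached_tokens_after_warmup(client: AsyncOpenAI, model: str, messages: list) -> int:
    # First call primes the provider-side prompt cache (mirrors the warmup call).
    await client.chat.completions.create(model=model, messages=messages, max_tokens=10)
    # A second call with an identical prefix should report cached prompt tokens.
    second = await client.chat.completions.create(model=model, messages=messages, max_tokens=10)
    details = second.usage.prompt_tokens_details
    return details.cached_tokens if details and details.cached_tokens else 0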
{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/WHEEL
RENAMED
File without changes
{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/licenses/LICENSE
RENAMED
File without changes
{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev848.dist-info}/top_level.txt
RENAMED
File without changes