dv-pipecat-ai 0.0.85.dev847__py3-none-any.whl → 0.0.85.dev850__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai might be problematic.

dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dv-pipecat-ai
-Version: 0.0.85.dev847
+Version: 0.0.85.dev850
 Summary: An open source framework for voice (and multimodal) assistants
 License-Expression: BSD-2-Clause
 Project-URL: Source, https://github.com/pipecat-ai/pipecat
dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD CHANGED
@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.dev847.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
+dv_pipecat_ai-0.0.85.dev850.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -79,7 +79,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=JxmU2752iWP_1_GmzZReNESUTFAeyEa4XBPL20_C208,30004
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=vuYtmyK1QSU2AWx2c_pFQhcmpXqSTnfqAXF6DXKzTG8,49605
+pipecat/frames/frames.py,sha256=248d54lNOyO04dq9ni51yUTWUItmGw8b9QKarrDGNeo,50354
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -210,14 +210,14 @@ pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_o
 pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
 pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
 pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
-pipecat/services/deepgram/stt.py,sha256=fzKirTjTopwXNQEEPuUOIgk4AMvTJQcrh6H11w13q2c,16185
+pipecat/services/deepgram/stt.py,sha256=t7P0zWLBitSF_KQqHr5aYjKdJZRnC36styl_eL86R88,24752
 pipecat/services/deepgram/tts.py,sha256=H_2WCJEx3_L4ytrHHRNkA-6GKTd1coou_vvTfiEodpQ,3745
 pipecat/services/deepgram/flux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41gHMXVxEM,25887
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=F3xD82eOIy5OyyE-5StdoFFvKjIXlos2yyP0cyNQj6Y,12214
+pipecat/services/elevenlabs/stt.py,sha256=_RhBKpUYEGKMpcO7y4RLxmEOMK11LZFdZqDFIA-DZXk,27303
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -280,7 +280,7 @@ pipecat/services/nim/llm.py,sha256=o4WPGI6kOmSiMV7WwOZ0cNEAoq9hW4Aqs2R8X7c9i94,4
 pipecat/services/ollama/__init__.py,sha256=aw-25zYsR8LR74OFFlMKMTnJjaKwOzdPWVsClueNRkI,255
 pipecat/services/ollama/llm.py,sha256=rfpG92LRHGJlpENKhF6ld8CLVS9DxlKW-WRVNldOIGs,1605
 pipecat/services/openai/__init__.py,sha256=V0ZVa8PzEm3hmcStYICbAsYwfgk4ytZ6kiQoq9UZPmI,354
-pipecat/services/openai/base_llm.py,sha256=J4Ltg1KOXciiUIMBFLn0SmDTZereEE-1LKrPfBsLzFw,19127
+pipecat/services/openai/base_llm.py,sha256=jOiWacimREywCMZZwAwH8RAHCbwnnXvbqAjWQUYA0yM,20727
 pipecat/services/openai/image.py,sha256=3e3h-dVQ6DQuQE7fp8akXwRMd-oYOdGuZg7RCOjHu9A,2994
 pipecat/services/openai/llm.py,sha256=_aKtz1VebSFUUenT3tH6mBW9pSCm65_u45cDu_dkTzs,7396
 pipecat/services/openai/stt.py,sha256=Idf0k73kxFyDgNRBt62MFpoKKNsBV9bwvJteJ6MGWzQ,2419
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.dev847.dist-info/METADATA,sha256=UDO7OKPaUOT1xGmii54o0h6Ee5jwWVp2ztvSbSKK-KU,32955
-dv_pipecat_ai-0.0.85.dev847.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-dv_pipecat_ai-0.0.85.dev847.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
-dv_pipecat_ai-0.0.85.dev847.dist-info/RECORD,,
+dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA,sha256=rqzfsDkrkClO-BvwwJr5_b2ggADWXFKhgzPgToBwDm0,32955
+dv_pipecat_ai-0.0.85.dev850.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev850.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD,,
pipecat/frames/frames.py CHANGED
@@ -586,6 +586,27 @@ class LLMRunFrame(DataFrame):
     pass
 
 
+@dataclass
+class WarmupLLMFrame(DataFrame):
+    """Frame to trigger prompt caching/warmup in supported LLM providers.
+
+    This frame instructs the LLM service to cache the provided messages
+    without generating a visible response. Primarily used for warming up provider
+    caches (e.g., Claude's prompt caching, OpenAI's prompt caching) to improve
+    latency for subsequent requests.
+
+    The LLM service should:
+    1. Send the messages to the provider to trigger caching
+    2. Generate a minimal response (e.g., a single word)
+    3. Discard the response without emitting LLM output frames
+
+    Parameters:
+        messages: List of messages to send for cache warming (should match conversation structure).
+    """
+
+    messages: List[dict]
+
+
 @dataclass
 class LLMMessagesAppendFrame(DataFrame):
     """Frame containing LLM messages to append to current context.
pipecat/services/deepgram/stt.py CHANGED
@@ -8,7 +8,11 @@
 
 import asyncio
 import logging
-from typing import AsyncGenerator, Dict, Optional
+import os
+import socket
+import time
+from typing import AsyncGenerator, Callable, Dict, Optional
+from urllib.parse import urlparse
 
 from loguru import logger
 
@@ -29,6 +33,155 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+_PROCESS_START_MONOTONIC = time.monotonic()
+
+
+def _read_first_numeric_file(paths):
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as file:
+                value = file.read().strip()
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+
+        if not value or value == "max":
+            return None
+
+        try:
+            return int(value)
+        except ValueError:
+            continue
+    return None
+
+
+def _read_proc_status_value(key):
+    try:
+        with open("/proc/self/status", "r", encoding="utf-8") as status_file:
+            for line in status_file:
+                if line.startswith(key):
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        return int(parts[1]) * 1024  # kB -> bytes
+    except FileNotFoundError:
+        return None
+    except OSError:
+        return None
+    return None
+
+
+def _read_cpu_throttling():
+    paths = ["/sys/fs/cgroup/cpu.stat", "/sys/fs/cgroup/cpu/cpu.stat"]
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as cpu_file:
+                for line in cpu_file:
+                    if line.startswith("nr_throttled"):
+                        parts = line.split()
+                        if len(parts) >= 2:
+                            return int(parts[1])
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+    return None
+
+
+def _collect_runtime_diagnostics(
+    loop: Optional[asyncio.AbstractEventLoop] = None,
+    extra_context: Optional[Dict] = None,
+    context_provider: Optional[Callable[[], Dict]] = None,
+):
+    if loop is None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+    uptime_s = round(time.monotonic() - _PROCESS_START_MONOTONIC, 1)
+    rss_bytes = _read_proc_status_value("VmRSS:")
+    rss_mb = round(rss_bytes / (1024**2), 2) if rss_bytes else None
+
+    cgroup_usage_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.current", "/sys/fs/cgroup/memory/memory.usage_in_bytes"]
+    )
+    cgroup_limit_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory/memory.limit_in_bytes"]
+    )
+    cgroup_usage_mb = (
+        round(cgroup_usage_bytes / (1024**2), 2) if cgroup_usage_bytes is not None else None
+    )
+    cgroup_limit_mb = (
+        round(cgroup_limit_bytes / (1024**2), 2) if cgroup_limit_bytes not in (None, 0) else None
+    )
+    cgroup_pct = (
+        round(cgroup_usage_bytes / cgroup_limit_bytes * 100, 2)
+        if cgroup_usage_bytes is not None and cgroup_limit_bytes not in (None, 0)
+        else None
+    )
+
+    try:
+        open_fds = len(os.listdir("/proc/self/fd"))
+    except Exception:
+        open_fds = None
+
+    pending_tasks = None
+    if loop:
+        try:
+            pending_tasks = len(asyncio.all_tasks(loop))
+        except Exception:
+            pending_tasks = None
+
+    suspected_cause = "unknown"
+    if cgroup_pct and cgroup_pct >= 90:
+        suspected_cause = "memory_pressure"
+    elif uptime_s < 180:
+        suspected_cause = "pod_cold_start"
+
+    diagnostics = {
+        "uptime_s": uptime_s,
+        "rss_mb": rss_mb,
+        "cgroup_usage_mb": cgroup_usage_mb,
+        "cgroup_limit_mb": cgroup_limit_mb,
+        "cgroup_usage_pct": cgroup_pct,
+        "open_fds": open_fds,
+        "pending_tasks": pending_tasks,
+        "suspected_cause": suspected_cause,
+    }
+    cpu_throttled = _read_cpu_throttling()
+    if cpu_throttled is not None:
+        diagnostics["cpu_nr_throttled"] = cpu_throttled
+
+    if context_provider:
+        try:
+            ctx = context_provider() or {}
+            if isinstance(ctx, dict):
+                diagnostics.update({k: v for k, v in ctx.items() if v is not None})
+        except Exception as exc:
+            diagnostics["context_provider_error"] = str(exc)
+
+    if extra_context:
+        diagnostics.update({k: v for k, v in extra_context.items() if v is not None})
+
+    return {k: v for k, v in diagnostics.items() if v is not None}
+
+
+def _derive_connect_endpoint(base_url: str):
+    if not base_url:
+        return "api.deepgram.com", 443
+
+    parsed = urlparse(base_url)
+    host = parsed.hostname or "api.deepgram.com"
+    if parsed.port:
+        port = parsed.port
+    elif parsed.scheme in ("https", "wss"):
+        port = 443
+    else:
+        port = 80
+    return host, port
+
+
 try:
     from deepgram import (
         AsyncListenWebSocketClient,
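
The collector is best-effort by design: every probe that fails (non-Linux host, missing cgroup files, no running event loop) yields None and is filtered out of the returned dict, so callers never see partial-failure noise. A hedged sketch of a direct call from inside this module, with a purely illustrative result:

diag = _collect_runtime_diagnostics(
    extra_context={"dns_ms": 12.5},
    context_provider=lambda: {"active_calls": 3},  # hypothetical app-level provider
)
# Illustrative output on a cgroup v2 pod (all values made up):
# {"uptime_s": 42.3, "rss_mb": 512.4, "cgroup_usage_mb": 900.0,
#  "cgroup_limit_mb": 1024.0, "cgroup_usage_pct": 87.9, "open_fds": 87,
#  "pending_tasks": 14, "suspected_cause": "pod_cold_start",
#  "dns_ms": 12.5, "active_calls": 3}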
@@ -64,6 +217,7 @@ class DeepgramSTTService(STTService):
         addons: Optional[Dict] = None,
         max_connect_retries: int = 3,
         connect_timeout_s: float = 2.5,
+        diagnostics_context_provider: Optional[Callable[[], Dict]] = None,
         **kwargs,
     ):
         """Initialize the Deepgram STT service.
@@ -82,6 +236,9 @@ class DeepgramSTTService(STTService):
             max_connect_retries: Maximum number of connection attempts before giving up.
             connect_timeout_s: Maximum time in seconds to wait for a connection attempt.
                 Connection retries wait 100ms between attempts.
+            diagnostics_context_provider: Optional callable returning a dict with
+                additional runtime diagnostics (e.g., active call counts) to append
+                to warning logs.
             **kwargs: Additional arguments passed to the parent STTService.
         """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
@@ -125,6 +282,7 @@
         self.set_model_name(merged_options["model"])
         self._settings = merged_options
         self._addons = addons
+        self._diagnostics_context_provider = diagnostics_context_provider
 
         # Connection retry settings (100ms delay between retries)
         self._max_connect_retries = max_connect_retries
@@ -142,6 +300,7 @@
                 verbose=logging.ERROR,  # Enable error level and above logging
             ),
         )
+        self._connect_host, self._connect_port = _derive_connect_endpoint(base_url)
 
         if self.vad_enabled:
             self._register_event_handler("on_speech_started")
@@ -230,7 +389,10 @@
     async def _connect(self):
         self.logger.debug("Attempting to connect to Deepgram...")
 
+        loop = asyncio.get_running_loop()
         for attempt in range(self._max_connect_retries):
+            attempt_started = time.perf_counter()
+            dns_ms = await self._measure_dns_resolution(loop)
             try:
                 # Clean up any previous connection attempt in background (non-blocking)
                 if hasattr(self, "_connection") and self._connection is not None:
@@ -266,18 +428,67 @@
                         timeout=self._connect_timeout_s,
                     )
                 except asyncio.TimeoutError:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
                     self.logger.warning(
-                        f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} timed out after {self._connect_timeout_s} second(s)."
+                        (
+                            "Deepgram connection attempt {}/{} timed out after {:.2f} second(s). "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        self._connect_timeout_s,
+                        diagnostics,
                     )
                     start_result = False
                 except Exception as start_error:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
                     self.logger.warning(
-                        f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} failed with an exception: {start_error}"
+                        (
+                            "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        start_error,
+                        diagnostics,
                     )
                     start_result = False
                 else:
                     if start_result:
-                        self.logger.info("Successfully connected to Deepgram.")
+                        elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                        diagnostics = _collect_runtime_diagnostics(
+                            loop,
+                            extra_context={
+                                "dns_ms": dns_ms,
+                                "connect_duration_ms": elapsed_ms,
+                            },
+                            context_provider=self._diagnostics_context_provider,
+                        )
+                        self.logger.info(
+                            (
+                                "Successfully connected to Deepgram on attempt {} in {:.2f} ms. "
+                                "runtime_diagnostics={}"
+                            ),
+                            attempt + 1,
+                            elapsed_ms,
+                            diagnostics,
+                        )
                         return  # Exit the method on success
 
                     self.logger.warning(
@@ -285,8 +496,24 @@
                     )
 
             except Exception as e:
+                elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                diagnostics = _collect_runtime_diagnostics(
+                    loop,
+                    extra_context={
+                        "dns_ms": dns_ms,
+                        "connect_duration_ms": elapsed_ms,
+                    },
+                    context_provider=self._diagnostics_context_provider,
+                )
                 self.logger.warning(
-                    f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} failed with an exception: {e}"
+                    (
+                        "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                        "runtime_diagnostics={}"
+                    ),
+                    attempt + 1,
+                    self._max_connect_retries,
+                    e,
+                    diagnostics,
                 )
 
             # If this is not the last attempt, wait 100ms before retrying
@@ -300,6 +527,22 @@
         self.logger.error(error_msg)
         await self.push_error(ErrorFrame(error_msg, fatal=True))
 
+    async def _measure_dns_resolution(self, loop: Optional[asyncio.AbstractEventLoop]):
+        if not loop or not self._connect_host:
+            return None
+        try:
+            dns_task = loop.getaddrinfo(
+                self._connect_host,
+                self._connect_port,
+                type=socket.SOCK_STREAM,
+                proto=socket.IPPROTO_TCP,
+            )
+            start = time.perf_counter()
+            await asyncio.wait_for(dns_task, timeout=1.0)
+            return round((time.perf_counter() - start) * 1000, 2)
+        except Exception:
+            return None
+
     async def _disconnect(self):
         # Guard against missing connection instance and ensure proper async check
         connection: AsyncListenWebSocketClient = getattr(self, "_connection", None)
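
`_derive_connect_endpoint` feeds `_measure_dns_resolution`, which times a getaddrinfo() against the same host the SDK will dial, so a slow lookup shows up as `dns_ms` in the diagnostics. The fallback behavior implied by the urlparse logic above:

assert _derive_connect_endpoint("") == ("api.deepgram.com", 443)
assert _derive_connect_endpoint("wss://api.deepgram.com") == ("api.deepgram.com", 443)
assert _derive_connect_endpoint("http://localhost:8080") == ("localhost", 8080)
assert _derive_connect_endpoint("http://proxy.internal") == ("proxy.internal", 80)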
pipecat/services/elevenlabs/stt.py CHANGED
@@ -4,26 +4,43 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service implementation.
-
-This module provides integration with ElevenLabs' Speech-to-Text API for transcription
-using segmented audio processing. The service uploads audio files and receives
-transcription results directly.
-"""
+"""ElevenLabs speech-to-text service implementations."""
 
+import asyncio
+import base64
 import io
-from typing import AsyncGenerator, Optional
+import json
+import urllib.parse
+from typing import Any, AsyncGenerator, Dict, Literal, Optional
 
 import aiohttp
 from loguru import logger
 from pydantic import BaseModel
 
-from pipecat.frames.frames import ErrorFrame, Frame, TranscriptionFrame
-from pipecat.services.stt_service import SegmentedSTTService
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError:
+    websocket_connect = None  # type: ignore[assignment]
+    State = None  # type: ignore[assignment]
+
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -150,6 +167,19 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result
 
 
+def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
+    """Convert an ElevenLabs language code back to a Language enum value."""
+    if not language_code:
+        return None
+
+    normalized = language_code.lower()
+    for language in Language:
+        code = language_to_elevenlabs_language(language)
+        if code and code.lower() == normalized:
+            return language
+    return None
+
+
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
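
The reverse lookup scans the Language enum through the existing forward mapping, so it is case-insensitive and returns None for unknown codes. A small sketch of the expected behavior, assuming the forward mapping emits lowercase codes such as "en":

lang = elevenlabs_language_code_to_language("EN")  # normalized to "en" before matching
if lang is not None:
    # By construction, the result's forward mapping lowercases back to "en".
    assert language_to_elevenlabs_language(lang).lower() == "en"
assert elevenlabs_language_code_to_language(None) is None
assert elevenlabs_language_code_to_language("xx-not-a-code") is None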
@@ -337,3 +367,376 @@
         except Exception as e:
             self.logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+
+    class InputParams(BaseModel):
+        """Realtime connection parameters derived from ElevenLabs documentation."""
+
+        language: Optional[Language] = None
+        commit_strategy: Literal["manual", "vad"] = "manual"
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        sample_rate: Optional[int] = None,
+        model: str = "scribe_v2_realtime",
+        url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+        params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
+        reconnect_on_error: bool = True,
+        **kwargs,
+    ):
+        """Initialize the realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
+            model: Scribe realtime model identifier.
+            url: WebSocket endpoint for realtime transcription.
+            params: Optional realtime configuration options.
+            reconnect_on_error: Whether to auto-reconnect on transient failures.
+            **kwargs: Additional arguments forwarded to WebsocketSTTService.
+        """
+        if websocket_connect is None or State is None:
+            logger.error(
+                "In order to use ElevenLabsRealtimeSTTService, you need to "
+                "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
+            )
+            raise ModuleNotFoundError("Missing optional dependency: websockets")
+
+        super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(model)
+        self._model = model
+        self._params = params or ElevenLabsRealtimeSTTService.InputParams()
+        self._language_override = self._params.language
+        self._encoding = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._pending_final_message: Optional[Dict[str, Any]] = None
+        self._pending_final_task: Optional[asyncio.Task] = None
+        self._timestamp_merge_delay_s = 0.25
+        self._ttfb_started = False
+
+    @property
+    def commit_strategy(self) -> str:
+        """Return the configured commit strategy (manual or vad)."""
+        return (self._params.commit_strategy or "manual").lower()
+
+    def can_generate_metrics(self) -> bool:
+        """Realtime ElevenLabs service supports latency metrics."""
+        return True
+
+    async def start(self, frame: StartFrame):
+        """Start the realtime STT service and establish WebSocket connection."""
+        await super().start(frame)
+        self._encoding = self._determine_encoding(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the realtime STT service and close WebSocket connection."""
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the realtime STT service and close WebSocket connection."""
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def set_language(self, language: Language):
+        """Update preferred transcription language (requires reconnect)."""
+        self._language_override = language
+        self._params.language = language
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model and reconnect the WebSocket."""
+        await super().set_model(model)
+        self._model = model
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process frames and handle VAD events for commit strategy."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            if frame.emulated:
+                return
+            self._ttfb_started = False
+            await self.start_processing_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            if frame.emulated:
+                return
+            if self.commit_strategy == "manual":
+                await self._send_commit()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Stream audio chunks over the ElevenLabs realtime WebSocket."""
+        if not audio:
+            yield None
+            return
+
+        await self._ensure_connection()
+        await self._send_audio_chunk(audio)
+        yield None
+
+    async def _ensure_connection(self):
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+    async def _connect(self):
+        await self._connect_websocket()
+        if self._websocket and not self._receive_task:
+            self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._clear_pending_final()
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            ws_url = self._build_websocket_url()
+            headers = {"xi-api-key": self._api_key}
+            self.logger.debug(f"Connecting to ElevenLabs realtime STT at {ws_url}")
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+        except Exception as e:
+            self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
+            self._websocket = None
+            await self._call_event_handler("on_connection_error", f"{e}")
+
+    async def _disconnect_websocket(self):
+        try:
+            await self.stop_all_metrics()
+            if self._websocket and self._websocket.state is State.OPEN:
+                self.logger.debug("Disconnecting from ElevenLabs realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    async def _receive_messages(self):
+        async for message in self._get_websocket():
+            await self._process_event(message)
+
+    def _get_websocket(self):
+        if not self._websocket:
+            raise RuntimeError("ElevenLabs realtime websocket not connected")
+        return self._websocket
+
+    async def _process_event(self, message: Any):
+        try:
+            data = json.loads(message)
+        except json.JSONDecodeError:
+            self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
+            return
+
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            self.logger.debug("ElevenLabs realtime session started")
+            return
+
+        if message_type == "partial_transcript":
+            await self._emit_partial_transcript(data)
+        elif message_type == "committed_transcript":
+            await self._handle_committed_transcript(data)
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._handle_committed_transcript_with_timestamps(data)
+        elif message_type in {
+            "auth_error",
+            "quota_exceeded",
+            "transcriber_error",
+            "input_error",
+            "error",
+        }:
+            fatal = message_type in {"auth_error", "quota_exceeded", "error"}
+            description = data.get("error", data)
+            await self.push_error(
+                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
+            )
+        else:
+            self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+
+    async def _emit_partial_transcript(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _handle_committed_transcript(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            await self._emit_transcription(self._pending_final_message)
+            self._pending_final_message = None
+
+        self._pending_final_message = data
+        await self._schedule_pending_final_emit()
+
+    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            merged = {**self._pending_final_message, **data}
+            await self._emit_transcription(merged)
+            await self._clear_pending_final()
+        else:
+            await self._emit_transcription(data)
+
+    async def _schedule_pending_final_emit(self):
+        await self._clear_pending_final(timer_only=True)
+        self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
+
+    async def _emit_pending_after_delay(self):
+        try:
+            await asyncio.sleep(self._timestamp_merge_delay_s)
+            if self._pending_final_message:
+                await self._emit_transcription(self._pending_final_message)
+                self._pending_final_message = None
+        except asyncio.CancelledError:
+            pass
+        finally:
+            self._pending_final_task = None
+
+    async def _clear_pending_final(self, timer_only: bool = False):
+        if self._pending_final_task:
+            await self.cancel_task(self._pending_final_task)
+            self._pending_final_task = None
+
+        if not timer_only:
+            self._pending_final_message = None
+
+    async def _emit_transcription(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        frame = TranscriptionFrame(
+            text,
+            self._user_id,
+            time_now_iso8601(),
+            language,
+            result=data,
+        )
+
+        await self.push_frame(frame)
+        await self._handle_transcription(text, True, language)
+        await self.stop_processing_metrics()
+
+    async def _send_audio_chunk(self, audio: bytes):
+        if not audio or not self._websocket:
+            return
+
+        if not self._ttfb_started:
+            await self.start_ttfb_metrics()
+            self._ttfb_started = True
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio).decode("ascii"),
+            "commit": False,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    async def _send_commit(self):
+        if not self._websocket:
+            return
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": "",
+            "commit": True,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    def _build_websocket_url(self) -> str:
+        if not self.sample_rate:
+            raise ValueError(
+                "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+            )
+
+        params = {
+            "model_id": self._model,
+            "encoding": self._encoding or "pcm_16000",
+            "sample_rate": str(self.sample_rate),
+            "commit_strategy": self.commit_strategy,
+        }
+
+        language_code = (
+            language_to_elevenlabs_language(self._language_override)
+            if self._language_override
+            else None
+        )
+        if language_code:
+            params["language_code"] = language_code
+
+        if self._params.vad_silence_threshold_secs is not None:
+            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
+        if self._params.vad_threshold is not None:
+            params["vad_threshold"] = str(self._params.vad_threshold)
+        if self._params.min_speech_duration_ms is not None:
+            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
+        if self._params.min_silence_duration_ms is not None:
+            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
+
+        return f"{self._url}?{urllib.parse.urlencode(params)}"
+
+    def _determine_encoding(self, sample_rate: int) -> str:
+        if not sample_rate:
+            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
+
+        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
+        if sample_rate not in supported_rates:
+            raise ValueError(
+                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
+                f"Received {sample_rate} Hz."
+            )
+        return f"pcm_{sample_rate}"
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        # Metrics are stopped by the caller when needed.
+        return
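
A hedged usage sketch for the new service; `transport` is assumed to be created elsewhere (e.g., a Daily or WebSocket transport), and with commit_strategy="manual" the commit is driven by UserStoppedSpeakingFrame from the pipeline's VAD:

import os

from pipecat.pipeline.pipeline import Pipeline
from pipecat.services.elevenlabs.stt import ElevenLabsRealtimeSTTService
from pipecat.transcriptions.language import Language

stt = ElevenLabsRealtimeSTTService(
    api_key=os.getenv("ELEVENLABS_API_KEY", ""),
    params=ElevenLabsRealtimeSTTService.InputParams(
        language=Language.EN,
        commit_strategy="manual",
    ),
)

pipeline = Pipeline([transport.input(), stt])  # downstream processors omitted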
pipecat/services/openai/base_llm.py CHANGED
@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
+    WarmupLLMFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -438,14 +439,19 @@ class BaseOpenAILLMService(LLMService):
         completions and manage settings.
         >>>>>>> dv-stage
 
-         Args:
+        Args:
             frame: The frame to process.
             direction: The direction of frame processing.
         """
         await super().process_frame(frame, direction)
 
         context = None
-        if isinstance(frame, OpenAILLMContextFrame):
+        if isinstance(frame, WarmupLLMFrame):
+            # Handle warmup frame - prime cache without emitting response
+            # Run in background to avoid blocking the pipeline
+            asyncio.create_task(self._handle_warmup_frame(frame))
+            return  # Don't process further, warmup is silent
+        elif isinstance(frame, OpenAILLMContextFrame):
             # Handle OpenAI-specific context frames
             context = frame.context
         elif isinstance(frame, LLMContextFrame):
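
One implementation note on the dispatch above, hedged rather than definitive: asyncio.create_task() stores only a weak reference to the task on the event loop, so a warmup task with no strong reference can in principle be garbage-collected before it finishes. A common hardening step (not part of this diff) is to keep the task on the instance:

# Hypothetical variant that retains a strong reference to the in-flight task.
self._warmup_task = asyncio.create_task(self._handle_warmup_frame(frame))
self._warmup_task.add_done_callback(lambda _t: setattr(self, "_warmup_task", None))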
@@ -470,3 +476,32 @@
         finally:
             await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
+
+    async def _handle_warmup_frame(self, frame: WarmupLLMFrame):
+        """Handle WarmupLLMFrame to prime the LLM cache without emitting responses.
+
+        This method sends a minimal request to the LLM to warm up any provider-side
+        caches (like prompt caching). The response is discarded and no frames are emitted.
+
+        Args:
+            frame: WarmupLLMFrame containing the messages to cache.
+        """
+        try:
+            # Use the provided messages for warmup
+            messages: List[ChatCompletionMessageParam] = frame.messages  # type: ignore
+
+            # Make a non-streaming call to warm the cache
+            # We use a minimal max_tokens to reduce latency and cost
+            await self._client.chat.completions.create(
+                model=self.model_name,  # Use the property, not self._model
+                messages=messages,
+                max_tokens=10,  # Minimal response
+                stream=False,
+            )
+
+            self.logger.info("LLM cache warmed successfully")
+            # Intentionally don't emit any frames - this is a silent warmup
+
+        except Exception as e:
+            self.logger.error(f"Failed to warm LLM cache: {e}")
+            # Don't propagate error - warmup failure shouldn't break the bot
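
An end-to-end sketch of the intended flow, hedged: queue the frame once at startup so the first real turn hits a warm provider cache. `task` is a PipelineTask created elsewhere, and SYSTEM_PROMPT is an assumed application constant matching the real conversation's system prompt:

from pipecat.frames.frames import WarmupLLMFrame

await task.queue_frame(
    WarmupLLMFrame(
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": "ping"},  # minimal turn to complete the request
        ]
    )
)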