dv-pipecat-ai 0.0.85.dev847__py3-none-any.whl → 0.0.85.dev850__py3-none-any.whl
This diff shows the contents of two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic.
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/METADATA +1 -1
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/RECORD +9 -9
- pipecat/frames/frames.py +21 -0
- pipecat/services/deepgram/stt.py +248 -5
- pipecat/services/elevenlabs/stt.py +412 -9
- pipecat/services/openai/base_llm.py +37 -2
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/top_level.txt +0 -0
{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/RECORD
RENAMED

@@ -1,4 +1,4 @@
-dv_pipecat_ai-0.0.85.
+dv_pipecat_ai-0.0.85.dev850.dist-info/licenses/LICENSE,sha256=DWY2QGf2eMCFhuu2ChairtT6CB7BEFffNVhXWc4Od08,1301
 pipecat/__init__.py,sha256=j0Xm6adxHhd7D06dIyyPV_GlBYLlBnTAERVvD_jAARQ,861
 pipecat/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -79,7 +79,7 @@ pipecat/extensions/voicemail/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NM
 pipecat/extensions/voicemail/voicemail_detector.py,sha256=JxmU2752iWP_1_GmzZReNESUTFAeyEa4XBPL20_C208,30004
 pipecat/frames/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/frames/frames.proto,sha256=JXZm3VXLR8zMOUcOuhVoe2mhM3MQIQGMJXLopdJO_5Y,839
-pipecat/frames/frames.py,sha256=
+pipecat/frames/frames.py,sha256=248d54lNOyO04dq9ni51yUTWUItmGw8b9QKarrDGNeo,50354
 pipecat/frames/protobufs/frames_pb2.py,sha256=VHgGV_W7qQ4sfQK6RHb5_DggLm3PiSYMr6aBZ8_p1cQ,2590
 pipecat/metrics/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/metrics/metrics.py,sha256=bdZNciEtLTtA-xgoKDz2RJAy6fKrXkTwz3pryVHzc2M,2713
@@ -210,14 +210,14 @@ pipecat/services/cartesia/tts.py,sha256=I_OZCINywkDXmYzFL35MjSN8cAuNEaJs7nj0YB_o
 pipecat/services/cerebras/__init__.py,sha256=5zBmqq9Zfcl-HC7ylekVS5qrRedbl1mAeEwUT-T-c_o,259
 pipecat/services/cerebras/llm.py,sha256=-yzSe_6YDGigwzES-LZS4vNXMPugmvsIYEpTySyr5nA,3047
 pipecat/services/deepgram/__init__.py,sha256=IjRtMI7WytRDdmYVpk2qDWClXUiNgdl7ZkvEAWg1eYE,304
-pipecat/services/deepgram/stt.py,sha256=
+pipecat/services/deepgram/stt.py,sha256=t7P0zWLBitSF_KQqHr5aYjKdJZRnC36styl_eL86R88,24752
 pipecat/services/deepgram/tts.py,sha256=H_2WCJEx3_L4ytrHHRNkA-6GKTd1coou_vvTfiEodpQ,3745
 pipecat/services/deepgram/flux/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pipecat/services/deepgram/flux/stt.py,sha256=yCZodrHAOShgYy_GbdviX8iAuh36dBgDL41gHMXVxEM,25887
 pipecat/services/deepseek/__init__.py,sha256=bU5z_oNGzgrF_YpsD9pYIMtEibeZFaUobbRjJ9WcYyE,259
 pipecat/services/deepseek/llm.py,sha256=5KjpU2blmhUTM3LcRE1ymdsk6OmoFkIzeQgyNOGwQh8,3112
 pipecat/services/elevenlabs/__init__.py,sha256=cMx5v0HEMh4WetMm5byR9tIjG6_wNVs9UxqWyB3tjlM,313
-pipecat/services/elevenlabs/stt.py,sha256=
+pipecat/services/elevenlabs/stt.py,sha256=_RhBKpUYEGKMpcO7y4RLxmEOMK11LZFdZqDFIA-DZXk,27303
 pipecat/services/elevenlabs/tts.py,sha256=skUndgUatx2F5rjg2tBZLutB8k9B9Cjy-cUeglCDdwc,45314
 pipecat/services/fal/__init__.py,sha256=z_kfZETvUcKy68Lyvni4B-RtdkOvz3J3eh6sFDVKq6M,278
 pipecat/services/fal/image.py,sha256=vArKLKrIGoZfw_xeZY_E7zbUzfzVsScj-R7mOmVqjRQ,4585
@@ -280,7 +280,7 @@ pipecat/services/nim/llm.py,sha256=o4WPGI6kOmSiMV7WwOZ0cNEAoq9hW4Aqs2R8X7c9i94,4
 pipecat/services/ollama/__init__.py,sha256=aw-25zYsR8LR74OFFlMKMTnJjaKwOzdPWVsClueNRkI,255
 pipecat/services/ollama/llm.py,sha256=rfpG92LRHGJlpENKhF6ld8CLVS9DxlKW-WRVNldOIGs,1605
 pipecat/services/openai/__init__.py,sha256=V0ZVa8PzEm3hmcStYICbAsYwfgk4ytZ6kiQoq9UZPmI,354
-pipecat/services/openai/base_llm.py,sha256=
+pipecat/services/openai/base_llm.py,sha256=jOiWacimREywCMZZwAwH8RAHCbwnnXvbqAjWQUYA0yM,20727
 pipecat/services/openai/image.py,sha256=3e3h-dVQ6DQuQE7fp8akXwRMd-oYOdGuZg7RCOjHu9A,2994
 pipecat/services/openai/llm.py,sha256=_aKtz1VebSFUUenT3tH6mBW9pSCm65_u45cDu_dkTzs,7396
 pipecat/services/openai/stt.py,sha256=Idf0k73kxFyDgNRBt62MFpoKKNsBV9bwvJteJ6MGWzQ,2419
@@ -416,7 +416,7 @@ pipecat/utils/tracing/service_decorators.py,sha256=fwzxFpi8DJl6BJbK74G0UEB4ccMJg
 pipecat/utils/tracing/setup.py,sha256=7TEgPNpq6M8lww8OQvf0P9FzYc5A30xICGklVA-fua0,2892
 pipecat/utils/tracing/turn_context_provider.py,sha256=ikon3plFOx0XbMrH6DdeHttNpb-U0gzMZIm3bWLc9eI,2485
 pipecat/utils/tracing/turn_trace_observer.py,sha256=dma16SBJpYSOE58YDWy89QzHyQFc_9gQZszKeWixuwc,9725
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
-dv_pipecat_ai-0.0.85.
+dv_pipecat_ai-0.0.85.dev850.dist-info/METADATA,sha256=rqzfsDkrkClO-BvwwJr5_b2ggADWXFKhgzPgToBwDm0,32955
+dv_pipecat_ai-0.0.85.dev850.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dv_pipecat_ai-0.0.85.dev850.dist-info/top_level.txt,sha256=kQzG20CxGf-nSsHmtXHx3hY2-8zHA3jYg8jk0TajqXc,8
+dv_pipecat_ai-0.0.85.dev850.dist-info/RECORD,,
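For readers who want to check RECORD rows like the ones above by hand: each row is `path,sha256=<digest>,<size>`, where the digest is the URL-safe base64 encoding of the file's SHA-256 hash with trailing `=` padding stripped (the standard wheel RECORD convention). A minimal sketch of recomputing one row; the unpack location is hypothetical, and the script assumes it is run from the root of the unpacked wheel so that RECORD-relative paths resolve:

    import base64
    import hashlib

    def record_row(path: str) -> str:
        """Recompute a wheel RECORD row (path,sha256=<digest>,<size>) for a file."""
        with open(path, "rb") as f:
            data = f.read()
        digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
        return f"{path},sha256={digest.decode('ascii')},{len(data)}"

    # Should reproduce the "+" row above when run against the dev850 build.
    print(record_row("pipecat/frames/frames.py"))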
pipecat/frames/frames.py
CHANGED

@@ -586,6 +586,27 @@ class LLMRunFrame(DataFrame):
     pass
 
 
+@dataclass
+class WarmupLLMFrame(DataFrame):
+    """Frame to trigger prompt caching/warmup in supported LLM providers.
+
+    This frame instructs the LLM service to cache the provided messages
+    without generating a visible response. Primarily used for warming up provider
+    caches (e.g., Claude's prompt caching, OpenAI's prompt caching) to improve
+    latency for subsequent requests.
+
+    The LLM service should:
+    1. Send the messages to the provider to trigger caching
+    2. Generate a minimal response (e.g., single word)
+    3. Discard the response without emitting LLM output frames
+
+    Parameters:
+        messages: List of messages to send for cache warming (should match conversation structure).
+    """
+
+    messages: List[dict]
+
+
 @dataclass
 class LLMMessagesAppendFrame(DataFrame):
     """Frame containing LLM messages to append to current context.
pipecat/services/deepgram/stt.py
CHANGED

@@ -8,7 +8,11 @@
 
 import asyncio
 import logging
-
+import os
+import socket
+import time
+from typing import AsyncGenerator, Callable, Dict, Optional
+from urllib.parse import urlparse
 
 from loguru import logger
 
@@ -29,6 +33,155 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+_PROCESS_START_MONOTONIC = time.monotonic()
+
+
+def _read_first_numeric_file(paths):
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as file:
+                value = file.read().strip()
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+
+        if not value or value == "max":
+            return None
+
+        try:
+            return int(value)
+        except ValueError:
+            continue
+    return None
+
+
+def _read_proc_status_value(key):
+    try:
+        with open("/proc/self/status", "r", encoding="utf-8") as status_file:
+            for line in status_file:
+                if line.startswith(key):
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        return int(parts[1]) * 1024  # kB -> bytes
+    except FileNotFoundError:
+        return None
+    except OSError:
+        return None
+    return None
+
+
+def _read_cpu_throttling():
+    paths = ["/sys/fs/cgroup/cpu.stat", "/sys/fs/cgroup/cpu/cpu.stat"]
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as cpu_file:
+                for line in cpu_file:
+                    if line.startswith("nr_throttled"):
+                        parts = line.split()
+                        if len(parts) >= 2:
+                            return int(parts[1])
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+    return None
+
+
+def _collect_runtime_diagnostics(
+    loop: Optional[asyncio.AbstractEventLoop] = None,
+    extra_context: Optional[Dict] = None,
+    context_provider: Optional[Callable[[], Dict]] = None,
+):
+    if loop is None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+    uptime_s = round(time.monotonic() - _PROCESS_START_MONOTONIC, 1)
+    rss_bytes = _read_proc_status_value("VmRSS:")
+    rss_mb = round(rss_bytes / (1024**2), 2) if rss_bytes else None
+
+    cgroup_usage_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.current", "/sys/fs/cgroup/memory/memory.usage_in_bytes"]
+    )
+    cgroup_limit_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory/memory.limit_in_bytes"]
+    )
+    cgroup_usage_mb = (
+        round(cgroup_usage_bytes / (1024**2), 2) if cgroup_usage_bytes is not None else None
+    )
+    cgroup_limit_mb = (
+        round(cgroup_limit_bytes / (1024**2), 2) if cgroup_limit_bytes not in (None, 0) else None
+    )
+    cgroup_pct = (
+        round(cgroup_usage_bytes / cgroup_limit_bytes * 100, 2)
+        if cgroup_usage_bytes is not None and cgroup_limit_bytes not in (None, 0)
+        else None
+    )
+
+    try:
+        open_fds = len(os.listdir("/proc/self/fd"))
+    except Exception:
+        open_fds = None
+
+    pending_tasks = None
+    if loop:
+        try:
+            pending_tasks = len(asyncio.all_tasks(loop))
+        except Exception:
+            pending_tasks = None
+
+    suspected_cause = "unknown"
+    if cgroup_pct and cgroup_pct >= 90:
+        suspected_cause = "memory_pressure"
+    elif uptime_s < 180:
+        suspected_cause = "pod_cold_start"
+
+    diagnostics = {
+        "uptime_s": uptime_s,
+        "rss_mb": rss_mb,
+        "cgroup_usage_mb": cgroup_usage_mb,
+        "cgroup_limit_mb": cgroup_limit_mb,
+        "cgroup_usage_pct": cgroup_pct,
+        "open_fds": open_fds,
+        "pending_tasks": pending_tasks,
+        "suspected_cause": suspected_cause,
+    }
+    cpu_throttled = _read_cpu_throttling()
+    if cpu_throttled is not None:
+        diagnostics["cpu_nr_throttled"] = cpu_throttled
+
+    if context_provider:
+        try:
+            ctx = context_provider() or {}
+            if isinstance(ctx, dict):
+                diagnostics.update({k: v for k, v in ctx.items() if v is not None})
+        except Exception as exc:
+            diagnostics["context_provider_error"] = str(exc)
+
+    if extra_context:
+        diagnostics.update({k: v for k, v in extra_context.items() if v is not None})
+
+    return {k: v for k, v in diagnostics.items() if v is not None}
+
+
+def _derive_connect_endpoint(base_url: str):
+    if not base_url:
+        return "api.deepgram.com", 443
+
+    parsed = urlparse(base_url)
+    host = parsed.hostname or "api.deepgram.com"
+    if parsed.port:
+        port = parsed.port
+    elif parsed.scheme in ("https", "wss"):
+        port = 443
+    else:
+        port = 80
+    return host, port
+
+
 try:
     from deepgram import (
         AsyncListenWebSocketClient,
@@ -64,6 +217,7 @@ class DeepgramSTTService(STTService):
         addons: Optional[Dict] = None,
         max_connect_retries: int = 3,
         connect_timeout_s: float = 2.5,
+        diagnostics_context_provider: Optional[Callable[[], Dict]] = None,
         **kwargs,
     ):
         """Initialize the Deepgram STT service.
@@ -82,6 +236,9 @@ class DeepgramSTTService(STTService):
             max_connect_retries: Maximum number of connection attempts before giving up.
             connect_timeout_s: Maximum time in seconds to wait for a connection attempt.
                 Connection retries wait 100ms between attempts.
+            diagnostics_context_provider: Optional callable returning a dict with
+                additional runtime diagnostics (e.g., active call counts) to append
+                to warning logs.
             **kwargs: Additional arguments passed to the parent STTService.
         """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
@@ -125,6 +282,7 @@ class DeepgramSTTService(STTService):
         self.set_model_name(merged_options["model"])
         self._settings = merged_options
         self._addons = addons
+        self._diagnostics_context_provider = diagnostics_context_provider
 
         # Connection retry settings (100ms delay between retries)
         self._max_connect_retries = max_connect_retries
@@ -142,6 +300,7 @@ class DeepgramSTTService(STTService):
                 verbose=logging.ERROR,  # Enable error level and above logging
             ),
         )
+        self._connect_host, self._connect_port = _derive_connect_endpoint(base_url)
 
         if self.vad_enabled:
             self._register_event_handler("on_speech_started")
@@ -230,7 +389,10 @@ class DeepgramSTTService(STTService):
     async def _connect(self):
        self.logger.debug("Attempting to connect to Deepgram...")
 
+        loop = asyncio.get_running_loop()
        for attempt in range(self._max_connect_retries):
+            attempt_started = time.perf_counter()
+            dns_ms = await self._measure_dns_resolution(loop)
            try:
                # Clean up any previous connection attempt in background (non-blocking)
                if hasattr(self, "_connection") and self._connection is not None:
@@ -266,18 +428,67 @@ class DeepgramSTTService(STTService):
                        timeout=self._connect_timeout_s,
                    )
                except asyncio.TimeoutError:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
                    self.logger.warning(
-
+                        (
+                            "Deepgram connection attempt {}/{} timed out after {:.2f} second(s). "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        self._connect_timeout_s,
+                        diagnostics,
                    )
                    start_result = False
                except Exception as start_error:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
                    self.logger.warning(
-
+                        (
+                            "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        start_error,
+                        diagnostics,
                    )
                    start_result = False
                else:
                    if start_result:
-
+                        elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                        diagnostics = _collect_runtime_diagnostics(
+                            loop,
+                            extra_context={
+                                "dns_ms": dns_ms,
+                                "connect_duration_ms": elapsed_ms,
+                            },
+                            context_provider=self._diagnostics_context_provider,
+                        )
+                        self.logger.info(
+                            (
+                                "Successfully connected to Deepgram on attempt {} in {:.2f} ms. "
+                                "runtime_diagnostics={}"
+                            ),
+                            attempt + 1,
+                            elapsed_ms,
+                            diagnostics,
+                        )
                        return  # Exit the method on success
 
                self.logger.warning(
@@ -285,8 +496,24 @@ class DeepgramSTTService(STTService):
                )
 
            except Exception as e:
+                elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                diagnostics = _collect_runtime_diagnostics(
+                    loop,
+                    extra_context={
+                        "dns_ms": dns_ms,
+                        "connect_duration_ms": elapsed_ms,
+                    },
+                    context_provider=self._diagnostics_context_provider,
+                )
                self.logger.warning(
-
+                    (
+                        "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                        "runtime_diagnostics={}"
+                    ),
+                    attempt + 1,
+                    self._max_connect_retries,
+                    e,
+                    diagnostics,
                )
 
            # If this is not the last attempt, wait 100ms before retrying
@@ -300,6 +527,22 @@ class DeepgramSTTService(STTService):
        self.logger.error(error_msg)
        await self.push_error(ErrorFrame(error_msg, fatal=True))
 
+    async def _measure_dns_resolution(self, loop: Optional[asyncio.AbstractEventLoop]):
+        if not loop or not self._connect_host:
+            return None
+        try:
+            dns_task = loop.getaddrinfo(
+                self._connect_host,
+                self._connect_port,
+                type=socket.SOCK_STREAM,
+                proto=socket.IPPROTO_TCP,
+            )
+            start = time.perf_counter()
+            await asyncio.wait_for(dns_task, timeout=1.0)
+            return round((time.perf_counter() - start) * 1000, 2)
+        except Exception:
+            return None
+
    async def _disconnect(self):
        # Guard against missing connection instance and ensure proper async check
        connection: AsyncListenWebSocketClient = getattr(self, "_connection", None)
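One way to wire the new hook up, sketched with a hypothetical application-level counter (only `diagnostics_context_provider` itself comes from this diff). Any keys the callable returns are merged into the `runtime_diagnostics` payload on the connect success/timeout/failure log lines, with `None` values dropped:

    from pipecat.services.deepgram.stt import DeepgramSTTService

    active_calls = {"count": 0}  # hypothetical application state

    def diagnostics_context() -> dict:
        # Merged into runtime_diagnostics alongside uptime, RSS, cgroup usage, etc.
        return {"active_calls": active_calls["count"]}

    stt = DeepgramSTTService(
        api_key="YOUR_DEEPGRAM_API_KEY",
        diagnostics_context_provider=diagnostics_context,
    )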
pipecat/services/elevenlabs/stt.py
CHANGED

@@ -4,26 +4,43 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""ElevenLabs speech-to-text service
-
-This module provides integration with ElevenLabs' Speech-to-Text API for transcription
-using segmented audio processing. The service uploads audio files and receives
-transcription results directly.
-"""
+"""ElevenLabs speech-to-text service implementations."""
 
+import asyncio
+import base64
 import io
-
+import json
+import urllib.parse
+from typing import Any, AsyncGenerator, Dict, Literal, Optional
 
 import aiohttp
 from loguru import logger
 from pydantic import BaseModel
 
-from pipecat.frames.frames import
-
+from pipecat.frames.frames import (
+    CancelFrame,
+    EndFrame,
+    ErrorFrame,
+    Frame,
+    InterimTranscriptionFrame,
+    StartFrame,
+    TranscriptionFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
+)
+from pipecat.processors.frame_processor import FrameDirection
+from pipecat.services.stt_service import SegmentedSTTService, WebsocketSTTService
 from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+try:
+    from websockets.asyncio.client import connect as websocket_connect
+    from websockets.protocol import State
+except ModuleNotFoundError:
+    websocket_connect = None  # type: ignore[assignment]
+    State = None  # type: ignore[assignment]
+
 
 def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     """Convert a Language enum to ElevenLabs language code.
@@ -150,6 +167,19 @@ def language_to_elevenlabs_language(language: Language) -> Optional[str]:
     return result
 
 
+def elevenlabs_language_code_to_language(language_code: Optional[str]) -> Optional[Language]:
+    """Convert an ElevenLabs language code back to a Language enum value."""
+    if not language_code:
+        return None
+
+    normalized = language_code.lower()
+    for language in Language:
+        code = language_to_elevenlabs_language(language)
+        if code and code.lower() == normalized:
+            return language
+    return None
+
+
 class ElevenLabsSTTService(SegmentedSTTService):
     """Speech-to-text service using ElevenLabs' file-based API.
 
@@ -337,3 +367,376 @@ class ElevenLabsSTTService(SegmentedSTTService):
         except Exception as e:
             self.logger.error(f"ElevenLabs STT error: {e}")
             yield ErrorFrame(f"ElevenLabs STT error: {str(e)}")
+
+
+class ElevenLabsRealtimeSTTService(WebsocketSTTService):
+    """Realtime speech-to-text service using ElevenLabs Scribe v2 WebSocket API."""
+
+    class InputParams(BaseModel):
+        """Realtime connection parameters derived from ElevenLabs documentation."""
+
+        language: Optional[Language] = None
+        commit_strategy: Literal["manual", "vad"] = "manual"
+        vad_silence_threshold_secs: Optional[float] = None
+        vad_threshold: Optional[float] = None
+        min_speech_duration_ms: Optional[int] = None
+        min_silence_duration_ms: Optional[int] = None
+
+    def __init__(
+        self,
+        *,
+        api_key: str,
+        sample_rate: Optional[int] = None,
+        model: str = "scribe_v2_realtime",
+        url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+        params: Optional["ElevenLabsRealtimeSTTService.InputParams"] = None,
+        reconnect_on_error: bool = True,
+        **kwargs,
+    ):
+        """Initialize the realtime STT service.
+
+        Args:
+            api_key: ElevenLabs API key for authentication.
+            sample_rate: Optional input sample rate. Defaults to pipeline sample rate.
+            model: Scribe realtime model identifier.
+            url: WebSocket endpoint for realtime transcription.
+            params: Optional realtime configuration options.
+            reconnect_on_error: Whether to auto-reconnect on transient failures.
+            **kwargs: Additional arguments forwarded to WebsocketSTTService.
+        """
+        if websocket_connect is None or State is None:
+            logger.error(
+                "In order to use ElevenLabsRealtimeSTTService, you need to "
+                "`pip install pipecat-ai[elevenlabs]` (websockets extra)."
+            )
+            raise ModuleNotFoundError("Missing optional dependency: websockets")
+
+        super().__init__(sample_rate=sample_rate, reconnect_on_error=reconnect_on_error, **kwargs)
+
+        self._api_key = api_key
+        self._url = url
+        self.set_model_name(model)
+        self._model = model
+        self._params = params or ElevenLabsRealtimeSTTService.InputParams()
+        self._language_override = self._params.language
+        self._encoding = None
+        self._receive_task: Optional[asyncio.Task] = None
+        self._pending_final_message: Optional[Dict[str, Any]] = None
+        self._pending_final_task: Optional[asyncio.Task] = None
+        self._timestamp_merge_delay_s = 0.25
+        self._ttfb_started = False
+
+    @property
+    def commit_strategy(self) -> str:
+        """Return the configured commit strategy (manual or vad)."""
+        return (self._params.commit_strategy or "manual").lower()
+
+    def can_generate_metrics(self) -> bool:
+        """Realtime ElevenLabs service supports latency metrics."""
+        return True
+
+    async def start(self, frame: StartFrame):
+        """Start the realtime STT service and establish WebSocket connection."""
+        await super().start(frame)
+        self._encoding = self._determine_encoding(self.sample_rate)
+        await self._connect()
+
+    async def stop(self, frame: EndFrame):
+        """Stop the realtime STT service and close WebSocket connection."""
+        await super().stop(frame)
+        await self._disconnect()
+
+    async def cancel(self, frame: CancelFrame):
+        """Cancel the realtime STT service and close WebSocket connection."""
+        await super().cancel(frame)
+        await self._disconnect()
+
+    async def set_language(self, language: Language):
+        """Update preferred transcription language (requires reconnect)."""
+        self._language_override = language
+        self._params.language = language
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def set_model(self, model: str):
+        """Set the STT model and reconnect the WebSocket."""
+        await super().set_model(model)
+        self._model = model
+        if self._websocket:
+            await self._disconnect()
+            await self._connect()
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Process frames and handle VAD events for commit strategy."""
+        await super().process_frame(frame, direction)
+
+        if isinstance(frame, UserStartedSpeakingFrame):
+            if frame.emulated:
+                return
+            self._ttfb_started = False
+            await self.start_processing_metrics()
+        elif isinstance(frame, UserStoppedSpeakingFrame):
+            if frame.emulated:
+                return
+            if self.commit_strategy == "manual":
+                await self._send_commit()
+
+    async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+        """Stream audio chunks over the ElevenLabs realtime WebSocket."""
+        if not audio:
+            yield None
+            return
+
+        await self._ensure_connection()
+        await self._send_audio_chunk(audio)
+        yield None
+
+    async def _ensure_connection(self):
+        if not self._websocket or self._websocket.state is State.CLOSED:
+            await self._connect()
+
+    async def _connect(self):
+        await self._connect_websocket()
+        if self._websocket and not self._receive_task:
+            self._receive_task = asyncio.create_task(self._receive_task_handler(self._report_error))
+
+    async def _disconnect(self):
+        if self._receive_task:
+            await self.cancel_task(self._receive_task)
+            self._receive_task = None
+
+        await self._clear_pending_final()
+        await self._disconnect_websocket()
+
+    async def _connect_websocket(self):
+        try:
+            if self._websocket and self._websocket.state is State.OPEN:
+                return
+
+            ws_url = self._build_websocket_url()
+            headers = {"xi-api-key": self._api_key}
+            self.logger.debug(f"Connecting to ElevenLabs realtime STT at {ws_url}")
+            self._websocket = await websocket_connect(ws_url, additional_headers=headers)
+            await self._call_event_handler("on_connected")
+        except Exception as e:
+            self.logger.error(f"{self} unable to connect to ElevenLabs realtime STT: {e}")
+            self._websocket = None
+            await self._call_event_handler("on_connection_error", f"{e}")
+
+    async def _disconnect_websocket(self):
+        try:
+            await self.stop_all_metrics()
+            if self._websocket and self._websocket.state is State.OPEN:
+                self.logger.debug("Disconnecting from ElevenLabs realtime STT")
+                await self._websocket.close()
+        except Exception as e:
+            self.logger.error(f"{self} error closing ElevenLabs realtime websocket: {e}")
+        finally:
+            self._websocket = None
+            await self._call_event_handler("on_disconnected")
+
+    async def _receive_messages(self):
+        async for message in self._get_websocket():
+            await self._process_event(message)
+
+    def _get_websocket(self):
+        if not self._websocket:
+            raise RuntimeError("ElevenLabs realtime websocket not connected")
+        return self._websocket
+
+    async def _process_event(self, message: Any):
+        try:
+            data = json.loads(message)
+        except json.JSONDecodeError:
+            self.logger.warning(f"ElevenLabs realtime STT sent invalid JSON: {message}")
+            return
+
+        message_type = data.get("message_type")
+
+        if message_type == "session_started":
+            self.logger.debug("ElevenLabs realtime session started")
+            return
+
+        if message_type == "partial_transcript":
+            await self._emit_partial_transcript(data)
+        elif message_type == "committed_transcript":
+            await self._handle_committed_transcript(data)
+        elif message_type == "committed_transcript_with_timestamps":
+            await self._handle_committed_transcript_with_timestamps(data)
+        elif message_type in {
+            "auth_error",
+            "quota_exceeded",
+            "transcriber_error",
+            "input_error",
+            "error",
+        }:
+            fatal = message_type in {"auth_error", "quota_exceeded", "error"}
+            description = data.get("error", data)
+            await self.push_error(
+                ErrorFrame(f"ElevenLabs realtime error: {description}", fatal=fatal)
+            )
+        else:
+            self.logger.debug(f"Unhandled ElevenLabs realtime message: {data}")
+
+    async def _emit_partial_transcript(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        await self.push_frame(
+            InterimTranscriptionFrame(
+                text,
+                self._user_id,
+                time_now_iso8601(),
+                language,
+                result=data,
+            )
+        )
+
+    async def _handle_committed_transcript(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            await self._emit_transcription(self._pending_final_message)
+            self._pending_final_message = None
+
+        self._pending_final_message = data
+        await self._schedule_pending_final_emit()
+
+    async def _handle_committed_transcript_with_timestamps(self, data: Dict[str, Any]):
+        if self._pending_final_message:
+            merged = {**self._pending_final_message, **data}
+            await self._emit_transcription(merged)
+            await self._clear_pending_final()
+        else:
+            await self._emit_transcription(data)
+
+    async def _schedule_pending_final_emit(self):
+        await self._clear_pending_final(timer_only=True)
+        self._pending_final_task = asyncio.create_task(self._emit_pending_after_delay())
+
+    async def _emit_pending_after_delay(self):
+        try:
+            await asyncio.sleep(self._timestamp_merge_delay_s)
+            if self._pending_final_message:
+                await self._emit_transcription(self._pending_final_message)
+                self._pending_final_message = None
+        except asyncio.CancelledError:
+            pass
+        finally:
+            self._pending_final_task = None
+
+    async def _clear_pending_final(self, timer_only: bool = False):
+        if self._pending_final_task:
+            await self.cancel_task(self._pending_final_task)
+            self._pending_final_task = None
+
+        if not timer_only:
+            self._pending_final_message = None
+
+    async def _emit_transcription(self, data: Dict[str, Any]):
+        text = (data.get("text") or data.get("transcript") or "").strip()
+        if not text:
+            return
+
+        language = (
+            elevenlabs_language_code_to_language(data.get("language_code"))
+            or self._language_override
+        )
+        await self.stop_ttfb_metrics()
+
+        frame = TranscriptionFrame(
+            text,
+            self._user_id,
+            time_now_iso8601(),
+            language,
+            result=data,
+        )
+
+        await self.push_frame(frame)
+        await self._handle_transcription(text, True, language)
+        await self.stop_processing_metrics()
+
+    async def _send_audio_chunk(self, audio: bytes):
+        if not audio or not self._websocket:
+            return
+
+        if not self._ttfb_started:
+            await self.start_ttfb_metrics()
+            self._ttfb_started = True
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio).decode("ascii"),
+            "commit": False,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    async def _send_commit(self):
+        if not self._websocket:
+            return
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": "",
+            "commit": True,
+            "sample_rate": self.sample_rate,
+        }
+        await self._websocket.send(json.dumps(payload))
+
+    def _build_websocket_url(self) -> str:
+        if not self.sample_rate:
+            raise ValueError(
+                "ElevenLabs realtime STT requires a valid sample rate (start() must run first)."
+            )
+
+        params = {
+            "model_id": self._model,
+            "encoding": self._encoding or "pcm_16000",
+            "sample_rate": str(self.sample_rate),
+            "commit_strategy": self.commit_strategy,
+        }
+
+        language_code = (
+            language_to_elevenlabs_language(self._language_override)
+            if self._language_override
+            else None
+        )
+        if language_code:
+            params["language_code"] = language_code
+
+        if self._params.vad_silence_threshold_secs is not None:
+            params["vad_silence_threshold_secs"] = str(self._params.vad_silence_threshold_secs)
+        if self._params.vad_threshold is not None:
+            params["vad_threshold"] = str(self._params.vad_threshold)
+        if self._params.min_speech_duration_ms is not None:
+            params["min_speech_duration_ms"] = str(self._params.min_speech_duration_ms)
+        if self._params.min_silence_duration_ms is not None:
+            params["min_silence_duration_ms"] = str(self._params.min_silence_duration_ms)
+
+        return f"{self._url}?{urllib.parse.urlencode(params)}"
+
+    def _determine_encoding(self, sample_rate: int) -> str:
+        if not sample_rate:
+            raise ValueError("ElevenLabs realtime STT requires a valid sample rate.")
+
+        supported_rates = {8000, 16000, 22050, 24000, 44100, 48000}
+        if sample_rate not in supported_rates:
+            raise ValueError(
+                f"ElevenLabs realtime STT supports sample rates {sorted(supported_rates)}. "
+                f"Received {sample_rate} Hz."
+            )
+        return f"pcm_{sample_rate}"
+
+    @traced_stt
+    async def _handle_transcription(
+        self, transcript: str, is_final: bool, language: Optional[Language] = None
+    ):
+        """Handle a transcription result with tracing."""
+        # Metrics are stopped by the caller when needed.
+        return
pipecat/services/openai/base_llm.py
CHANGED

@@ -32,6 +32,7 @@ from pipecat.frames.frames import (
     LLMMessagesFrame,
     LLMTextFrame,
     LLMUpdateSettingsFrame,
+    WarmupLLMFrame,
 )
 from pipecat.metrics.metrics import LLMTokenUsage
 from pipecat.processors.aggregators.llm_context import LLMContext
@@ -438,14 +439,19 @@ class BaseOpenAILLMService(LLMService):
         completions and manage settings.
         >>>>>>> dv-stage
 
-
+        Args:
             frame: The frame to process.
             direction: The direction of frame processing.
         """
         await super().process_frame(frame, direction)
 
         context = None
-        if isinstance(frame,
+        if isinstance(frame, WarmupLLMFrame):
+            # Handle warmup frame - prime cache without emitting response
+            # Run in background to avoid blocking the pipeline
+            asyncio.create_task(self._handle_warmup_frame(frame))
+            return  # Don't process further, warmup is silent
+        elif isinstance(frame, OpenAILLMContextFrame):
             # Handle OpenAI-specific context frames
             context = frame.context
         elif isinstance(frame, LLMContextFrame):
@@ -470,3 +476,32 @@ class BaseOpenAILLMService(LLMService):
         finally:
             await self.stop_processing_metrics()
             await self.push_frame(LLMFullResponseEndFrame())
+
+    async def _handle_warmup_frame(self, frame: WarmupLLMFrame):
+        """Handle WarmupLLMFrame to prime the LLM cache without emitting responses.
+
+        This method sends a minimal request to the LLM to warm up any provider-side
+        caches (like prompt caching). The response is discarded and no frames are emitted.
+
+        Args:
+            frame: WarmupLLMFrame containing the messages to cache.
+        """
+        try:
+            # Use the provided messages for warmup
+            messages: List[ChatCompletionMessageParam] = frame.messages  # type: ignore
+
+            # Make a non-streaming call to warm the cache
+            # We use a minimal max_tokens to reduce latency and cost
+            await self._client.chat.completions.create(
+                model=self.model_name,  # Use the property, not self._model
+                messages=messages,
+                max_tokens=10,  # Minimal response
+                stream=False,
+            )
+
+            self.logger.info("LLM cache warmed successfully")
+            # Intentionally don't emit any frames - this is a silent warmup
+
+        except Exception as e:
+            self.logger.error(f"Failed to warm LLM cache: {e}")
+            # Don't propagate error - warmup failure shouldn't break the bot
{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/WHEEL
RENAMED

File without changes

{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/licenses/LICENSE
RENAMED

File without changes

{dv_pipecat_ai-0.0.85.dev847.dist-info → dv_pipecat_ai-0.0.85.dev850.dist-info}/top_level.txt
RENAMED

File without changes