dv-pipecat-ai 0.0.85.dev824__py3-none-any.whl → 0.0.85.dev858__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of dv-pipecat-ai has been flagged as potentially problematic by the registry.
- {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/METADATA +2 -1
- {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/RECORD +31 -29
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +5 -1
- pipecat/frames/frames.py +22 -0
- pipecat/metrics/connection_metrics.py +45 -0
- pipecat/processors/aggregators/llm_response.py +15 -9
- pipecat/processors/dtmf_aggregator.py +17 -21
- pipecat/processors/frame_processor.py +44 -1
- pipecat/processors/metrics/frame_processor_metrics.py +108 -0
- pipecat/processors/transcript_processor.py +2 -1
- pipecat/serializers/__init__.py +2 -0
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +2 -2
- pipecat/serializers/custom.py +2 -2
- pipecat/serializers/vi.py +326 -0
- pipecat/services/cartesia/tts.py +75 -10
- pipecat/services/deepgram/stt.py +317 -17
- pipecat/services/elevenlabs/stt.py +487 -19
- pipecat/services/elevenlabs/tts.py +28 -4
- pipecat/services/google/llm.py +26 -11
- pipecat/services/openai/base_llm.py +79 -14
- pipecat/services/salesforce/llm.py +64 -59
- pipecat/services/sarvam/tts.py +0 -1
- pipecat/services/soniox/stt.py +45 -10
- pipecat/services/vistaar/llm.py +97 -6
- pipecat/transcriptions/language.py +50 -0
- pipecat/transports/base_input.py +15 -11
- pipecat/transports/base_output.py +26 -3
- {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.85.dev824.dist-info → dv_pipecat_ai-0.0.85.dev858.dist-info}/top_level.txt +0 -0
pipecat/services/cartesia/tts.py
CHANGED
@@ -15,7 +15,6 @@ from typing import AsyncGenerator, List, Literal, Optional, Union
 from loguru import logger
 from pydantic import BaseModel, Field
 
-
 from pipecat.frames.frames import (
     CancelFrame,
     EndFrame,
@@ -49,6 +48,26 @@ except ModuleNotFoundError as e:
     raise Exception(f"Missing module: {e}")
 
 
+class GenerationConfig(BaseModel):
+    """Configuration for Cartesia Sonic-3 generation parameters.
+
+    Sonic-3 interprets these parameters as guidance to ensure natural speech.
+    Test against your content for best results.
+
+    Parameters:
+        volume: Volume multiplier for generated speech. Valid range: [0.5, 2.0]. Default is 1.0.
+        speed: Speed multiplier for generated speech. Valid range: [0.6, 1.5]. Default is 1.0.
+        emotion: Single emotion string to guide the emotional tone. Examples include neutral,
+            angry, excited, content, sad, scared. Over 60 emotions are supported. For best
+            results, use with recommended voices: Leo, Jace, Kyle, Gavin, Maya, Tessa, Dana,
+            and Marian.
+    """
+
+    volume: Optional[float] = None
+    speed: Optional[float] = None
+    emotion: Optional[str] = None
+
+
 def language_to_cartesia_language(language: Language) -> Optional[str]:
     """Convert a Language enum to Cartesia language code.
 
@@ -74,6 +93,33 @@ def language_to_cartesia_language(language: Language) -> Optional[str]:
         Language.SV: "sv",
         Language.TR: "tr",
         Language.ZH: "zh",
+        Language.TL: "tl",
+        Language.BG: "bg",
+        Language.RO: "ro",
+        Language.AR: "ar",
+        Language.CS: "cs",
+        Language.EL: "el",
+        Language.FI: "fi",
+        Language.HR: "hr",
+        Language.MS: "ms",
+        Language.SK: "sk",
+        Language.DA: "da",
+        Language.TA: "ta",
+        Language.UK: "uk",
+        Language.HU: "hu",
+        Language.NO: "no",
+        Language.VI: "vi",
+        Language.BN: "bn",
+        Language.TH: "th",
+        Language.HE: "he",
+        Language.KA: "ka",
+        Language.ID: "id",
+        Language.TE: "te",
+        Language.GU: "gu",
+        Language.KN: "kn",
+        Language.ML: "ml",
+        Language.MR: "mr",
+        Language.PA: "pa",
     }
 
     result = BASE_LANGUAGES.get(language)
@@ -102,16 +148,20 @@ class CartesiaTTSService(AudioContextWordTTSService):
 
         Parameters:
             language: Language to use for synthesis.
-            speed: Voice speed control.
-            emotion: List of emotion controls.
+            speed: Voice speed control for non-Sonic-3 models (literal values).
+            emotion: List of emotion controls for non-Sonic-3 models.
 
             .. deprecated:: 0.0.68
                 The `emotion` parameter is deprecated and will be removed in a future version.
+
+            generation_config: Generation configuration for Sonic-3 models. Includes volume,
+                speed (numeric), and emotion (string) parameters.
         """
 
         language: Optional[Language] = Language.EN
         speed: Optional[Literal["slow", "normal", "fast"]] = None
         emotion: Optional[List[str]] = []
+        generation_config: Optional[GenerationConfig] = None
 
     def __init__(
         self,
@@ -120,7 +170,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
         voice_id: str,
         cartesia_version: str = "2025-04-16",
         url: str = "wss://api.cartesia.ai/tts/websocket",
-        model: str = "sonic-
+        model: str = "sonic-3",
         sample_rate: Optional[int] = None,
         encoding: str = "pcm_s16le",
         container: str = "raw",
@@ -136,7 +186,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             voice_id: ID of the voice to use for synthesis.
             cartesia_version: API version string for Cartesia service.
             url: WebSocket URL for Cartesia TTS API.
-            model: TTS model to use (e.g., "sonic-
+            model: TTS model to use (e.g., "sonic-3").
             sample_rate: Audio sample rate. If None, uses default.
             encoding: Audio encoding format.
             container: Audio container format.
@@ -180,6 +230,7 @@ class CartesiaTTSService(AudioContextWordTTSService):
             else "en",
             "speed": params.speed,
             "emotion": params.emotion,
+            "generation_config": params.generation_config,
         }
         self.set_model_name(model)
         self.set_voice(voice_id)
@@ -298,6 +349,11 @@ class CartesiaTTSService(AudioContextWordTTSService):
         if self._settings["speed"]:
             msg["speed"] = self._settings["speed"]
 
+        if self._settings["generation_config"]:
+            msg["generation_config"] = self._settings["generation_config"].model_dump(
+                exclude_none=True
+            )
+
         return json.dumps(msg)
 
     async def start(self, frame: StartFrame):
@@ -419,7 +475,6 @@ class CartesiaTTSService(AudioContextWordTTSService):
                 logger.error(f"{self} error: {msg}")
                 await self.push_frame(TTSStoppedFrame())
                 await self.stop_all_metrics()
-
                 await self.push_error(ErrorFrame(f"{self} error: {msg['error']}"))
                 self._context_id = None
             else:
@@ -484,23 +539,27 @@ class CartesiaHttpTTSService(TTSService):
 
         Parameters:
             language: Language to use for synthesis.
-            speed: Voice speed control.
-            emotion: List of emotion controls.
+            speed: Voice speed control for non-Sonic-3 models (literal values).
+            emotion: List of emotion controls for non-Sonic-3 models.
 
             .. deprecated:: 0.0.68
                 The `emotion` parameter is deprecated and will be removed in a future version.
+
+            generation_config: Generation configuration for Sonic-3 models. Includes volume,
+                speed (numeric), and emotion (string) parameters.
         """
 
         language: Optional[Language] = Language.EN
         speed: Optional[Literal["slow", "normal", "fast"]] = None
         emotion: Optional[List[str]] = Field(default_factory=list)
+        generation_config: Optional[GenerationConfig] = None
 
     def __init__(
         self,
         *,
         api_key: str,
         voice_id: str,
-        model: str = "sonic-
+        model: str = "sonic-3",
         base_url: str = "https://api.cartesia.ai",
         cartesia_version: str = "2024-11-13",
         sample_rate: Optional[int] = None,
@@ -514,7 +573,7 @@ class CartesiaHttpTTSService(TTSService):
         Args:
             api_key: Cartesia API key for authentication.
             voice_id: ID of the voice to use for synthesis.
-            model: TTS model to use (e.g., "sonic-
+            model: TTS model to use (e.g., "sonic-3").
             base_url: Base URL for Cartesia HTTP API.
             cartesia_version: API version string for Cartesia service.
             sample_rate: Audio sample rate. If None, uses default.
@@ -541,6 +600,7 @@ class CartesiaHttpTTSService(TTSService):
             else "en",
             "speed": params.speed,
             "emotion": params.emotion,
+            "generation_config": params.generation_config,
         }
         self.set_voice(voice_id)
         self.set_model_name(model)
@@ -634,6 +694,11 @@ class CartesiaHttpTTSService(TTSService):
         if self._settings["speed"]:
             payload["speed"] = self._settings["speed"]
 
+        if self._settings["generation_config"]:
+            payload["generation_config"] = self._settings["generation_config"].model_dump(
+                exclude_none=True
+            )
+
         yield TTSStartedFrame()
 
         session = await self._client._get_session()
pipecat/services/deepgram/stt.py
CHANGED
@@ -8,7 +8,11 @@
 
 import asyncio
 import logging
-
+import os
+import socket
+import time
+from typing import AsyncGenerator, Callable, Dict, Optional
+from urllib.parse import urlparse
 
 from loguru import logger
 
@@ -29,6 +33,155 @@ from pipecat.transcriptions.language import Language
 from pipecat.utils.time import time_now_iso8601
 from pipecat.utils.tracing.service_decorators import traced_stt
 
+_PROCESS_START_MONOTONIC = time.monotonic()
+
+
+def _read_first_numeric_file(paths):
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as file:
+                value = file.read().strip()
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+
+        if not value or value == "max":
+            return None
+
+        try:
+            return int(value)
+        except ValueError:
+            continue
+    return None
+
+
+def _read_proc_status_value(key):
+    try:
+        with open("/proc/self/status", "r", encoding="utf-8") as status_file:
+            for line in status_file:
+                if line.startswith(key):
+                    parts = line.split()
+                    if len(parts) >= 2:
+                        return int(parts[1]) * 1024  # kB -> bytes
+    except FileNotFoundError:
+        return None
+    except OSError:
+        return None
+    return None
+
+
+def _read_cpu_throttling():
+    paths = ["/sys/fs/cgroup/cpu.stat", "/sys/fs/cgroup/cpu/cpu.stat"]
+    for path in paths:
+        try:
+            with open(path, "r", encoding="utf-8") as cpu_file:
+                for line in cpu_file:
+                    if line.startswith("nr_throttled"):
+                        parts = line.split()
+                        if len(parts) >= 2:
+                            return int(parts[1])
+        except FileNotFoundError:
+            continue
+        except OSError:
+            continue
+    return None
+
+
+def _collect_runtime_diagnostics(
+    loop: Optional[asyncio.AbstractEventLoop] = None,
+    extra_context: Optional[Dict] = None,
+    context_provider: Optional[Callable[[], Dict]] = None,
+):
+    if loop is None:
+        try:
+            loop = asyncio.get_running_loop()
+        except RuntimeError:
+            loop = None
+
+    uptime_s = round(time.monotonic() - _PROCESS_START_MONOTONIC, 1)
+    rss_bytes = _read_proc_status_value("VmRSS:")
+    rss_mb = round(rss_bytes / (1024**2), 2) if rss_bytes else None
+
+    cgroup_usage_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.current", "/sys/fs/cgroup/memory/memory.usage_in_bytes"]
+    )
+    cgroup_limit_bytes = _read_first_numeric_file(
+        ["/sys/fs/cgroup/memory.max", "/sys/fs/cgroup/memory/memory.limit_in_bytes"]
+    )
+    cgroup_usage_mb = (
+        round(cgroup_usage_bytes / (1024**2), 2) if cgroup_usage_bytes is not None else None
+    )
+    cgroup_limit_mb = (
+        round(cgroup_limit_bytes / (1024**2), 2) if cgroup_limit_bytes not in (None, 0) else None
+    )
+    cgroup_pct = (
+        round(cgroup_usage_bytes / cgroup_limit_bytes * 100, 2)
+        if cgroup_usage_bytes is not None and cgroup_limit_bytes not in (None, 0)
+        else None
+    )
+
+    try:
+        open_fds = len(os.listdir("/proc/self/fd"))
+    except Exception:
+        open_fds = None
+
+    pending_tasks = None
+    if loop:
+        try:
+            pending_tasks = len(asyncio.all_tasks(loop))
+        except Exception:
+            pending_tasks = None
+
+    suspected_cause = "unknown"
+    if cgroup_pct and cgroup_pct >= 90:
+        suspected_cause = "memory_pressure"
+    elif uptime_s < 180:
+        suspected_cause = "pod_cold_start"
+
+    diagnostics = {
+        "uptime_s": uptime_s,
+        "rss_mb": rss_mb,
+        "cgroup_usage_mb": cgroup_usage_mb,
+        "cgroup_limit_mb": cgroup_limit_mb,
+        "cgroup_usage_pct": cgroup_pct,
+        "open_fds": open_fds,
+        "pending_tasks": pending_tasks,
+        "suspected_cause": suspected_cause,
+    }
+    cpu_throttled = _read_cpu_throttling()
+    if cpu_throttled is not None:
+        diagnostics["cpu_nr_throttled"] = cpu_throttled
+
+    if context_provider:
+        try:
+            ctx = context_provider() or {}
+            if isinstance(ctx, dict):
+                diagnostics.update({k: v for k, v in ctx.items() if v is not None})
+        except Exception as exc:
+            diagnostics["context_provider_error"] = str(exc)
+
+    if extra_context:
+        diagnostics.update({k: v for k, v in extra_context.items() if v is not None})
+
+    return {k: v for k, v in diagnostics.items() if v is not None}
+
+
+def _derive_connect_endpoint(base_url: str):
+    if not base_url:
+        return "api.deepgram.com", 443
+
+    parsed = urlparse(base_url)
+    host = parsed.hostname or "api.deepgram.com"
+    if parsed.port:
+        port = parsed.port
+    elif parsed.scheme in ("https", "wss"):
+        port = 443
+    else:
+        port = 80
+    return host, port
+
+
 try:
     from deepgram import (
         AsyncListenWebSocketClient,
@@ -62,6 +215,9 @@ class DeepgramSTTService(STTService):
         sample_rate: Optional[int] = None,
         live_options: Optional[LiveOptions] = None,
         addons: Optional[Dict] = None,
+        max_connect_retries: int = 3,
+        connect_timeout_s: float = 2.5,
+        diagnostics_context_provider: Optional[Callable[[], Dict]] = None,
         **kwargs,
     ):
         """Initialize the Deepgram STT service.
@@ -77,6 +233,12 @@ class DeepgramSTTService(STTService):
             sample_rate: Audio sample rate. If None, uses default or live_options value.
             live_options: Deepgram LiveOptions for detailed configuration.
             addons: Additional Deepgram features to enable.
+            max_connect_retries: Maximum number of connection attempts before giving up.
+            connect_timeout_s: Maximum time in seconds to wait for a connection attempt.
+                Connection retries wait 100ms between attempts.
+            diagnostics_context_provider: Optional callable returning a dict with
+                additional runtime diagnostics (e.g., active call counts) to append
+                to warning logs.
             **kwargs: Additional arguments passed to the parent STTService.
         """
         sample_rate = sample_rate or (live_options.sample_rate if live_options else None)
@@ -120,10 +282,11 @@ class DeepgramSTTService(STTService):
         self.set_model_name(merged_options["model"])
         self._settings = merged_options
         self._addons = addons
+        self._diagnostics_context_provider = diagnostics_context_provider
 
-        # Connection retry settings
-        self._max_connect_retries =
-        self.
+        # Connection retry settings (100ms delay between retries)
+        self._max_connect_retries = max_connect_retries
+        self._connect_timeout_s = connect_timeout_s
 
         self._client = DeepgramClient(
             api_key,
@@ -131,12 +294,13 @@ class DeepgramSTTService(STTService):
                 url=base_url,
                 options={
                     "keepalive": "true",
-
-                    #
+                    # Note: Connection timeout is enforced by asyncio.wait_for() in _connect()
+                    # with the connect_timeout_s parameter (default 2.0s)
                 },
                 verbose=logging.ERROR,  # Enable error level and above logging
            ),
        )
+        self._connect_host, self._connect_port = _derive_connect_endpoint(base_url)
 
        if self.vad_enabled:
            self._register_event_handler("on_speech_started")
@@ -224,9 +388,18 @@ class DeepgramSTTService(STTService):
 
     async def _connect(self):
         self.logger.debug("Attempting to connect to Deepgram...")
+        await self.start_connection_metrics()
 
+        loop = asyncio.get_running_loop()
         for attempt in range(self._max_connect_retries):
+            attempt_started = time.perf_counter()
+            dns_ms = await self._measure_dns_resolution(loop)
             try:
+                # Clean up any previous connection attempt in background (non-blocking)
+                if hasattr(self, "_connection") and self._connection is not None:
+                    old_conn = self._connection
+                    asyncio.create_task(self._cleanup_abandoned_connection(old_conn))
+
                 # Create a new connection object for a clean attempt
                 self._connection: AsyncListenWebSocketClient = self._client.listen.asyncwebsocket.v(
                     "1"
@@ -250,31 +423,139 @@ class DeepgramSTTService(STTService):
                     self._on_utterance_end,
                 )
 
-
-
-
-
+                try:
+                    start_result = await asyncio.wait_for(
+                        self._connection.start(options=self._settings, addons=self._addons),
+                        timeout=self._connect_timeout_s,
+                    )
+                except asyncio.TimeoutError:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
+                    self.logger.warning(
+                        (
+                            "Deepgram connection attempt {}/{} timed out after {:.2f} second(s). "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        self._connect_timeout_s,
+                        diagnostics,
+                    )
+                    start_result = False
+                except Exception as start_error:
+                    elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                    diagnostics = _collect_runtime_diagnostics(
+                        loop,
+                        extra_context={
+                            "dns_ms": dns_ms,
+                            "connect_duration_ms": elapsed_ms,
+                        },
+                        context_provider=self._diagnostics_context_provider,
+                    )
+                    self.logger.warning(
+                        (
+                            "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                            "runtime_diagnostics={}"
+                        ),
+                        attempt + 1,
+                        self._max_connect_retries,
+                        start_error,
+                        diagnostics,
+                    )
+                    start_result = False
+                else:
+                    if start_result:
+                        elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                        diagnostics = _collect_runtime_diagnostics(
+                            loop,
+                            extra_context={
+                                "dns_ms": dns_ms,
+                                "connect_duration_ms": elapsed_ms,
+                            },
+                            context_provider=self._diagnostics_context_provider,
+                        )
+                        self.logger.info(
+                            (
+                                "Successfully connected to Deepgram on attempt {} in {:.2f} ms. "
+                                "runtime_diagnostics={}"
+                            ),
+                            attempt + 1,
+                            elapsed_ms,
+                            diagnostics,
+                        )
+                        await self.stop_connection_metrics(success=True, connection_type="websocket")
+                        await self.stop_reconnection_metrics(success=True, reason="successful_reconnection")
+                        return  # Exit the method on success
 
                 self.logger.warning(
                     f"Deepgram connection attempt {attempt + 1}/{self._max_connect_retries} failed."
                 )
 
             except Exception as e:
+                elapsed_ms = round((time.perf_counter() - attempt_started) * 1000, 2)
+                diagnostics = _collect_runtime_diagnostics(
+                    loop,
+                    extra_context={
+                        "dns_ms": dns_ms,
+                        "connect_duration_ms": elapsed_ms,
+                    },
+                    context_provider=self._diagnostics_context_provider,
+                )
                 self.logger.warning(
-
+                    (
+                        "Deepgram connection attempt {}/{} failed with an exception: {}. "
+                        "runtime_diagnostics={}"
+                    ),
+                    attempt + 1,
+                    self._max_connect_retries,
+                    e,
+                    diagnostics,
                 )
 
-            # If this is not the last attempt, wait
+            # If this is not the last attempt, wait 100ms before retrying
            if attempt < self._max_connect_retries - 1:
-                self.logger.info(
-                await asyncio.sleep(
+                self.logger.info("Retrying in 0.1 second(s)...")
+                await asyncio.sleep(0.1)
 
-
+        error_msg = (
            f"{self}: unable to connect to Deepgram after {self._max_connect_retries} attempts."
        )
+        await self.stop_connection_metrics(
+            success=False,
+            error=f"Failed after {self._max_connect_retries} attempts",
+            connection_type="websocket"
+        )
+        await self.stop_reconnection_metrics(success=False, reason="max_retries_exceeded")
+        self.logger.error(error_msg)
+        await self.push_error(ErrorFrame(error_msg, fatal=True))
+
+    async def _measure_dns_resolution(self, loop: Optional[asyncio.AbstractEventLoop]):
+        if not loop or not self._connect_host:
+            return None
+        try:
+            dns_task = loop.getaddrinfo(
+                self._connect_host,
+                self._connect_port,
+                type=socket.SOCK_STREAM,
+                proto=socket.IPPROTO_TCP,
+            )
+            start = time.perf_counter()
+            await asyncio.wait_for(dns_task, timeout=1.0)
+            return round((time.perf_counter() - start) * 1000, 2)
+        except Exception:
+            return None
 
     async def _disconnect(self):
-
+        # Guard against missing connection instance and ensure proper async check
+        connection: AsyncListenWebSocketClient = getattr(self, "_connection", None)
+        if connection and await connection.is_connected():
             self.logger.debug("Disconnecting from Deepgram")
             # Deepgram swallows asyncio.CancelledError internally which prevents
             # proper cancellation propagation. This issue was found with
@@ -284,7 +565,25 @@ class DeepgramSTTService(STTService):
             # Deepgram disconnection was still finishing and therefore
             # preventing the task cancellation that occurs during `cleanup()`.
             # GH issue: https://github.com/deepgram/deepgram-python-sdk/issues/570
-            await
+            await connection.finish()
+
+    async def _cleanup_abandoned_connection(self, conn: AsyncListenWebSocketClient):
+        """Clean up abandoned connection attempt in background (non-blocking).
+
+        This prevents zombie connections from triggering spurious error events
+        when they eventually timeout and call _on_error().
+
+        Args:
+            conn: The abandoned connection object to clean up.
+        """
+        try:
+            # Try to finish with short timeout
+            await asyncio.wait_for(conn.finish(), timeout=5)
+            self.logger.debug("Successfully cleaned up abandoned connection")
+        except Exception as e:
+            # Ignore all cleanup errors - connection might not be fully started
+            # This is expected and fine - we just want best-effort cleanup
+            self.logger.debug(f"Abandoned connection cleanup failed: {e}")
 
     async def start_metrics(self):
         """Start TTFB and processing metrics collection."""
@@ -299,6 +598,7 @@ class DeepgramSTTService(STTService):
         # NOTE(aleix): we don't disconnect (i.e. call finish on the connection)
         # because this triggers more errors internally in the Deepgram SDK. So,
         # we just forget about the previous connection and create a new one.
+        await self.start_reconnection_metrics()
         await self._connect()
 
     async def _on_speech_started(self, *args, **kwargs):
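The Deepgram changes expose the previously hard-coded retry behavior as constructor arguments and attach runtime diagnostics to connection logs. A minimal usage sketch follows, using only the parameters shown in the diff; the API key and the diagnostics provider below are placeholders.

# Hypothetical usage sketch (not part of the diff).
from pipecat.services.deepgram.stt import DeepgramSTTService

def call_context() -> dict:
    # Placeholder provider: any non-None values returned here are merged into
    # the runtime_diagnostics dict included in connection warning/info logs.
    return {"active_calls": 2}

stt = DeepgramSTTService(
    api_key="YOUR_DEEPGRAM_API_KEY",   # placeholder
    max_connect_retries=3,             # attempts before a fatal ErrorFrame is pushed
    connect_timeout_s=2.5,             # per-attempt timeout enforced via asyncio.wait_for()
    diagnostics_context_provider=call_context,
)

On repeated failures the service waits 100 ms between attempts, records connection and reconnection metrics, and pushes a fatal ErrorFrame after the final attempt, as shown in the _connect() hunk above.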