dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of dv-pipecat-ai might be problematic. Click here for more details.
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
- pipecat/adapters/base_llm_adapter.py +38 -1
- pipecat/adapters/services/anthropic_adapter.py +9 -14
- pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
- pipecat/adapters/services/bedrock_adapter.py +236 -13
- pipecat/adapters/services/gemini_adapter.py +12 -8
- pipecat/adapters/services/open_ai_adapter.py +19 -7
- pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
- pipecat/audio/dtmf/dtmf-0.wav +0 -0
- pipecat/audio/dtmf/dtmf-1.wav +0 -0
- pipecat/audio/dtmf/dtmf-2.wav +0 -0
- pipecat/audio/dtmf/dtmf-3.wav +0 -0
- pipecat/audio/dtmf/dtmf-4.wav +0 -0
- pipecat/audio/dtmf/dtmf-5.wav +0 -0
- pipecat/audio/dtmf/dtmf-6.wav +0 -0
- pipecat/audio/dtmf/dtmf-7.wav +0 -0
- pipecat/audio/dtmf/dtmf-8.wav +0 -0
- pipecat/audio/dtmf/dtmf-9.wav +0 -0
- pipecat/audio/dtmf/dtmf-pound.wav +0 -0
- pipecat/audio/dtmf/dtmf-star.wav +0 -0
- pipecat/audio/filters/krisp_viva_filter.py +193 -0
- pipecat/audio/filters/noisereduce_filter.py +15 -0
- pipecat/audio/turn/base_turn_analyzer.py +9 -1
- pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
- pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
- pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
- pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
- pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
- pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
- pipecat/audio/vad/data/README.md +10 -0
- pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
- pipecat/audio/vad/silero.py +9 -3
- pipecat/audio/vad/vad_analyzer.py +13 -1
- pipecat/extensions/voicemail/voicemail_detector.py +5 -5
- pipecat/frames/frames.py +277 -86
- pipecat/observers/loggers/debug_log_observer.py +3 -3
- pipecat/observers/loggers/llm_log_observer.py +7 -3
- pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
- pipecat/pipeline/runner.py +18 -6
- pipecat/pipeline/service_switcher.py +64 -36
- pipecat/pipeline/task.py +125 -79
- pipecat/pipeline/tts_switcher.py +30 -0
- pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
- pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
- pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
- pipecat/processors/aggregators/llm_context.py +40 -2
- pipecat/processors/aggregators/llm_response.py +32 -15
- pipecat/processors/aggregators/llm_response_universal.py +19 -15
- pipecat/processors/aggregators/user_response.py +6 -6
- pipecat/processors/aggregators/vision_image_frame.py +24 -2
- pipecat/processors/audio/audio_buffer_processor.py +43 -8
- pipecat/processors/dtmf_aggregator.py +174 -77
- pipecat/processors/filters/stt_mute_filter.py +17 -0
- pipecat/processors/frame_processor.py +110 -24
- pipecat/processors/frameworks/langchain.py +8 -2
- pipecat/processors/frameworks/rtvi.py +210 -68
- pipecat/processors/frameworks/strands_agents.py +170 -0
- pipecat/processors/logger.py +2 -2
- pipecat/processors/transcript_processor.py +26 -5
- pipecat/processors/user_idle_processor.py +35 -11
- pipecat/runner/daily.py +59 -20
- pipecat/runner/run.py +395 -93
- pipecat/runner/types.py +6 -4
- pipecat/runner/utils.py +51 -10
- pipecat/serializers/__init__.py +5 -1
- pipecat/serializers/asterisk.py +16 -2
- pipecat/serializers/convox.py +41 -4
- pipecat/serializers/custom.py +257 -0
- pipecat/serializers/exotel.py +5 -5
- pipecat/serializers/livekit.py +20 -0
- pipecat/serializers/plivo.py +5 -5
- pipecat/serializers/protobuf.py +6 -5
- pipecat/serializers/telnyx.py +2 -2
- pipecat/serializers/twilio.py +43 -23
- pipecat/serializers/vi.py +324 -0
- pipecat/services/ai_service.py +2 -6
- pipecat/services/anthropic/llm.py +2 -25
- pipecat/services/assemblyai/models.py +6 -0
- pipecat/services/assemblyai/stt.py +13 -5
- pipecat/services/asyncai/tts.py +5 -3
- pipecat/services/aws/__init__.py +1 -0
- pipecat/services/aws/llm.py +147 -105
- pipecat/services/aws/nova_sonic/__init__.py +0 -0
- pipecat/services/aws/nova_sonic/context.py +436 -0
- pipecat/services/aws/nova_sonic/frames.py +25 -0
- pipecat/services/aws/nova_sonic/llm.py +1265 -0
- pipecat/services/aws/stt.py +3 -3
- pipecat/services/aws_nova_sonic/__init__.py +19 -1
- pipecat/services/aws_nova_sonic/aws.py +11 -1151
- pipecat/services/aws_nova_sonic/context.py +8 -354
- pipecat/services/aws_nova_sonic/frames.py +13 -17
- pipecat/services/azure/llm.py +51 -1
- pipecat/services/azure/realtime/__init__.py +0 -0
- pipecat/services/azure/realtime/llm.py +65 -0
- pipecat/services/azure/stt.py +15 -0
- pipecat/services/cartesia/stt.py +77 -70
- pipecat/services/cartesia/tts.py +80 -13
- pipecat/services/deepgram/__init__.py +1 -0
- pipecat/services/deepgram/flux/__init__.py +0 -0
- pipecat/services/deepgram/flux/stt.py +640 -0
- pipecat/services/elevenlabs/__init__.py +4 -1
- pipecat/services/elevenlabs/stt.py +339 -0
- pipecat/services/elevenlabs/tts.py +87 -46
- pipecat/services/fish/tts.py +5 -2
- pipecat/services/gemini_multimodal_live/events.py +38 -524
- pipecat/services/gemini_multimodal_live/file_api.py +23 -173
- pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
- pipecat/services/gladia/stt.py +56 -72
- pipecat/services/google/__init__.py +1 -0
- pipecat/services/google/gemini_live/__init__.py +3 -0
- pipecat/services/google/gemini_live/file_api.py +189 -0
- pipecat/services/google/gemini_live/llm.py +1582 -0
- pipecat/services/google/gemini_live/llm_vertex.py +184 -0
- pipecat/services/google/llm.py +15 -11
- pipecat/services/google/llm_openai.py +3 -3
- pipecat/services/google/llm_vertex.py +86 -16
- pipecat/services/google/stt.py +4 -0
- pipecat/services/google/tts.py +7 -3
- pipecat/services/heygen/api.py +2 -0
- pipecat/services/heygen/client.py +8 -4
- pipecat/services/heygen/video.py +2 -0
- pipecat/services/hume/__init__.py +5 -0
- pipecat/services/hume/tts.py +220 -0
- pipecat/services/inworld/tts.py +6 -6
- pipecat/services/llm_service.py +15 -5
- pipecat/services/lmnt/tts.py +4 -2
- pipecat/services/mcp_service.py +4 -2
- pipecat/services/mem0/memory.py +6 -5
- pipecat/services/mistral/llm.py +29 -8
- pipecat/services/moondream/vision.py +42 -16
- pipecat/services/neuphonic/tts.py +5 -2
- pipecat/services/openai/__init__.py +1 -0
- pipecat/services/openai/base_llm.py +27 -20
- pipecat/services/openai/realtime/__init__.py +0 -0
- pipecat/services/openai/realtime/context.py +272 -0
- pipecat/services/openai/realtime/events.py +1106 -0
- pipecat/services/openai/realtime/frames.py +37 -0
- pipecat/services/openai/realtime/llm.py +829 -0
- pipecat/services/openai/tts.py +49 -10
- pipecat/services/openai_realtime/__init__.py +27 -0
- pipecat/services/openai_realtime/azure.py +21 -0
- pipecat/services/openai_realtime/context.py +21 -0
- pipecat/services/openai_realtime/events.py +21 -0
- pipecat/services/openai_realtime/frames.py +21 -0
- pipecat/services/openai_realtime_beta/azure.py +16 -0
- pipecat/services/openai_realtime_beta/openai.py +17 -5
- pipecat/services/piper/tts.py +7 -9
- pipecat/services/playht/tts.py +34 -4
- pipecat/services/rime/tts.py +12 -12
- pipecat/services/riva/stt.py +3 -1
- pipecat/services/salesforce/__init__.py +9 -0
- pipecat/services/salesforce/llm.py +700 -0
- pipecat/services/sarvam/__init__.py +7 -0
- pipecat/services/sarvam/stt.py +540 -0
- pipecat/services/sarvam/tts.py +97 -13
- pipecat/services/simli/video.py +2 -2
- pipecat/services/speechmatics/stt.py +22 -10
- pipecat/services/stt_service.py +47 -0
- pipecat/services/tavus/video.py +2 -2
- pipecat/services/tts_service.py +75 -22
- pipecat/services/vision_service.py +7 -6
- pipecat/services/vistaar/llm.py +51 -9
- pipecat/tests/utils.py +4 -4
- pipecat/transcriptions/language.py +41 -1
- pipecat/transports/base_input.py +13 -34
- pipecat/transports/base_output.py +140 -104
- pipecat/transports/daily/transport.py +199 -26
- pipecat/transports/heygen/__init__.py +0 -0
- pipecat/transports/heygen/transport.py +381 -0
- pipecat/transports/livekit/transport.py +228 -63
- pipecat/transports/local/audio.py +6 -1
- pipecat/transports/local/tk.py +11 -2
- pipecat/transports/network/fastapi_websocket.py +1 -1
- pipecat/transports/smallwebrtc/connection.py +103 -19
- pipecat/transports/smallwebrtc/request_handler.py +246 -0
- pipecat/transports/smallwebrtc/transport.py +65 -23
- pipecat/transports/tavus/transport.py +23 -12
- pipecat/transports/websocket/client.py +41 -5
- pipecat/transports/websocket/fastapi.py +21 -11
- pipecat/transports/websocket/server.py +14 -7
- pipecat/transports/whatsapp/api.py +8 -0
- pipecat/transports/whatsapp/client.py +47 -0
- pipecat/utils/base_object.py +54 -22
- pipecat/utils/redis.py +58 -0
- pipecat/utils/string.py +13 -1
- pipecat/utils/tracing/service_decorators.py +21 -21
- pipecat/serializers/genesys.py +0 -95
- pipecat/services/google/test-google-chirp.py +0 -45
- pipecat/services/openai.py +0 -698
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
- {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
- /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
|
@@ -0,0 +1,640 @@
|
|
|
1
|
+
#
|
|
2
|
+
# Copyright (c) 2024–2025, Daily
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: BSD 2-Clause License
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Deepgram Flux speech-to-text service implementation."""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from typing import Any, AsyncGenerator, Dict, Optional
|
|
12
|
+
|
|
13
|
+
from loguru import logger
|
|
14
|
+
from pydantic import BaseModel
|
|
15
|
+
|
|
16
|
+
from pipecat.frames.frames import (
|
|
17
|
+
CancelFrame,
|
|
18
|
+
EndFrame,
|
|
19
|
+
ErrorFrame,
|
|
20
|
+
Frame,
|
|
21
|
+
InterimTranscriptionFrame,
|
|
22
|
+
StartFrame,
|
|
23
|
+
TranscriptionFrame,
|
|
24
|
+
UserStartedSpeakingFrame,
|
|
25
|
+
UserStoppedSpeakingFrame,
|
|
26
|
+
)
|
|
27
|
+
from pipecat.processors.frame_processor import FrameDirection
|
|
28
|
+
from pipecat.services.stt_service import WebsocketSTTService
|
|
29
|
+
from pipecat.transcriptions.language import Language
|
|
30
|
+
from pipecat.utils.time import time_now_iso8601
|
|
31
|
+
from pipecat.utils.tracing.service_decorators import traced_stt
|
|
32
|
+
|
|
33
|
+
try:
|
|
34
|
+
from websockets.asyncio.client import connect as websocket_connect
|
|
35
|
+
from websockets.protocol import State
|
|
36
|
+
except ModuleNotFoundError as e:
|
|
37
|
+
logger.error(f"Exception: {e}")
|
|
38
|
+
logger.error("In order to use Deepgram Flux, you need to `pip install pipecat-ai[deepgram]`.")
|
|
39
|
+
raise Exception(f"Missing module: {e}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class FluxMessageType(str, Enum):
    """Top-level message types delivered by the Deepgram Flux WebSocket.

    Every message received from the Flux connection carries one of these
    values in its "type" field.
    """

    RECEIVE_CONNECTED = "Connected"
    RECEIVE_FATAL_ERROR = "Error"
    TURN_INFO = "TurnInfo"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FluxEventType(str, Enum):
    """Event types carried inside Deepgram Flux TurnInfo messages.

    Each value marks a distinct stage of speech processing and turn
    detection within a TurnInfo payload.
    """

    START_OF_TURN = "StartOfTurn"
    TURN_RESUMED = "TurnResumed"
    END_OF_TURN = "EndOfTurn"
    EAGER_END_OF_TURN = "EagerEndOfTurn"
    UPDATE = "Update"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class DeepgramFluxSTTService(WebsocketSTTService):
|
|
69
|
+
"""Deepgram Flux speech-to-text service.
|
|
70
|
+
|
|
71
|
+
Provides real-time speech recognition using Deepgram's WebSocket API with Flux capabilities.
|
|
72
|
+
Supports configurable models, VAD events, and various audio processing options
|
|
73
|
+
including advanced turn detection and EagerEndOfTurn events for improved conversational AI performance.
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
class InputParams(BaseModel):
    """Connection parameters for the Deepgram Flux API.

    Parameters:
        eager_eot_threshold: Enables EagerEndOfTurn/TurnResumed events when
            set to a valid value (they are off by default). Lower values
            respond faster but trigger more speculative LLM calls; higher
            values are more conservative.
        eot_threshold: End-of-turn confidence required to finish a turn
            (server default 0.7). Lower values end turns sooner; higher
            values wait for more complete utterances.
        eot_timeout_ms: Milliseconds after speech at which a turn finishes
            regardless of EOT confidence (server default 5000).
        keyterm: Keyterms to boost recognition of specialized terminology.
        mip_opt_out: Opts the request out of the Deepgram Model Improvement
            Program (server default False).
        tag: Labels attached to requests for identification in usage
            reporting.
    """

    eager_eot_threshold: Optional[float] = None
    eot_threshold: Optional[float] = None
    eot_timeout_ms: Optional[int] = None
    # NOTE(review): pydantic deep-copies mutable defaults per instance, so
    # the shared-list pitfall of plain Python defaults does not apply here.
    keyterm: list = []
    mip_opt_out: Optional[bool] = None
    tag: list = []
|
|
104
|
+
|
|
105
|
+
def __init__(
    self,
    *,
    api_key: str,
    url: str = "wss://api.deepgram.com/v2/listen",
    sample_rate: Optional[int] = None,
    model: str = "flux-general-en",
    flux_encoding: str = "linear16",
    params: Optional[InputParams] = None,
    **kwargs,
):
    """Initialize the Deepgram Flux STT service.

    Args:
        api_key: Deepgram API key used for authentication.
        url: WebSocket endpoint for the Deepgram Flux API.
        sample_rate: Audio sample rate in Hz; when None, resolution is
            deferred to the parent service.
        model: Flux model used for transcription.
        flux_encoding: Audio encoding sent to Flux. "linear16" is raw
            signed little-endian 16-bit PCM, the format Flux requires.
        params: Detailed API configuration; defaults are used when None.
        **kwargs: Forwarded to the parent WebsocketSTTService.

    Examples:
        Basic usage with default parameters::

            stt = DeepgramFluxSTTService(api_key="your-api-key")

        Advanced usage with custom parameters::

            params = DeepgramFluxSTTService.InputParams(
                eager_eot_threshold=0.5,
                eot_threshold=0.8,
                keyterm=["AI", "machine learning", "neural network"],
                tag=["production", "voice-agent"],
            )
            stt = DeepgramFluxSTTService(
                api_key="your-api-key",
                model="flux-general-en",
                params=params,
            )
    """
    super().__init__(sample_rate=sample_rate, **kwargs)

    self._api_key = api_key
    self._url = url
    self._model = model
    self._params = params or DeepgramFluxSTTService.InputParams()
    self._flux_encoding = flux_encoding
    # Flux currently supports English only.
    self._language = Language.EN
    # Built in start() once the sample rate is known.
    self._websocket_url = None
    self._receive_task = None
|
|
159
|
+
|
|
160
|
+
async def _connect(self):
    """Open the WebSocket and spawn the background receive task.

    The receive task is created only once a connection exists and no
    previous task is still registered.
    """
    await self._connect_websocket()

    if self._websocket and not self._receive_task:
        self._receive_task = self.create_task(
            self._receive_task_handler(self._report_error)
        )
|
|
170
|
+
|
|
171
|
+
async def _disconnect(self):
    """Tear down the receive task and close the WebSocket.

    The background receive task is cancelled before the socket closes so
    it never observes a half-closed connection; the websocket reference
    is cleared last, regardless of errors.
    """
    try:
        # Cancel background tasks BEFORE closing the websocket.
        pending = self._receive_task
        if pending:
            await self.cancel_task(pending, timeout=2.0)
            self._receive_task = None

        await self._disconnect_websocket()
    except Exception as e:
        logger.error(f"Error during disconnect: {e}")
    finally:
        # Reset state only after everything is cleaned up.
        self._websocket = None
|
|
191
|
+
|
|
192
|
+
async def _connect_websocket(self):
    """Establish the WebSocket connection to the Flux API.

    No-op when a connection is already open. On failure the websocket
    reference is cleared and the "on_connection_error" handler fires
    instead of the exception propagating.
    """
    try:
        already_open = self._websocket and self._websocket.state is State.OPEN
        if already_open:
            return

        self._websocket = await websocket_connect(
            self._websocket_url,
            additional_headers={"Authorization": f"Token {self._api_key}"},
        )
        logger.debug("Connected to Deepgram Flux Websocket")
        await self._call_event_handler("on_connected")
    except Exception as e:
        logger.error(f"{self} initialization error: {e}")
        self._websocket = None
        await self._call_event_handler("on_connection_error", f"{e}")
|
|
213
|
+
|
|
214
|
+
async def _disconnect_websocket(self):
    """Close the WebSocket after signaling end-of-stream.

    Stops all metrics, tells the server no more audio is coming, closes
    the socket, and always clears state and fires "on_disconnected".
    """
    try:
        await self.stop_all_metrics()

        if self._websocket:
            await self._send_close_stream()
            logger.debug("Disconnecting from Deepgram Flux Websocket")
            await self._websocket.close()
    except Exception as e:
        logger.error(f"{self} error closing websocket: {e}")
    finally:
        self._websocket = None
        await self._call_event_handler("on_disconnected")
|
|
232
|
+
|
|
233
|
+
async def _send_close_stream(self) -> None:
    """Send a CloseStream control message to the Flux API.

    Signals to the server that no more audio data will be sent. Does
    nothing when no connection is open.
    """
    if not self._websocket:
        return
    logger.debug("Sending CloseStream message to Deepgram Flux")
    await self._websocket.send(json.dumps({"type": "CloseStream"}))
|
|
242
|
+
|
|
243
|
+
def can_generate_metrics(self) -> bool:
    """Report whether this service produces processing metrics.

    Returns:
        Always True; the Deepgram Flux service supports metrics.
    """
    return True
|
|
250
|
+
|
|
251
|
+
async def start(self, frame: StartFrame):
    """Start the Deepgram Flux STT service.

    Builds the WebSocket URL from the configured model, sample rate,
    encoding, and optional tuning parameters, then opens the connection.

    Args:
        frame: The start frame containing initialization parameters and
            metadata.
    """
    # Local stdlib import so this fix is self-contained.
    from urllib.parse import quote

    await super().start(frame)

    url_params = [
        f"model={self._model}",
        f"sample_rate={self.sample_rate}",
        f"encoding={self._flux_encoding}",
    ]

    if self._params.eager_eot_threshold is not None:
        url_params.append(f"eager_eot_threshold={self._params.eager_eot_threshold}")

    if self._params.eot_threshold is not None:
        url_params.append(f"eot_threshold={self._params.eot_threshold}")

    if self._params.eot_timeout_ms is not None:
        url_params.append(f"eot_timeout_ms={self._params.eot_timeout_ms}")

    if self._params.mip_opt_out is not None:
        url_params.append(f"mip_opt_out={str(self._params.mip_opt_out).lower()}")

    # keyterm/tag may repeat and can contain spaces or reserved characters
    # (e.g. "machine learning"), so percent-encode each value to keep the
    # query string valid.
    for keyterm in self._params.keyterm:
        url_params.append(f"keyterm={quote(str(keyterm), safe='')}")

    for tag_value in self._params.tag:
        url_params.append(f"tag={quote(str(tag_value), safe='')}")

    self._websocket_url = f"{self._url}?{'&'.join(url_params)}"
    await self._connect()
|
|
290
|
+
|
|
291
|
+
async def stop(self, frame: EndFrame):
    """Stop the service and disconnect from Deepgram Flux.

    Args:
        frame: The end frame.
    """
    await super().stop(frame)
    await self._disconnect()
|
|
299
|
+
|
|
300
|
+
async def cancel(self, frame: CancelFrame):
    """Cancel the service and disconnect from Deepgram Flux.

    Args:
        frame: The cancel frame.
    """
    await super().cancel(frame)
    await self._disconnect()
|
|
308
|
+
|
|
309
|
+
async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
    """Stream raw audio bytes to Deepgram Flux for transcription.

    Transcription results arrive asynchronously on the background
    receive task, so this method only pushes audio and never yields
    transcription frames directly.

    Args:
        audio: Raw audio bytes in linear16 format (signed little-endian
            16-bit PCM).

    Yields:
        Frame: An ErrorFrame when not connected or the send fails;
            otherwise None.
    """
    if not self._websocket:
        disconnected = "Not connected to Deepgram Flux."
        logger.error(disconnected)
        yield ErrorFrame(disconnected, fatal=True)
        return

    try:
        await self._websocket.send(audio)
    except Exception as e:
        send_failure = f"Failed to send audio to Flux: {e}"
        logger.error(send_failure)
        yield ErrorFrame(send_failure)
        return

    yield None
|
|
340
|
+
|
|
341
|
+
async def start_metrics(self):
    """Start processing metrics for the current turn.

    TTFB metrics are deliberately not started: TTFB should measure the
    time from when the user starts speaking until the first transcript,
    but Flux delivers the start-of-turn event and the first transcript
    simultaneously, which would make the measurement meaningless.
    """
    await self.start_processing_metrics()
|
|
350
|
+
|
|
351
|
+
@traced_stt
async def _handle_transcription(
    self, transcript: str, is_final: bool, language: Optional[Language] = None
):
    """No-op hook whose sole purpose is the @traced_stt tracing side effect."""
    pass
|
|
357
|
+
|
|
358
|
+
def _get_websocket(self):
    """Return the active WebSocket connection.

    Returns:
        The currently open WebSocket connection instance.

    Raises:
        Exception: If no WebSocket connection is currently active.
    """
    if not self._websocket:
        raise Exception("Websocket not connected")
    return self._websocket
|
|
373
|
+
|
|
374
|
+
def _validate_message(self, data: Dict[str, Any]) -> bool:
    """Check that a decoded Flux message is a dict carrying a "type" field.

    Args:
        data: The parsed JSON payload to validate.

    Returns:
        True when the message is well-formed enough to dispatch,
        False otherwise.
    """
    if not isinstance(data, dict):
        logger.warning("Message is not a dictionary")
        return False

    if "type" not in data:
        logger.warning("Message missing 'type' field")
        return False

    return True
|
|
394
|
+
|
|
395
|
+
async def _receive_messages(self):
    """Consume and dispatch messages from the Flux WebSocket.

    Iterates over incoming messages, decoding JSON text frames and
    routing them to the message handler. Non-string frames are logged
    and skipped, malformed JSON is skipped, and any other processing
    error propagates so the parent service's receive task handler can
    deal with it.
    """
    async for message in self._get_websocket():
        if not isinstance(message, str):
            logger.warning(f"Received non-string message: {type(message)}")
            continue

        try:
            payload = json.loads(message)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to decode JSON message: {e}")
            # Skip malformed messages
            continue

        try:
            await self._handle_message(payload)
        except Exception as e:
            logger.error(f"Error processing message: {e}")
            # Error will be handled inside WebsocketService->_receive_task_handler
            raise
|
|
418
|
+
|
|
419
|
+
async def _handle_message(self, data: Dict[str, Any]):
    """Route a parsed WebSocket message to the matching handler.

    Validates the message structure first, then dispatches on the
    message's "type" field; unknown types are logged at debug level
    and ignored.

    Args:
        data: The parsed JSON message data from the WebSocket.
    """
    if not self._validate_message(data):
        return

    message_type = data.get("type")

    try:
        known_type = FluxMessageType(message_type)
    except ValueError:
        logger.debug(f"Unhandled message type: {message_type or 'unknown'}")
        return

    if known_type is FluxMessageType.RECEIVE_CONNECTED:
        await self._handle_connection_established()
    elif known_type is FluxMessageType.RECEIVE_FATAL_ERROR:
        await self._handle_fatal_error(data)
    elif known_type is FluxMessageType.TURN_INFO:
        await self._handle_turn_info(data)
|
|
446
|
+
|
|
447
|
+
async def _handle_connection_established(self):
    """Log that Flux accepted the connection.

    Fired when the WebSocket connection is established and the server
    is ready to receive audio for transcription.
    """
    logger.info("Connected to Flux - ready to stream audio")
|
|
455
|
+
|
|
456
|
+
async def _handle_fatal_error(self, data: Dict[str, Any]):
    """Raise on a fatal error message from Deepgram Flux.

    Fatal errors are unrecoverable for this connection; raising here
    lets the parent service's receive task handler terminate and report
    the failure.

    Args:
        data: The error payload; its "error" field is included in the
            raised message.

    Raises:
        Exception: Always, carrying the server-provided error detail.
    """
    error_msg = data.get("error", "Unknown error")
    deepgram_error = f"Fatal error: {error_msg}"
    logger.error(deepgram_error)
    # Error will be handled inside WebsocketService->_receive_task_handler
    raise Exception(deepgram_error)
|
|
474
|
+
|
|
475
|
+
async def _handle_turn_info(self, data: Dict[str, Any]):
    """Dispatch a TurnInfo message to its event-specific handler.

    TurnInfo messages carry turn-boundary and interim-result events;
    unknown event names are logged at debug level and ignored.

    Args:
        data: The TurnInfo payload containing event type, transcript,
            and extra metadata.
    """
    event = data.get("event")
    transcript = data.get("transcript", "")

    try:
        turn_event = FluxEventType(event)
    except ValueError:
        logger.debug(f"Unhandled TurnInfo event: {event}")
        return

    if turn_event is FluxEventType.START_OF_TURN:
        await self._handle_start_of_turn(transcript)
    elif turn_event is FluxEventType.TURN_RESUMED:
        await self._handle_turn_resumed(event)
    elif turn_event is FluxEventType.END_OF_TURN:
        await self._handle_end_of_turn(transcript, data)
    elif turn_event is FluxEventType.EAGER_END_OF_TURN:
        await self._handle_eager_end_of_turn(transcript, data)
    elif turn_event is FluxEventType.UPDATE:
        await self._handle_update(transcript)
|
|
505
|
+
|
|
506
|
+
async def _handle_start_of_turn(self, transcript: str):
|
|
507
|
+
"""Handle StartOfTurn events from Deepgram Flux.
|
|
508
|
+
|
|
509
|
+
StartOfTurn events are fired when Deepgram Flux detects the beginning
|
|
510
|
+
of a new speaking turn. This triggers bot interruption to stop any
|
|
511
|
+
ongoing speech synthesis and signals the start of user speech detection.
|
|
512
|
+
|
|
513
|
+
The service will:
|
|
514
|
+
- Send a BotInterruptionFrame upstream to stop bot speech
|
|
515
|
+
- Send a UserStartedSpeakingFrame downstream to notify other components
|
|
516
|
+
- Start metrics collection for measuring response times
|
|
517
|
+
|
|
518
|
+
Args:
|
|
519
|
+
transcript: maybe the first few words of the turn.
|
|
520
|
+
"""
|
|
521
|
+
logger.debug("User started speaking")
|
|
522
|
+
await self.push_interruption_task_frame_and_wait()
|
|
523
|
+
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
|
524
|
+
await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
|
|
525
|
+
await self.start_metrics()
|
|
526
|
+
if transcript:
|
|
527
|
+
logger.trace(f"Start of turn transcript: {transcript}")
|
|
528
|
+
|
|
529
|
+
async def _handle_turn_resumed(self, event: str):
|
|
530
|
+
"""Handle TurnResumed events from Deepgram Flux.
|
|
531
|
+
|
|
532
|
+
TurnResumed events indicate that speech has resumed after a brief pause
|
|
533
|
+
within the same turn. This is primarily used for logging and debugging
|
|
534
|
+
purposes and doesn't trigger any significant processing changes.
|
|
535
|
+
|
|
536
|
+
Args:
|
|
537
|
+
event: The event type string for logging purposes.
|
|
538
|
+
"""
|
|
539
|
+
logger.trace(f"Received event TurnResumed: {event}")
|
|
540
|
+
|
|
541
|
+
async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
|
542
|
+
"""Handle EndOfTurn events from Deepgram Flux.
|
|
543
|
+
|
|
544
|
+
EndOfTurn events are fired when Deepgram Flux determines that a speaking
|
|
545
|
+
turn has concluded, either due to sufficient silence or end-of-turn
|
|
546
|
+
confidence thresholds being met. This provides the final transcript
|
|
547
|
+
for the completed turn.
|
|
548
|
+
|
|
549
|
+
The service will:
|
|
550
|
+
- Create and send a final TranscriptionFrame with the complete transcript
|
|
551
|
+
- Trigger transcription handling with tracing for metrics
|
|
552
|
+
- Stop processing metrics collection
|
|
553
|
+
- Send a UserStoppedSpeakingFrame to signal turn completion
|
|
554
|
+
|
|
555
|
+
Args:
|
|
556
|
+
transcript: The final transcript text for the completed turn.
|
|
557
|
+
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
|
558
|
+
"""
|
|
559
|
+
logger.debug("User stopped speaking")
|
|
560
|
+
|
|
561
|
+
await self.push_frame(
|
|
562
|
+
TranscriptionFrame(
|
|
563
|
+
transcript,
|
|
564
|
+
self._user_id,
|
|
565
|
+
time_now_iso8601(),
|
|
566
|
+
self._language,
|
|
567
|
+
result=data,
|
|
568
|
+
)
|
|
569
|
+
)
|
|
570
|
+
await self._handle_transcription(transcript, True, self._language)
|
|
571
|
+
await self.stop_processing_metrics()
|
|
572
|
+
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
|
|
573
|
+
await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
|
|
574
|
+
|
|
575
|
+
async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
|
|
576
|
+
"""Handle EagerEndOfTurn events from Deepgram Flux.
|
|
577
|
+
|
|
578
|
+
EagerEndOfTurn events are fired when the end-of-turn confidence reaches the
|
|
579
|
+
EagerEndOfTurn threshold but hasn't yet reached the full end-of-turn threshold.
|
|
580
|
+
These provide interim transcripts that can be used for faster response
|
|
581
|
+
generation while still allowing the user to continue speaking.
|
|
582
|
+
|
|
583
|
+
EagerEndOfTurn events enable more responsive conversational AI by allowing
|
|
584
|
+
the LLM to start processing likely final transcripts before the turn
|
|
585
|
+
is definitively ended.
|
|
586
|
+
|
|
587
|
+
Args:
|
|
588
|
+
transcript: The interim transcript text that triggered the EagerEndOfTurn event.
|
|
589
|
+
data: The TurnInfo message data containing event type, transcript and some extra metadata.
|
|
590
|
+
"""
|
|
591
|
+
logger.trace(f"EagerEndOfTurn - {transcript}")
|
|
592
|
+
# Deepgram's EagerEndOfTurn feature enables lower-latency voice agents by sending
|
|
593
|
+
# medium-confidence transcripts before EndOfTurn certainty, allowing LLM processing to
|
|
594
|
+
# begin early.
|
|
595
|
+
#
|
|
596
|
+
# However, if speech resumes or the transcripts differ from the final EndOfTurn, the
|
|
597
|
+
# EagerEndOfTurn response should be cancelled to avoid incorrect or partial responses.
|
|
598
|
+
#
|
|
599
|
+
# Pipecat doesn't yet provide built-in Gate/control mechanisms to:
|
|
600
|
+
# 1. Start LLM/TTS processing early on EagerEndOfTurn events
|
|
601
|
+
# 2. Cancel in-flight processing when TurnResumed occurs
|
|
602
|
+
#
|
|
603
|
+
# By pushing EagerEndOfTurn transcripts as InterimTranscriptionFrame, we enable
|
|
604
|
+
# developers to implement custom EagerEndOfTurn handling in their applications while
|
|
605
|
+
# maintaining compatibility with existing interim transcription workflows.
|
|
606
|
+
#
|
|
607
|
+
# TODO: Implement proper EagerEndOfTurn support with cancellable processing pipeline
|
|
608
|
+
# that can start response generation on EagerEndOfTurn and cancel or confirm it.
|
|
609
|
+
await self.push_frame(
|
|
610
|
+
InterimTranscriptionFrame(
|
|
611
|
+
transcript,
|
|
612
|
+
self._user_id,
|
|
613
|
+
time_now_iso8601(),
|
|
614
|
+
self._language,
|
|
615
|
+
result=data,
|
|
616
|
+
)
|
|
617
|
+
)
|
|
618
|
+
|
|
619
|
+
async def _handle_update(self, transcript: str):
|
|
620
|
+
"""Handle Update events from Deepgram Flux.
|
|
621
|
+
|
|
622
|
+
Update events provide incremental transcript updates during an ongoing
|
|
623
|
+
turn. These events allow for real-time display of transcription progress
|
|
624
|
+
and can be used to provide visual feedback to users about what's being
|
|
625
|
+
recognized.
|
|
626
|
+
|
|
627
|
+
The service stops TTFB (Time To First Byte) metrics when the first
|
|
628
|
+
substantial update is received, indicating successful processing start.
|
|
629
|
+
|
|
630
|
+
Args:
|
|
631
|
+
transcript: The current partial transcript text for the ongoing turn.
|
|
632
|
+
"""
|
|
633
|
+
if transcript:
|
|
634
|
+
logger.trace(f"Update event: {transcript}")
|
|
635
|
+
# TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
|
|
636
|
+
# Ideally, TTFB should measure the time from when a user starts speaking
|
|
637
|
+
# until we receive the first transcript. However, Deepgram Flux delivers
|
|
638
|
+
# both the "user started speaking" event and the first transcript simultaneously,
|
|
639
|
+
# making this timing measurement meaningless in this context.
|
|
640
|
+
# await self.stop_ttfb_metrics()
|
|
@@ -8,6 +8,9 @@ import sys
|
|
|
8
8
|
|
|
9
9
|
from pipecat.services import DeprecatedModuleProxy
|
|
10
10
|
|
|
11
|
+
from .stt import *
|
|
11
12
|
from .tts import *
|
|
13
|
+
from .stt import *
|
|
14
|
+
# Old
|
|
12
15
|
|
|
13
|
-
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.tts")
|
|
16
|
+
sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.[stt,tts]")
|