dv-pipecat-ai 0.0.82.dev857__py3-none-any.whl → 0.0.85.dev837__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dv-pipecat-ai has been flagged as potentially problematic.

Files changed (195)
  1. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/METADATA +98 -130
  2. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/RECORD +192 -140
  3. pipecat/adapters/base_llm_adapter.py +38 -1
  4. pipecat/adapters/services/anthropic_adapter.py +9 -14
  5. pipecat/adapters/services/aws_nova_sonic_adapter.py +120 -5
  6. pipecat/adapters/services/bedrock_adapter.py +236 -13
  7. pipecat/adapters/services/gemini_adapter.py +12 -8
  8. pipecat/adapters/services/open_ai_adapter.py +19 -7
  9. pipecat/adapters/services/open_ai_realtime_adapter.py +5 -0
  10. pipecat/audio/dtmf/dtmf-0.wav +0 -0
  11. pipecat/audio/dtmf/dtmf-1.wav +0 -0
  12. pipecat/audio/dtmf/dtmf-2.wav +0 -0
  13. pipecat/audio/dtmf/dtmf-3.wav +0 -0
  14. pipecat/audio/dtmf/dtmf-4.wav +0 -0
  15. pipecat/audio/dtmf/dtmf-5.wav +0 -0
  16. pipecat/audio/dtmf/dtmf-6.wav +0 -0
  17. pipecat/audio/dtmf/dtmf-7.wav +0 -0
  18. pipecat/audio/dtmf/dtmf-8.wav +0 -0
  19. pipecat/audio/dtmf/dtmf-9.wav +0 -0
  20. pipecat/audio/dtmf/dtmf-pound.wav +0 -0
  21. pipecat/audio/dtmf/dtmf-star.wav +0 -0
  22. pipecat/audio/filters/krisp_viva_filter.py +193 -0
  23. pipecat/audio/filters/noisereduce_filter.py +15 -0
  24. pipecat/audio/turn/base_turn_analyzer.py +9 -1
  25. pipecat/audio/turn/smart_turn/base_smart_turn.py +14 -8
  26. pipecat/audio/turn/smart_turn/data/__init__.py +0 -0
  27. pipecat/audio/turn/smart_turn/data/smart-turn-v3.0.onnx +0 -0
  28. pipecat/audio/turn/smart_turn/http_smart_turn.py +6 -2
  29. pipecat/audio/turn/smart_turn/local_smart_turn.py +1 -1
  30. pipecat/audio/turn/smart_turn/local_smart_turn_v2.py +1 -1
  31. pipecat/audio/turn/smart_turn/local_smart_turn_v3.py +124 -0
  32. pipecat/audio/vad/data/README.md +10 -0
  33. pipecat/audio/vad/data/silero_vad_v2.onnx +0 -0
  34. pipecat/audio/vad/silero.py +9 -3
  35. pipecat/audio/vad/vad_analyzer.py +13 -1
  36. pipecat/extensions/voicemail/voicemail_detector.py +5 -5
  37. pipecat/frames/frames.py +277 -86
  38. pipecat/observers/loggers/debug_log_observer.py +3 -3
  39. pipecat/observers/loggers/llm_log_observer.py +7 -3
  40. pipecat/observers/loggers/user_bot_latency_log_observer.py +22 -10
  41. pipecat/pipeline/runner.py +18 -6
  42. pipecat/pipeline/service_switcher.py +64 -36
  43. pipecat/pipeline/task.py +125 -79
  44. pipecat/pipeline/tts_switcher.py +30 -0
  45. pipecat/processors/aggregators/dtmf_aggregator.py +2 -3
  46. pipecat/processors/aggregators/{gated_openai_llm_context.py → gated_llm_context.py} +9 -9
  47. pipecat/processors/aggregators/gated_open_ai_llm_context.py +12 -0
  48. pipecat/processors/aggregators/llm_context.py +40 -2
  49. pipecat/processors/aggregators/llm_response.py +32 -15
  50. pipecat/processors/aggregators/llm_response_universal.py +19 -15
  51. pipecat/processors/aggregators/user_response.py +6 -6
  52. pipecat/processors/aggregators/vision_image_frame.py +24 -2
  53. pipecat/processors/audio/audio_buffer_processor.py +43 -8
  54. pipecat/processors/dtmf_aggregator.py +174 -77
  55. pipecat/processors/filters/stt_mute_filter.py +17 -0
  56. pipecat/processors/frame_processor.py +110 -24
  57. pipecat/processors/frameworks/langchain.py +8 -2
  58. pipecat/processors/frameworks/rtvi.py +210 -68
  59. pipecat/processors/frameworks/strands_agents.py +170 -0
  60. pipecat/processors/logger.py +2 -2
  61. pipecat/processors/transcript_processor.py +26 -5
  62. pipecat/processors/user_idle_processor.py +35 -11
  63. pipecat/runner/daily.py +59 -20
  64. pipecat/runner/run.py +395 -93
  65. pipecat/runner/types.py +6 -4
  66. pipecat/runner/utils.py +51 -10
  67. pipecat/serializers/__init__.py +5 -1
  68. pipecat/serializers/asterisk.py +16 -2
  69. pipecat/serializers/convox.py +41 -4
  70. pipecat/serializers/custom.py +257 -0
  71. pipecat/serializers/exotel.py +5 -5
  72. pipecat/serializers/livekit.py +20 -0
  73. pipecat/serializers/plivo.py +5 -5
  74. pipecat/serializers/protobuf.py +6 -5
  75. pipecat/serializers/telnyx.py +2 -2
  76. pipecat/serializers/twilio.py +43 -23
  77. pipecat/serializers/vi.py +324 -0
  78. pipecat/services/ai_service.py +2 -6
  79. pipecat/services/anthropic/llm.py +2 -25
  80. pipecat/services/assemblyai/models.py +6 -0
  81. pipecat/services/assemblyai/stt.py +13 -5
  82. pipecat/services/asyncai/tts.py +5 -3
  83. pipecat/services/aws/__init__.py +1 -0
  84. pipecat/services/aws/llm.py +147 -105
  85. pipecat/services/aws/nova_sonic/__init__.py +0 -0
  86. pipecat/services/aws/nova_sonic/context.py +436 -0
  87. pipecat/services/aws/nova_sonic/frames.py +25 -0
  88. pipecat/services/aws/nova_sonic/llm.py +1265 -0
  89. pipecat/services/aws/stt.py +3 -3
  90. pipecat/services/aws_nova_sonic/__init__.py +19 -1
  91. pipecat/services/aws_nova_sonic/aws.py +11 -1151
  92. pipecat/services/aws_nova_sonic/context.py +8 -354
  93. pipecat/services/aws_nova_sonic/frames.py +13 -17
  94. pipecat/services/azure/llm.py +51 -1
  95. pipecat/services/azure/realtime/__init__.py +0 -0
  96. pipecat/services/azure/realtime/llm.py +65 -0
  97. pipecat/services/azure/stt.py +15 -0
  98. pipecat/services/cartesia/stt.py +77 -70
  99. pipecat/services/cartesia/tts.py +80 -13
  100. pipecat/services/deepgram/__init__.py +1 -0
  101. pipecat/services/deepgram/flux/__init__.py +0 -0
  102. pipecat/services/deepgram/flux/stt.py +640 -0
  103. pipecat/services/elevenlabs/__init__.py +4 -1
  104. pipecat/services/elevenlabs/stt.py +339 -0
  105. pipecat/services/elevenlabs/tts.py +87 -46
  106. pipecat/services/fish/tts.py +5 -2
  107. pipecat/services/gemini_multimodal_live/events.py +38 -524
  108. pipecat/services/gemini_multimodal_live/file_api.py +23 -173
  109. pipecat/services/gemini_multimodal_live/gemini.py +41 -1403
  110. pipecat/services/gladia/stt.py +56 -72
  111. pipecat/services/google/__init__.py +1 -0
  112. pipecat/services/google/gemini_live/__init__.py +3 -0
  113. pipecat/services/google/gemini_live/file_api.py +189 -0
  114. pipecat/services/google/gemini_live/llm.py +1582 -0
  115. pipecat/services/google/gemini_live/llm_vertex.py +184 -0
  116. pipecat/services/google/llm.py +15 -11
  117. pipecat/services/google/llm_openai.py +3 -3
  118. pipecat/services/google/llm_vertex.py +86 -16
  119. pipecat/services/google/stt.py +4 -0
  120. pipecat/services/google/tts.py +7 -3
  121. pipecat/services/heygen/api.py +2 -0
  122. pipecat/services/heygen/client.py +8 -4
  123. pipecat/services/heygen/video.py +2 -0
  124. pipecat/services/hume/__init__.py +5 -0
  125. pipecat/services/hume/tts.py +220 -0
  126. pipecat/services/inworld/tts.py +6 -6
  127. pipecat/services/llm_service.py +15 -5
  128. pipecat/services/lmnt/tts.py +4 -2
  129. pipecat/services/mcp_service.py +4 -2
  130. pipecat/services/mem0/memory.py +6 -5
  131. pipecat/services/mistral/llm.py +29 -8
  132. pipecat/services/moondream/vision.py +42 -16
  133. pipecat/services/neuphonic/tts.py +5 -2
  134. pipecat/services/openai/__init__.py +1 -0
  135. pipecat/services/openai/base_llm.py +27 -20
  136. pipecat/services/openai/realtime/__init__.py +0 -0
  137. pipecat/services/openai/realtime/context.py +272 -0
  138. pipecat/services/openai/realtime/events.py +1106 -0
  139. pipecat/services/openai/realtime/frames.py +37 -0
  140. pipecat/services/openai/realtime/llm.py +829 -0
  141. pipecat/services/openai/tts.py +49 -10
  142. pipecat/services/openai_realtime/__init__.py +27 -0
  143. pipecat/services/openai_realtime/azure.py +21 -0
  144. pipecat/services/openai_realtime/context.py +21 -0
  145. pipecat/services/openai_realtime/events.py +21 -0
  146. pipecat/services/openai_realtime/frames.py +21 -0
  147. pipecat/services/openai_realtime_beta/azure.py +16 -0
  148. pipecat/services/openai_realtime_beta/openai.py +17 -5
  149. pipecat/services/piper/tts.py +7 -9
  150. pipecat/services/playht/tts.py +34 -4
  151. pipecat/services/rime/tts.py +12 -12
  152. pipecat/services/riva/stt.py +3 -1
  153. pipecat/services/salesforce/__init__.py +9 -0
  154. pipecat/services/salesforce/llm.py +700 -0
  155. pipecat/services/sarvam/__init__.py +7 -0
  156. pipecat/services/sarvam/stt.py +540 -0
  157. pipecat/services/sarvam/tts.py +97 -13
  158. pipecat/services/simli/video.py +2 -2
  159. pipecat/services/speechmatics/stt.py +22 -10
  160. pipecat/services/stt_service.py +47 -0
  161. pipecat/services/tavus/video.py +2 -2
  162. pipecat/services/tts_service.py +75 -22
  163. pipecat/services/vision_service.py +7 -6
  164. pipecat/services/vistaar/llm.py +51 -9
  165. pipecat/tests/utils.py +4 -4
  166. pipecat/transcriptions/language.py +41 -1
  167. pipecat/transports/base_input.py +13 -34
  168. pipecat/transports/base_output.py +140 -104
  169. pipecat/transports/daily/transport.py +199 -26
  170. pipecat/transports/heygen/__init__.py +0 -0
  171. pipecat/transports/heygen/transport.py +381 -0
  172. pipecat/transports/livekit/transport.py +228 -63
  173. pipecat/transports/local/audio.py +6 -1
  174. pipecat/transports/local/tk.py +11 -2
  175. pipecat/transports/network/fastapi_websocket.py +1 -1
  176. pipecat/transports/smallwebrtc/connection.py +103 -19
  177. pipecat/transports/smallwebrtc/request_handler.py +246 -0
  178. pipecat/transports/smallwebrtc/transport.py +65 -23
  179. pipecat/transports/tavus/transport.py +23 -12
  180. pipecat/transports/websocket/client.py +41 -5
  181. pipecat/transports/websocket/fastapi.py +21 -11
  182. pipecat/transports/websocket/server.py +14 -7
  183. pipecat/transports/whatsapp/api.py +8 -0
  184. pipecat/transports/whatsapp/client.py +47 -0
  185. pipecat/utils/base_object.py +54 -22
  186. pipecat/utils/redis.py +58 -0
  187. pipecat/utils/string.py +13 -1
  188. pipecat/utils/tracing/service_decorators.py +21 -21
  189. pipecat/serializers/genesys.py +0 -95
  190. pipecat/services/google/test-google-chirp.py +0 -45
  191. pipecat/services/openai.py +0 -698
  192. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/WHEEL +0 -0
  193. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/licenses/LICENSE +0 -0
  194. {dv_pipecat_ai-0.0.82.dev857.dist-info → dv_pipecat_ai-0.0.85.dev837.dist-info}/top_level.txt +0 -0
  195. /pipecat/services/{aws_nova_sonic → aws/nova_sonic}/ready.wav +0 -0
pipecat/services/deepgram/flux/stt.py (new file)
@@ -0,0 +1,640 @@
+ #
+ # Copyright (c) 2024–2025, Daily
+ #
+ # SPDX-License-Identifier: BSD 2-Clause License
+ #
+
+ """Deepgram Flux speech-to-text service implementation."""
+
+ import json
+ from enum import Enum
+ from typing import Any, AsyncGenerator, Dict, Optional
+
+ from loguru import logger
+ from pydantic import BaseModel
+
+ from pipecat.frames.frames import (
+     CancelFrame,
+     EndFrame,
+     ErrorFrame,
+     Frame,
+     InterimTranscriptionFrame,
+     StartFrame,
+     TranscriptionFrame,
+     UserStartedSpeakingFrame,
+     UserStoppedSpeakingFrame,
+ )
+ from pipecat.processors.frame_processor import FrameDirection
+ from pipecat.services.stt_service import WebsocketSTTService
+ from pipecat.transcriptions.language import Language
+ from pipecat.utils.time import time_now_iso8601
+ from pipecat.utils.tracing.service_decorators import traced_stt
+
+ try:
+     from websockets.asyncio.client import connect as websocket_connect
+     from websockets.protocol import State
+ except ModuleNotFoundError as e:
+     logger.error(f"Exception: {e}")
+     logger.error("In order to use Deepgram Flux, you need to `pip install pipecat-ai[deepgram]`.")
+     raise Exception(f"Missing module: {e}")
+
+
+ class FluxMessageType(str, Enum):
+     """Deepgram Flux WebSocket message types.
+
+     These are the top-level message types that can be received from the
+     Deepgram Flux WebSocket connection.
+     """
+
+     RECEIVE_CONNECTED = "Connected"
+     RECEIVE_FATAL_ERROR = "Error"
+     TURN_INFO = "TurnInfo"
+
+
+ class FluxEventType(str, Enum):
+     """Deepgram Flux TurnInfo event types.
+
+     These events are contained within TurnInfo messages and indicate
+     different stages of speech processing and turn detection.
+     """
+
+     START_OF_TURN = "StartOfTurn"
+     TURN_RESUMED = "TurnResumed"
+     END_OF_TURN = "EndOfTurn"
+     EAGER_END_OF_TURN = "EagerEndOfTurn"
+     UPDATE = "Update"
+
+
+ class DeepgramFluxSTTService(WebsocketSTTService):
+     """Deepgram Flux speech-to-text service.
+
+     Provides real-time speech recognition using Deepgram's WebSocket API with Flux capabilities.
+     Supports configurable models, VAD events, and various audio processing options,
+     including advanced turn detection and EagerEndOfTurn events for improved conversational AI performance.
+     """
+
+     class InputParams(BaseModel):
+         """Configuration parameters for the Deepgram Flux API.
+
+         This class defines all available connection parameters for the Deepgram Flux API
+         based on the official documentation.
+
+         Parameters:
+             eager_eot_threshold: Optional. EagerEndOfTurn/TurnResumed events are off by default.
+                 You can turn them on by setting eager_eot_threshold to a valid value.
+                 Lower values = more aggressive eager end-of-turn detection (faster responses, more LLM calls).
+                 Higher values = more conservative eager end-of-turn detection (slower responses, fewer LLM calls).
+             eot_threshold: Optional. End-of-turn confidence required to finish a turn (default 0.7).
+                 Lower values = turns end sooner (more interruptions, faster responses).
+                 Higher values = turns end later (fewer interruptions, more complete utterances).
+             eot_timeout_ms: Optional. Time in milliseconds after speech to finish a turn
+                 regardless of EOT confidence (default 5000).
+             keyterm: List of keyterms to boost recognition accuracy for specialized terminology.
+             mip_opt_out: Optional. Opts requests out of the Deepgram Model Improvement Program
+                 (default False).
+             tag: List of tags to label requests for identification during usage reporting.
+         """
+
+         eager_eot_threshold: Optional[float] = None
+         eot_threshold: Optional[float] = None
+         eot_timeout_ms: Optional[int] = None
+         keyterm: list = []
+         mip_opt_out: Optional[bool] = None
+         tag: list = []
+
+     def __init__(
+         self,
+         *,
+         api_key: str,
+         url: str = "wss://api.deepgram.com/v2/listen",
+         sample_rate: Optional[int] = None,
+         model: str = "flux-general-en",
+         flux_encoding: str = "linear16",
+         params: Optional[InputParams] = None,
+         **kwargs,
+     ):
+         """Initialize the Deepgram Flux STT service.
+
+         Args:
+             api_key: Deepgram API key for authentication. Required for API access.
+             url: WebSocket URL for the Deepgram Flux API. Defaults to the v2 listen endpoint.
+             sample_rate: Audio sample rate in Hz. If None, uses the rate from params or 16000.
+             model: Deepgram Flux model to use for transcription. Currently only
+                 "flux-general-en" is supported.
+             flux_encoding: Audio encoding format required by the Flux API. Must be
+                 "linear16" (raw signed little-endian 16-bit PCM).
+             params: InputParams instance containing detailed API configuration options.
+                 If None, default parameters will be used.
+             **kwargs: Additional arguments passed to the parent WebsocketSTTService class.
+
+         Examples:
+             Basic usage with default parameters::
+
+                 stt = DeepgramFluxSTTService(api_key="your-api-key")
+
+             Advanced usage with custom parameters::
+
+                 params = DeepgramFluxSTTService.InputParams(
+                     eager_eot_threshold=0.5,
+                     eot_threshold=0.8,
+                     keyterm=["AI", "machine learning", "neural network"],
+                     tag=["production", "voice-agent"]
+                 )
+                 stt = DeepgramFluxSTTService(
+                     api_key="your-api-key",
+                     model="flux-general-en",
+                     params=params
+                 )
+         """
+         super().__init__(sample_rate=sample_rate, **kwargs)
+
+         self._api_key = api_key
+         self._url = url
+         self._model = model
+         self._params = params or DeepgramFluxSTTService.InputParams()
+         self._flux_encoding = flux_encoding
+         # Currently the only supported language
+         self._language = Language.EN
+         self._websocket_url = None
+         self._receive_task = None
+
+     async def _connect(self):
+         """Connect to the WebSocket and start background tasks.
+
+         Establishes the WebSocket connection to the Deepgram Flux API and starts
+         the background task for receiving transcription results.
+         """
+         await self._connect_websocket()
+
+         if self._websocket and not self._receive_task:
+             self._receive_task = self.create_task(self._receive_task_handler(self._report_error))
+
+     async def _disconnect(self):
+         """Disconnect from the WebSocket and clean up tasks.
+
+         Gracefully disconnects from the Deepgram Flux API, cancels background tasks,
+         and cleans up resources to prevent memory leaks.
+         """
+         try:
+             # Cancel background tasks BEFORE closing the websocket
+             if self._receive_task:
+                 await self.cancel_task(self._receive_task, timeout=2.0)
+                 self._receive_task = None
+
+             # Now close the websocket
+             await self._disconnect_websocket()
+
+         except Exception as e:
+             logger.error(f"Error during disconnect: {e}")
+         finally:
+             # Reset state only after everything is cleaned up
+             self._websocket = None
+
+     async def _connect_websocket(self):
+         """Establish the WebSocket connection to the API.
+
+         Creates a WebSocket connection to the Deepgram Flux API using the configured
+         URL and authentication headers. Handles connection errors and reports them
+         through the event handler system.
+         """
+         try:
+             if self._websocket and self._websocket.state is State.OPEN:
+                 return
+
+             self._websocket = await websocket_connect(
+                 self._websocket_url,
+                 additional_headers={"Authorization": f"Token {self._api_key}"},
+             )
+             logger.debug("Connected to Deepgram Flux Websocket")
+             await self._call_event_handler("on_connected")
+         except Exception as e:
+             logger.error(f"{self} initialization error: {e}")
+             self._websocket = None
+             await self._call_event_handler("on_connection_error", f"{e}")
+
+     async def _disconnect_websocket(self):
+         """Close the WebSocket connection and clean up state.
+
+         Closes the WebSocket connection to the Deepgram Flux API and stops all
+         metrics collection. Handles disconnection errors gracefully.
+         """
+         try:
+             await self.stop_all_metrics()
+
+             if self._websocket:
+                 await self._send_close_stream()
+                 logger.debug("Disconnecting from Deepgram Flux Websocket")
+                 await self._websocket.close()
+         except Exception as e:
+             logger.error(f"{self} error closing websocket: {e}")
+         finally:
+             self._websocket = None
+             await self._call_event_handler("on_disconnected")
+
+     async def _send_close_stream(self) -> None:
+         """Send a CloseStream control message to the Deepgram Flux WebSocket API.
+
+         This signals to the server that no more audio data will be sent.
+         """
+         if self._websocket:
+             logger.debug("Sending CloseStream message to Deepgram Flux")
+             message = {"type": "CloseStream"}
+             await self._websocket.send(json.dumps(message))
+
+     def can_generate_metrics(self) -> bool:
+         """Check if this service can generate processing metrics.
+
+         Returns:
+             True, as the Deepgram service supports metrics generation.
+         """
+         return True
+
+     async def start(self, frame: StartFrame):
+         """Start the Deepgram Flux STT service.
+
+         Initializes the service by constructing the WebSocket URL with all configured
+         parameters and establishing the connection to begin transcription processing.
+
+         Args:
+             frame: The start frame containing initialization parameters and metadata.
+         """
+         await super().start(frame)
+
+         url_params = [
+             f"model={self._model}",
+             f"sample_rate={self.sample_rate}",
+             f"encoding={self._flux_encoding}",
+         ]
+
+         if self._params.eager_eot_threshold is not None:
+             url_params.append(f"eager_eot_threshold={self._params.eager_eot_threshold}")
+
+         if self._params.eot_threshold is not None:
+             url_params.append(f"eot_threshold={self._params.eot_threshold}")
+
+         if self._params.eot_timeout_ms is not None:
+             url_params.append(f"eot_timeout_ms={self._params.eot_timeout_ms}")
+
+         if self._params.mip_opt_out is not None:
+             url_params.append(f"mip_opt_out={str(self._params.mip_opt_out).lower()}")
+
+         # Add keyterm parameters (can have multiple)
+         for keyterm in self._params.keyterm:
+             url_params.append(f"keyterm={keyterm}")
+
+         # Add tag parameters (can have multiple)
+         for tag_value in self._params.tag:
+             url_params.append(f"tag={tag_value}")
+
+         self._websocket_url = f"{self._url}?{'&'.join(url_params)}"
+         await self._connect()
+
+     async def stop(self, frame: EndFrame):
+         """Stop the Deepgram Flux STT service.
+
+         Args:
+             frame: The end frame.
+         """
+         await super().stop(frame)
+         await self._disconnect()
+
+     async def cancel(self, frame: CancelFrame):
+         """Cancel the Deepgram Flux STT service.
+
+         Args:
+             frame: The cancel frame.
+         """
+         await super().cancel(frame)
+         await self._disconnect()
+
+     async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
+         """Send audio data to Deepgram Flux for transcription.
+
+         Transmits raw audio bytes to the Deepgram Flux API for real-time speech
+         recognition. Transcription results are received asynchronously through
+         WebSocket callbacks and processed in the background.
+
+         Args:
+             audio: Raw audio bytes in linear16 format (signed little-endian 16-bit PCM).
+
+         Yields:
+             Frame: None (transcription results are delivered via WebSocket callbacks
+             rather than as return values from this method).
+
+         Raises:
+             Exception: If the WebSocket connection is not established or if there
+                 are issues sending the audio data.
+         """
+         if not self._websocket:
+             logger.error("Not connected to Deepgram Flux.")
+             yield ErrorFrame("Not connected to Deepgram Flux.", fatal=True)
+             return
+
+         try:
+             await self._websocket.send(audio)
+         except Exception as e:
+             logger.error(f"Failed to send audio to Flux: {e}")
+             yield ErrorFrame(f"Failed to send audio to Flux: {e}")
+             return
+
+         yield None
+
+     async def start_metrics(self):
+         """Start TTFB and processing metrics collection."""
+         # TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
+         # Ideally, TTFB should measure the time from when a user starts speaking
+         # until we receive the first transcript. However, Deepgram Flux delivers
+         # both the "user started speaking" event and the first transcript simultaneously,
+         # making this timing measurement meaningless in this context.
+         # await self.start_ttfb_metrics()
+         await self.start_processing_metrics()
+
+     @traced_stt
+     async def _handle_transcription(
+         self, transcript: str, is_final: bool, language: Optional[Language] = None
+     ):
+         """Handle a transcription result with tracing."""
+         pass
+
+     def _get_websocket(self):
+         """Get the current WebSocket connection.
+
+         Returns the active WebSocket connection instance, raising an exception
+         if no connection is currently established.
+
+         Returns:
+             The active WebSocket connection instance.
+
+         Raises:
+             Exception: If no WebSocket connection is currently active.
+         """
+         if self._websocket:
+             return self._websocket
+         raise Exception("Websocket not connected")
+
+     def _validate_message(self, data: Dict[str, Any]) -> bool:
+         """Validate basic message structure from Deepgram Flux.
+
+         Ensures the received message has the expected structure before processing.
+
+         Args:
+             data: The parsed JSON message data to validate.
+
+         Returns:
+             True if the message structure is valid, False otherwise.
+         """
+         if not isinstance(data, dict):
+             logger.warning("Message is not a dictionary")
+             return False
+
+         if "type" not in data:
+             logger.warning("Message missing 'type' field")
+             return False
+
+         return True
+
+     async def _receive_messages(self):
+         """Receive and process messages from the WebSocket.
+
+         Continuously receives messages from the Deepgram Flux WebSocket connection
+         and processes various message types, including connection status, transcription
+         results, turn information, and error conditions. Handles different event types
+         such as StartOfTurn, EndOfTurn, EagerEndOfTurn, and Update events.
+         """
+         async for message in self._get_websocket():
+             if isinstance(message, str):
+                 try:
+                     data = json.loads(message)
+                     await self._handle_message(data)
+                 except json.JSONDecodeError as e:
+                     logger.error(f"Failed to decode JSON message: {e}")
+                     # Skip malformed messages
+                     continue
+                 except Exception as e:
+                     logger.error(f"Error processing message: {e}")
+                     # Error will be handled inside WebsocketService->_receive_task_handler
+                     raise
+             else:
+                 logger.warning(f"Received non-string message: {type(message)}")
+
+     async def _handle_message(self, data: Dict[str, Any]):
+         """Handle a parsed WebSocket message from Deepgram Flux.
+
+         Routes messages to appropriate handlers based on their type. Validates
+         message structure before processing.
+
+         Args:
+             data: The parsed JSON message data from the WebSocket.
+         """
+         if not self._validate_message(data):
+             return
+
+         message_type = data.get("type")
+
+         try:
+             flux_message_type = FluxMessageType(message_type)
+         except ValueError:
+             logger.debug(f"Unhandled message type: {message_type or 'unknown'}")
+             return
+
+         match flux_message_type:
+             case FluxMessageType.RECEIVE_CONNECTED:
+                 await self._handle_connection_established()
+             case FluxMessageType.RECEIVE_FATAL_ERROR:
+                 await self._handle_fatal_error(data)
+             case FluxMessageType.TURN_INFO:
+                 await self._handle_turn_info(data)
+
+     async def _handle_connection_established(self):
+         """Handle successful connection establishment to Deepgram Flux.
+
+         This event is fired when the WebSocket connection to Deepgram Flux
+         is successfully established and ready to receive audio data for
+         transcription processing.
+         """
+         logger.info("Connected to Flux - ready to stream audio")
+
+     async def _handle_fatal_error(self, data: Dict[str, Any]):
+         """Handle fatal error messages from Deepgram Flux.
+
+         Fatal errors indicate unrecoverable issues with the connection or
+         configuration that require intervention. These errors will cause
+         the connection to be terminated.
+
+         Args:
+             data: The error message data containing error details.
+
+         Raises:
+             Exception: Always raises to trigger error handling in the parent service.
+         """
+         error_msg = data.get("error", "Unknown error")
+         deepgram_error = f"Fatal error: {error_msg}"
+         logger.error(deepgram_error)
+         # Error will be handled inside WebsocketService->_receive_task_handler
+         raise Exception(deepgram_error)
+
+     async def _handle_turn_info(self, data: Dict[str, Any]):
+         """Handle TurnInfo events from Deepgram Flux.
+
+         TurnInfo messages contain various turn-based events that indicate
+         the state of speech processing, including turn boundaries, interim
+         results, and turn finalization events.
+
+         Args:
+             data: The TurnInfo message data containing the event type, transcript, and extra metadata.
+         """
+         event = data.get("event")
+         transcript = data.get("transcript", "")
+
+         try:
+             flux_event_type = FluxEventType(event)
+         except ValueError:
+             logger.debug(f"Unhandled TurnInfo event: {event}")
+             return
+
+         match flux_event_type:
+             case FluxEventType.START_OF_TURN:
+                 await self._handle_start_of_turn(transcript)
+             case FluxEventType.TURN_RESUMED:
+                 await self._handle_turn_resumed(event)
+             case FluxEventType.END_OF_TURN:
+                 await self._handle_end_of_turn(transcript, data)
+             case FluxEventType.EAGER_END_OF_TURN:
+                 await self._handle_eager_end_of_turn(transcript, data)
+             case FluxEventType.UPDATE:
+                 await self._handle_update(transcript)
+
+     async def _handle_start_of_turn(self, transcript: str):
+         """Handle StartOfTurn events from Deepgram Flux.
+
+         StartOfTurn events are fired when Deepgram Flux detects the beginning
+         of a new speaking turn. This triggers bot interruption to stop any
+         ongoing speech synthesis and signals the start of user speech detection.
+
+         The service will:
+         - Send a BotInterruptionFrame upstream to stop bot speech
+         - Send a UserStartedSpeakingFrame downstream to notify other components
+         - Start metrics collection for measuring response times
+
+         Args:
+             transcript: The first few words of the turn, if any.
+         """
+         logger.debug("User started speaking")
+         await self.push_interruption_task_frame_and_wait()
+         await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.DOWNSTREAM)
+         await self.push_frame(UserStartedSpeakingFrame(), FrameDirection.UPSTREAM)
+         await self.start_metrics()
+         if transcript:
+             logger.trace(f"Start of turn transcript: {transcript}")
+
+     async def _handle_turn_resumed(self, event: str):
+         """Handle TurnResumed events from Deepgram Flux.
+
+         TurnResumed events indicate that speech has resumed after a brief pause
+         within the same turn. This is primarily used for logging and debugging
+         purposes and doesn't trigger any significant processing changes.
+
+         Args:
+             event: The event type string for logging purposes.
+         """
+         logger.trace(f"Received event TurnResumed: {event}")
+
+     async def _handle_end_of_turn(self, transcript: str, data: Dict[str, Any]):
+         """Handle EndOfTurn events from Deepgram Flux.
+
+         EndOfTurn events are fired when Deepgram Flux determines that a speaking
+         turn has concluded, either due to sufficient silence or end-of-turn
+         confidence thresholds being met. This provides the final transcript
+         for the completed turn.
+
+         The service will:
+         - Create and send a final TranscriptionFrame with the complete transcript
+         - Trigger transcription handling with tracing for metrics
+         - Stop processing metrics collection
+         - Send a UserStoppedSpeakingFrame to signal turn completion
+
+         Args:
+             transcript: The final transcript text for the completed turn.
+             data: The TurnInfo message data containing the event type, transcript, and extra metadata.
+         """
+         logger.debug("User stopped speaking")
+
+         await self.push_frame(
+             TranscriptionFrame(
+                 transcript,
+                 self._user_id,
+                 time_now_iso8601(),
+                 self._language,
+                 result=data,
+             )
+         )
+         await self._handle_transcription(transcript, True, self._language)
+         await self.stop_processing_metrics()
+         await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.DOWNSTREAM)
+         await self.push_frame(UserStoppedSpeakingFrame(), FrameDirection.UPSTREAM)
+
+     async def _handle_eager_end_of_turn(self, transcript: str, data: Dict[str, Any]):
+         """Handle EagerEndOfTurn events from Deepgram Flux.
+
+         EagerEndOfTurn events are fired when the end-of-turn confidence reaches the
+         EagerEndOfTurn threshold but hasn't yet reached the full end-of-turn threshold.
+         These provide interim transcripts that can be used for faster response
+         generation while still allowing the user to continue speaking.
+
+         EagerEndOfTurn events enable more responsive conversational AI by allowing
+         the LLM to start processing likely final transcripts before the turn
+         is definitively ended.
+
+         Args:
+             transcript: The interim transcript text that triggered the EagerEndOfTurn event.
+             data: The TurnInfo message data containing the event type, transcript, and extra metadata.
+         """
+         logger.trace(f"EagerEndOfTurn - {transcript}")
+         # Deepgram's EagerEndOfTurn feature enables lower-latency voice agents by sending
+         # medium-confidence transcripts before EndOfTurn certainty, allowing LLM processing to
+         # begin early.
+         #
+         # However, if speech resumes or the transcripts differ from the final EndOfTurn, the
+         # EagerEndOfTurn response should be cancelled to avoid incorrect or partial responses.
+         #
+         # Pipecat doesn't yet provide built-in gate/control mechanisms to:
+         # 1. Start LLM/TTS processing early on EagerEndOfTurn events
+         # 2. Cancel in-flight processing when TurnResumed occurs
+         #
+         # By pushing EagerEndOfTurn transcripts as InterimTranscriptionFrame, we enable
+         # developers to implement custom EagerEndOfTurn handling in their applications while
+         # maintaining compatibility with existing interim transcription workflows.
+         #
+         # TODO: Implement proper EagerEndOfTurn support with a cancellable processing pipeline
+         # that can start response generation on EagerEndOfTurn and cancel or confirm it.
+         await self.push_frame(
+             InterimTranscriptionFrame(
+                 transcript,
+                 self._user_id,
+                 time_now_iso8601(),
+                 self._language,
+                 result=data,
+             )
+         )
+
+     async def _handle_update(self, transcript: str):
+         """Handle Update events from Deepgram Flux.
+
+         Update events provide incremental transcript updates during an ongoing
+         turn. These events allow for real-time display of transcription progress
+         and can be used to provide visual feedback to users about what's being
+         recognized.
+
+         The service would stop TTFB (Time To First Byte) metrics when the first
+         substantial update is received, but this is currently disabled (see the
+         inline comment below).
+
+         Args:
+             transcript: The current partial transcript text for the ongoing turn.
+         """
+         if transcript:
+             logger.trace(f"Update event: {transcript}")
+             # TTFB (Time To First Byte) metrics are currently disabled for Deepgram Flux.
+             # Ideally, TTFB should measure the time from when a user starts speaking
+             # until we receive the first transcript. However, Deepgram Flux delivers
+             # both the "user started speaking" event and the first transcript simultaneously,
+             # making this timing measurement meaningless in this context.
+             # await self.stop_ttfb_metrics()
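
The new service is configured through its nested InputParams and constructor, both shown in full above; `start()` then serializes those values into the WebSocket query string (repeating `keyterm` and `tag` once per value). A minimal sketch of wiring it up follows. It assumes the wheel is installed and `DEEPGRAM_API_KEY` is set, and it assumes the `on_connected`/`on_disconnected` event names fired via `_call_event_handler` above can be subscribed to with Pipecat's usual `event_handler` decorator, as with other websocket services:

    import os

    from pipecat.services.deepgram.flux.stt import DeepgramFluxSTTService

    # Eager end-of-turn events are off unless eager_eot_threshold is set.
    params = DeepgramFluxSTTService.InputParams(
        eager_eot_threshold=0.5,  # lower = faster, more speculative turn ends
        eot_threshold=0.8,        # higher = wait for more confident final turn ends
        keyterm=["Pipecat", "Deepgram"],
    )

    stt = DeepgramFluxSTTService(
        api_key=os.environ["DEEPGRAM_API_KEY"],
        params=params,
    )

    # Assumed to work like other Pipecat websocket services' event handlers.
    @stt.event_handler("on_connected")
    async def on_connected(service):
        print("Connected to Deepgram Flux")

As the comments in `_handle_eager_end_of_turn` note, EagerEndOfTurn transcripts surface as InterimTranscriptionFrame, so applications can tap them downstream. A hedged sketch of such a consumer (`EagerTurnTap` is a hypothetical name, not part of this package):

    from pipecat.frames.frames import Frame, InterimTranscriptionFrame
    from pipecat.processors.frame_processor import FrameDirection, FrameProcessor

    class EagerTurnTap(FrameProcessor):
        """Watches interim frames that Flux emits on EagerEndOfTurn events."""

        async def process_frame(self, frame: Frame, direction: FrameDirection):
            await super().process_frame(frame, direction)
            if isinstance(frame, InterimTranscriptionFrame):
                # Speculative LLM/TTS work could start here; it would need to be
                # cancelled if a TurnResumed event follows (not yet built in).
                print(f"eager transcript: {frame.text}")
            await self.push_frame(frame, direction)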
pipecat/services/elevenlabs/__init__.py
@@ -8,6 +8,9 @@ import sys
 
  from pipecat.services import DeprecatedModuleProxy
 
+ from .stt import *
  from .tts import *
+ from .stt import *
+ # Old
 
- sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.tts")
+ sys.modules[__name__] = DeprecatedModuleProxy(globals(), "elevenlabs", "elevenlabs.[stt,tts]")
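
This hunk pairs with the new `pipecat/services/elevenlabs/stt.py` (+339) above: the package `__init__` now re-exports both the STT and TTS modules, and the DeprecatedModuleProxy message points at both submodules. A short sketch of the resulting import styles; `ElevenLabsTTSService` is the existing class in `elevenlabs/tts.py`, while the class name exported by the new `elevenlabs/stt.py` is not shown in this diff and is assumed here:

    # Preferred: import from the concrete submodules.
    from pipecat.services.elevenlabs.tts import ElevenLabsTTSService
    from pipecat.services.elevenlabs.stt import ElevenLabsSTTService  # assumed name

    # Deprecated but still working: the proxy re-exports the names pulled in by
    # the `from .stt import *` / `from .tts import *` lines above and warns,
    # directing callers to elevenlabs.[stt,tts].
    from pipecat.services.elevenlabs import ElevenLabsTTSService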