vision-agents-plugins-deepgram 0.1.9__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of vision-agents-plugins-deepgram might be problematic.
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.12}/.gitignore +1 -0
- vision_agents_plugins_deepgram-0.1.12/PKG-INFO +30 -0
- vision_agents_plugins_deepgram-0.1.12/README.md +15 -0
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.12}/pyproject.toml +1 -1
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.12}/vision_agents/plugins/deepgram/__init__.py +1 -1
- vision_agents_plugins_deepgram-0.1.12/vision_agents/plugins/deepgram/deepgram_stt.py +270 -0
- vision_agents_plugins_deepgram-0.1.9/PKG-INFO +0 -69
- vision_agents_plugins_deepgram-0.1.9/README.md +0 -54
- vision_agents_plugins_deepgram-0.1.9/vision_agents/plugins/deepgram/stt.py +0 -349
- vision_agents_plugins_deepgram-0.1.9/vision_agents/plugins/deepgram/utils.py +0 -18
--- /dev/null
+++ vision_agents_plugins_deepgram-0.1.12/PKG-INFO
@@ -0,0 +1,30 @@
+Metadata-Version: 2.4
+Name: vision-agents-plugins-deepgram
+Version: 0.1.12
+Summary: Deepgram STT integration for Vision Agents
+Project-URL: Documentation, https://visionagents.ai/
+Project-URL: Website, https://visionagents.ai/
+Project-URL: Source, https://github.com/GetStream/Vision-Agents
+License-Expression: MIT
+Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
+Requires-Python: >=3.10
+Requires-Dist: deepgram-sdk==5.2.0
+Requires-Dist: numpy<2.3,>=2.2.6
+Requires-Dist: vision-agents
+Description-Content-Type: text/markdown
+
+# Deepgram Speech-to-Text Plugin
+
+A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
+
+## Installation
+
+```bash
+uv add vision-agents-plugins-deepgram
+```
+
+## Docs
+
+- https://developers.deepgram.com/docs/flux/quickstart
+- https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+- https://github.com/deepgram/deepgram-python-sdk/tree/main
--- /dev/null
+++ vision_agents_plugins_deepgram-0.1.12/README.md
@@ -0,0 +1,15 @@
+# Deepgram Speech-to-Text Plugin
+
+A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
+
+## Installation
+
+```bash
+uv add vision-agents-plugins-deepgram
+```
+
+## Docs
+
+- https://developers.deepgram.com/docs/flux/quickstart
+- https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+- https://github.com/deepgram/deepgram-python-sdk/tree/main
--- /dev/null
+++ vision_agents_plugins_deepgram-0.1.12/vision_agents/plugins/deepgram/deepgram_stt.py
@@ -0,0 +1,270 @@
+import asyncio
+import logging
+import os
+from typing import Optional, Any
+
+from deepgram import AsyncDeepgramClient
+from deepgram.core import EventType
+from deepgram.extensions.types.sockets import ListenV2ControlMessage
+from deepgram.listen.v2.socket_client import AsyncV2SocketClient
+from getstream.video.rtc.track_util import PcmData
+
+from vision_agents.core import stt
+from vision_agents.core.stt import TranscriptResponse
+from vision_agents.core.edge.types import Participant
+
+logger = logging.getLogger(__name__)
+
+
+class STT(stt.STT):
+    """
+    Deepgram Speech-to-Text implementation using Flux model.
+
+    - https://developers.deepgram.com/docs/flux/quickstart
+    - https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/listen/v2/connect/async.py
+    - https://github.com/deepgram/deepgram-python-sdk/tree/main
+    - https://github.com/deepgram-devs/deepgram-demos-flux-streaming-transcription/blob/main/main.py
+
+    Deepgram flux runs turn detection internally. So running turn detection in front of this is optional/not needed
+
+    - eot_threshold controls turn end sensitivity
+    - eager_eot_threshold controls eager turn ending (so you can already prepare the LLM response)
+    """
+
+    def __init__(
+        self,
+        api_key: Optional[str] = None,
+        model: str = "flux-general-en",
+        language: Optional[str] = None,
+        eot_threshold: Optional[float] = None,
+        eager_eot_threshold: Optional[float] = None,
+        client: Optional[AsyncDeepgramClient] = None,
+    ):
+        """
+        Initialize Deepgram STT.
+
+        Args:
+            api_key: Deepgram API key. If not provided, will use DEEPGRAM_API_KEY env var.
+            model: Model to use for transcription. Defaults to "flux-general-en".
+            language: Language code (e.g., "en", "es"). If not provided, auto-detection is used.
+            eot_threshold: End-of-turn threshold for determining when a turn is complete.
+            eager_eot_threshold: Eager end-of-turn threshold for faster turn detection.
+            client: Optional pre-configured AsyncDeepgramClient instance.
+        """
+        super().__init__(provider_name="deepgram")
+
+        if not api_key:
+            api_key = os.environ.get("DEEPGRAM_API_KEY")
+
+        if client is not None:
+            self.client = client
+        else:
+            # Initialize AsyncDeepgramClient with api_key as named parameter
+            if api_key:
+                self.client = AsyncDeepgramClient(api_key=api_key)
+            else:
+                self.client = AsyncDeepgramClient()
+
+        self.model = model
+        self.language = language
+        self.eot_threshold = eot_threshold
+        self.eager_eot_threshold = eager_eot_threshold
+        self._current_participant: Optional[Participant] = None
+        self.connection: Optional[AsyncV2SocketClient] = None
+        self._connection_ready = asyncio.Event()
+        self._connection_context: Optional[Any] = None
+        self._listen_task: Optional[asyncio.Task[Any]] = None
+
+    async def process_audio(
+        self,
+        pcm_data: PcmData,
+        participant: Optional[Participant] = None,
+    ):
+        """
+        Process audio data through Deepgram for transcription.
+
+        This method sends audio to the existing WebSocket connection. The connection
+        is started automatically on first use. Audio is automatically resampled to 16kHz.
+
+        Args:
+            pcm_data: The PCM audio data to process.
+            participant: Optional participant metadata (currently not used in streaming mode).
+        """
+        if self.closed:
+            logger.warning("Deepgram STT is closed, ignoring audio")
+            return
+
+        # Wait for connection to be ready
+        await self._connection_ready.wait()
+
+        # Double-check connection is still ready (could have closed while waiting)
+        if not self._connection_ready.is_set():
+            logger.warning("Deepgram connection closed while processing audio")
+            return
+
+        # Resample to 16kHz mono (recommended by Deepgram)
+        resampled_pcm = pcm_data.resample(16_000, 1)
+
+        # Convert int16 samples to bytes
+        audio_bytes = resampled_pcm.samples.tobytes()
+
+        self._current_participant = participant
+
+        if self.connection is not None:
+            await self.connection.send_media(audio_bytes)
+
+    async def start(self):
+        """
+        Start the Deepgram WebSocket connection and begin listening for transcripts.
+        """
+        if self.connection is not None:
+            logger.warning("Deepgram connection already started")
+            return
+
+        # Build connection parameters
+        connect_params = {
+            "model": self.model,
+            "encoding": "linear16",
+            "sample_rate": "16000",
+        }
+
+        # Add optional parameters if specified
+        if self.eot_threshold is not None:
+            connect_params["eot_threshold"] = str(self.eot_threshold)
+        if self.eager_eot_threshold is not None:
+            connect_params["eager_eot_threshold"] = str(self.eager_eot_threshold)
+
+        # Connect to Deepgram v2 listen WebSocket with timeout
+        self._connection_context = self.client.listen.v2.connect(**connect_params)
+
+        # Add timeout for connection establishment
+        self.connection = await asyncio.wait_for(
+            self._connection_context.__aenter__(),
+            timeout=10.0
+        )
+
+        # Register event handlers
+        if self.connection is not None:
+            self.connection.on(EventType.OPEN, self._on_open)
+            self.connection.on(EventType.MESSAGE, self._on_message)
+            self.connection.on(EventType.ERROR, self._on_error)
+            self.connection.on(EventType.CLOSE, self._on_close)
+
+            # Start listening for events
+            self._listen_task = asyncio.create_task(self.connection.start_listening())
+
+            # Mark connection as ready
+            self._connection_ready.set()
+
+    def _on_message(self, message):
+        """
+        Event handler for messages from Deepgram.
+
+        Args:
+            message: The message object from Deepgram
+        """
+        # Extract message data
+        if not hasattr(message, "type"):
+            logger.warning(f"Received message without 'type' attribute: {message}")
+            return
+
+        # Handle TurnInfo messages (v2 API)
+        if message.type == "TurnInfo":
+            # Extract transcript text
+            transcript_text = getattr(message, "transcript", "").strip()
+
+            if not transcript_text:
+                return
+
+            # Get event type to determine if final or partial
+            # "StartOfTurn" and "Update" = partial, "EndOfTurn" = final
+            event = getattr(message, "event", "")
+
+            is_final = event == "EndOfTurn"
+
+            # Get end of turn confidence
+            end_of_turn_confidence = getattr(message, "end_of_turn_confidence", 0.0)
+
+            # Calculate average confidence from words
+            words = getattr(message, "words", [])
+            if words:
+                confidences = [w.confidence for w in words if hasattr(w, "confidence")]
+                avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+            else:
+                avg_confidence = 0.0
+
+            # Get audio duration
+            audio_window_end = getattr(message, "audio_window_end", 0.0)
+            duration_ms = int(audio_window_end * 1000)
+
+            # Build response metadata
+            response_metadata = TranscriptResponse(
+                confidence=avg_confidence,
+                language=self.language or "auto",
+                audio_duration_ms=duration_ms,
+                model_name=self.model,
+                other={
+                    "end_of_turn_confidence": end_of_turn_confidence,
+                    "turn_index": getattr(message, "turn_index", None),
+                    "event": event,
+                }
+            )
+
+            # Use the participant from the most recent process_audio call
+            participant = self._current_participant
+
+            if participant is None:
+                logger.warning("Received transcript but no participant set")
+                return
+
+            if is_final:
+                # Final transcript (event == "EndOfTurn")
+                self._emit_transcript_event(
+                    transcript_text, participant, response_metadata
+                )
+            else:
+                # Partial transcript (event == "StartOfTurn" or "Update")
+                self._emit_partial_transcript_event(
+                    transcript_text, participant, response_metadata
+                )
+
+    def _on_open(self, message):
+        pass
+
+    def _on_error(self, error):
+        """
+        Event handler for errors from Deepgram.
+
+        Args:
+            error: The error from Deepgram
+        """
+        logger.error(f"Deepgram WebSocket error: {error}")
+        raise Exception(f"Deepgram WebSocket error {error}")
+
+    def _on_close(self, error):
+        """
+        Event handler for connection close.
+        """
+        logger.warning(f"Deepgram WebSocket connection closed: {error}")
+        self._connection_ready.clear()
+
+    async def close(self):
+        """
+        Close the Deepgram connection and clean up resources.
+        """
+        # Mark as closed first
+        await super().close()
+
+        # Cancel listen task
+        if self._listen_task and not self._listen_task.done():
+            self._listen_task.cancel()
+            await asyncio.gather(self._listen_task, return_exceptions=True)
+
+        # Close connection
+        if self.connection and self._connection_context:
+            close_msg = ListenV2ControlMessage(type="CloseStream")
+            await self.connection.send_control(close_msg)
+            await self._connection_context.__aexit__(None, None, None)
+        self.connection = None
+        self._connection_context = None
+        self._connection_ready.clear()
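Since the 0.1.12 README above drops the usage example that shipped with 0.1.9, here is a minimal sketch of driving the new Flux-based class, inferred from the constructor, `start()`, `process_audio()`, and `close()` in this hunk. The `@stt.on(...)` decorator and the handler signature are assumptions carried over from the 0.1.9 README and from the `_emit_transcript_event(text, participant, metadata)` calls; the threshold values are purely illustrative.

```python
import asyncio

from vision_agents.plugins import deepgram


async def main() -> None:
    # api_key falls back to the DEEPGRAM_API_KEY environment variable.
    stt = deepgram.STT(
        model="flux-general-en",
        eot_threshold=0.7,        # illustrative: end-of-turn sensitivity
        eager_eot_threshold=0.5,  # illustrative: earlier "eager" end-of-turn
    )

    # Assumed event API, carried over from the 0.1.9 README; the new class
    # still calls _emit_transcript_event / _emit_partial_transcript_event.
    @stt.on("partial_transcript")
    def on_partial(text, participant, metadata):
        print(f"partial: {text}")

    @stt.on("transcript")
    def on_final(text, participant, metadata):
        print(f"final: {text}")

    await stt.start()
    # Feed PcmData frames as they arrive, e.g. from a call track:
    # await stt.process_audio(pcm_data, participant)
    await stt.close()


asyncio.run(main())
```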
--- vision_agents_plugins_deepgram-0.1.9/PKG-INFO
+++ /dev/null
@@ -1,69 +0,0 @@
-Metadata-Version: 2.4
-Name: vision-agents-plugins-deepgram
-Version: 0.1.9
-Summary: Deepgram STT integration for Vision Agents
-Project-URL: Documentation, https://visionagents.ai/
-Project-URL: Website, https://visionagents.ai/
-Project-URL: Source, https://github.com/GetStream/Vision-Agents
-License-Expression: MIT
-Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
-Requires-Python: >=3.10
-Requires-Dist: deepgram-sdk<5.1,>=5.0.0
-Requires-Dist: numpy<2.3,>=2.2.6
-Requires-Dist: vision-agents
-Description-Content-Type: text/markdown
-
-# Deepgram Speech-to-Text Plugin
-
-A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
-
-## Installation
-
-```bash
-uv add vision-agents-plugins-deepgram
-```
-
-## Usage
-
-```python
-from vision_agents.plugins import deepgram
-from getstream.video.rtc.track_util import PcmData
-
-# Initialize with API key from environment variable
-stt = deepgram.STT()
-
-# Or specify API key directly
-stt = deepgram.STT(api_key="your_deepgram_api_key")
-
-# Register event handlers
-@stt.on("transcript")
-def on_transcript(text, user, metadata):
-    print(f"Final transcript from {user}: {text}")
-
-@stt.on("partial_transcript")
-def on_partial(text, user, metadata):
-    print(f"Partial transcript from {user}: {text}")
-
-# Process audio
-pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
-await stt.process_audio(pcm_data)
-
-# When done
-await stt.close()
-```
-
-## Configuration Options
-
-- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
-- `options`: Deepgram options for configuring the transcription.
-  See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
-- `sample_rate`: Sample rate of the audio in Hz (default: 16000)
-- `language`: Language code for transcription (default: "en-US")
-- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
-- `connection_timeout`: Timeout to wait for the Deepgram connection to be established before skipping the in seconds to send keep-alive messages (default: 15.0s)
-
-## Requirements
-
-- Python 3.10+
-- deepgram-sdk>=5.0.0,<5.1
-- numpy>=2.2.6,<2.3
--- vision_agents_plugins_deepgram-0.1.9/README.md
+++ /dev/null
@@ -1,54 +0,0 @@
-# Deepgram Speech-to-Text Plugin
-
-A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
-
-## Installation
-
-```bash
-uv add vision-agents-plugins-deepgram
-```
-
-## Usage
-
-```python
-from vision_agents.plugins import deepgram
-from getstream.video.rtc.track_util import PcmData
-
-# Initialize with API key from environment variable
-stt = deepgram.STT()
-
-# Or specify API key directly
-stt = deepgram.STT(api_key="your_deepgram_api_key")
-
-# Register event handlers
-@stt.on("transcript")
-def on_transcript(text, user, metadata):
-    print(f"Final transcript from {user}: {text}")
-
-@stt.on("partial_transcript")
-def on_partial(text, user, metadata):
-    print(f"Partial transcript from {user}: {text}")
-
-# Process audio
-pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
-await stt.process_audio(pcm_data)
-
-# When done
-await stt.close()
-```
-
-## Configuration Options
-
-- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
-- `options`: Deepgram options for configuring the transcription.
-  See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
-- `sample_rate`: Sample rate of the audio in Hz (default: 16000)
-- `language`: Language code for transcription (default: "en-US")
-- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
-- `connection_timeout`: Timeout to wait for the Deepgram connection to be established before skipping the in seconds to send keep-alive messages (default: 15.0s)
-
-## Requirements
-
-- Python 3.10+
-- deepgram-sdk>=5.0.0,<5.1
-- numpy>=2.2.6,<2.3
--- vision_agents_plugins_deepgram-0.1.9/vision_agents/plugins/deepgram/stt.py
+++ /dev/null
@@ -1,349 +0,0 @@
-import asyncio
-import contextlib
-import logging
-import os
-import time
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
-
-import numpy as np
-import websockets
-from deepgram import AsyncDeepgramClient
-from deepgram.core.events import EventType
-from deepgram.extensions.types.sockets import (
-    ListenV1ControlMessage,
-    ListenV1MetadataEvent,
-    ListenV1ResultsEvent,
-    ListenV1SpeechStartedEvent,
-    ListenV1UtteranceEndEvent,
-)
-from deepgram.listen.v1.socket_client import AsyncV1SocketClient
-from getstream.video.rtc.track_util import PcmData
-
-from vision_agents.core import stt
-
-from .utils import generate_silence
-
-if TYPE_CHECKING:
-    from vision_agents.core.edge.types import Participant
-
-logger = logging.getLogger(__name__)
-
-
-class STT(stt.STT):
-    """
-    Deepgram-based Speech-to-Text implementation.
-
-    This implementation operates in asynchronous mode - it receives streaming transcripts
-    from Deepgram's WebSocket connection and emits events immediately as they arrive,
-    providing real-time responsiveness for live transcription scenarios.
-
-    Events:
-        - transcript: Emitted when a complete transcript is available.
-          Args: text (str), user_metadata (dict), metadata (dict)
-        - partial_transcript: Emitted when a partial transcript is available.
-          Args: text (str), user_metadata (dict), metadata (dict)
-        - error: Emitted when an error occurs during transcription.
-          Args: error (Exception)
-    """
-
-    def __init__(
-        self,
-        api_key: Optional[str] = None,
-        options: Optional[dict] = None,
-        sample_rate: int = 48000,
-        language: str = "en-US",
-        interim_results: bool = True,
-        client: Optional[AsyncDeepgramClient] = None,
-        keep_alive_interval: float = 1.0,
-        connection_timeout: float = 15.0,
-    ):
-        """
-        Initialize the Deepgram STT service.
-
-        Args:
-            api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY
-                environment variable will be used automatically.
-            options: Deepgram live transcription options
-            sample_rate: Sample rate of the audio in Hz (default: 48000)
-            language: Language code for transcription
-            interim_results: Whether to emit interim results (partial transcripts with the partial_transcript event).
-            connection_timeout: Time to wait for the Deepgram connection to be established.
-
-        """
-        super().__init__(sample_rate=sample_rate)
-
-        # If no API key was provided, check for DEEPGRAM_API_KEY in environment
-        if api_key is None:
-            api_key = os.environ.get("DEEPGRAM_API_KEY")
-            if not api_key:
-                logger.warning(
-                    "No API key provided and DEEPGRAM_API_KEY environment variable not found."
-                )
-
-        # Initialize DeepgramClient with the API key
-        logger.info("Initializing Deepgram client")
-        self.deepgram = (
-            client if client is not None else AsyncDeepgramClient(api_key=api_key)
-        )
-        self.dg_connection: Optional[AsyncV1SocketClient] = None
-
-        self.options = options or {
-            "model": "nova-2",
-            "language": language,
-            "encoding": "linear16",
-            "sample_rate": sample_rate,
-            "channels": 1,
-            "interim_results": interim_results,
-        }
-
-        # Track current user context for associating transcripts with users
-        self._current_user: Optional[Dict[str, Any]] = None
-
-        # Generate a silence audio to use as keep-alive message
-        self._keep_alive_data = generate_silence(
-            sample_rate=sample_rate, duration_ms=10
-        )
-        self._keep_alive_interval = keep_alive_interval
-
-        self._stack = contextlib.AsyncExitStack()
-        # An event to detect that the connection was established once.
-        self._connected_once = asyncio.Event()
-        # Time to wait for connection to be established before sending the event
-        self._connection_timeout = connection_timeout
-        self._last_sent_at = float("-inf")
-        # Lock to prevent concurrent connection opening
-        self._connect_lock = asyncio.Lock()
-
-        # Start the listener loop in the background
-        asyncio.create_task(self.start())
-
-    async def start(self):
-        """
-        Start the main task establishing the Deepgram connection and processing the events.
-        """
-        if self._is_closed:
-            logger.warning("Cannot setup connection - Deepgram instance is closed")
-            return None
-
-        # Establish a Deepgram connection.
-        # Use a lock to make sure it's established only once
-        async with self._connect_lock:
-            if self.dg_connection is not None:
-                logger.debug("Connection already set up, skipping initialization")
-                return None
-
-            try:
-                logger.info("Creating a Deepgram connection with options %s", self.options)
-                dg_connection = await self._stack.enter_async_context(
-                    self.deepgram.listen.v1.connect(**self.options)
-                )
-            except Exception as e:
-                # Log the error and set connection to None
-                logger.exception("Error setting up Deepgram connection")
-                self.dg_connection = None
-                # Emit error immediately
-                self._emit_error_event(e, "Deepgram connection setup")
-                raise
-            finally:
-                self._connected_once.set()
-
-            self.dg_connection = dg_connection
-            # Start the keep-alive loop to keep the connection open
-            asyncio.create_task(self._keepalive_loop())
-
-            # Register event handlers
-            self.dg_connection.on(
-                EventType.OPEN,
-                lambda msg: logger.debug(f"Deepgram connection opened. message={msg}"),
-            )
-            self.dg_connection.on(EventType.CLOSE, self._on_connection_close)
-            self.dg_connection.on(EventType.ERROR, self._on_connection_error)
-            self.dg_connection.on(EventType.MESSAGE, self._on_message)
-
-            # Start processing the events from Deepgram.
-            # This is a blocking call.
-            logger.debug("Listening to the events from a Deepgram connection")
-            await self.dg_connection.start_listening()
-            return None
-
-    async def started(self):
-        """
-        Wait until the Deepgram connection is established.
-        """
-        if self._connected_once.is_set():
-            return
-
-        await asyncio.wait_for(
-            self._connected_once.wait(), timeout=self._connection_timeout
-        )
-
-    async def close(self):
-        """Close the Deepgram connection and clean up resources."""
-        if self._is_closed:
-            logger.debug("Deepgram STT service already closed")
-            return
-
-        logger.info("Closing Deepgram STT service")
-        self._is_closed = True
-
-        # Close the Deepgram connection if it exists
-        if self.dg_connection:
-            logger.debug("Closing Deepgram connection")
-            try:
-                await self.dg_connection.send_control(
-                    ListenV1ControlMessage(type="CloseStream")
-                )
-                await self._stack.aclose()
-                self.dg_connection = None
-            except Exception:
-                logger.exception("Error closing Deepgram connection")
-
-    async def _on_message(
-        self,
-        message: ListenV1ResultsEvent
-        | ListenV1MetadataEvent
-        | ListenV1UtteranceEndEvent
-        | ListenV1SpeechStartedEvent,
-    ):
-        if message.type != "Results":
-            logger.debug(
-                "Received non-transcript message, skip processing. message=%s", message
-            )
-            return
-
-        transcript = message.dict()
-
-        # Get the transcript text from the response
-        alternatives = transcript.get("channel", {}).get("alternatives", [])
-        if not alternatives:
-            return
-
-        transcript_text = alternatives[0].get("transcript", "")
-        if not transcript_text:
-            return
-
-        # Check if this is a final result
-        is_final = transcript.get("is_final", False)
-
-        # Create metadata with useful information
-        metadata = {
-            "confidence": alternatives[0].get("confidence", 0),
-            "words": alternatives[0].get("words", []),
-            "is_final": is_final,
-            "channel_index": transcript.get("channel_index", 0),
-        }
-
-        # Emit immediately for real-time responsiveness
-        if is_final:
-            self._emit_transcript_event(transcript_text, self._current_user, metadata)
-        else:
-            self._emit_partial_transcript_event(
-                transcript_text, self._current_user, metadata
-            )
-
-        logger.debug(
-            "Received transcript",
-            extra={
-                "is_final": is_final,
-                "text_length": len(transcript_text),
-                "confidence": metadata["confidence"],
-            },
-        )
-
-    async def _on_connection_error(self, error: websockets.WebSocketException):
-        error_text = str(error) if error is not None else "Unknown error"
-        logger.error("Deepgram error received: %s", error_text)
-        # Emit error immediately
-        error_obj = Exception(f"Deepgram error: {error_text}")
-        self._emit_error_event(error_obj, "Deepgram connection")
-
-    async def _on_connection_close(self, message: Any):
-        logger.warning(f"Deepgram connection closed. message={message}")
-        await self.close()
-
-    async def _process_audio_impl(
-        self,
-        pcm_data: PcmData,
-        user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None,
-    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
-        """
-        Process audio data through Deepgram for transcription.
-
-        Args:
-            pcm_data: The PCM audio data to process.
-            user_metadata: Additional metadata about the user or session.
-
-        Returns:
-            None - Deepgram operates in asynchronous mode and emits events directly
-            when transcripts arrive from the streaming service.
-        """
-        if self._is_closed:
-            logger.warning("Deepgram connection is closed, ignoring audio")
-            return None
-
-        # Store the current user context for transcript events
-        self._current_user = user_metadata  # type: ignore[assignment]
-
-        # Check if the input sample rate matches the expected sample rate
-        if pcm_data.sample_rate != self.sample_rate:
-            logger.warning(
-                "Input audio sample rate (%s Hz) does not match the expected sample rate (%s Hz). "
-                "This may result in incorrect transcriptions. Consider resampling the audio.",
-                pcm_data.sample_rate,
-                self.sample_rate,
-            )
-
-        # Convert PCM data to bytes if needed
-        audio_data = pcm_data.samples
-        if not isinstance(audio_data, bytes):
-            # Convert numpy array to bytes
-            audio_data = audio_data.astype(np.int16).tobytes()
-
-        # Wait for the attempt to establish the connection
-        try:
-            await self.started()
-        except asyncio.TimeoutError:
-            logger.error(
-                f"Deepgram connection is not established within {self._connection_timeout} seconds. "
-                f"Skipping the audio package."
-            )
-            return None
-
-        # Send the audio data to Deepgram
-        logger.debug(
-            "Sending audio data to Deepgram",
-            extra={"audio_bytes": len(audio_data)},
-        )
-        await self._send_audio(audio_data)
-        return None
-
-    async def _send_audio(self, data: bytes):
-        if self.dg_connection is None:
-            logger.warning("Deepgram connection is not established")
-            return
-
-        try:
-            await self.dg_connection.send_media(data)
-            self._last_sent_at = time.time()
-        except Exception as e:
-            # Raise exception to be handled by base class
-            raise Exception(f"Deepgram audio transmission error: {e}") from e
-
-    async def _keepalive_loop(self):
-        """
-        Send the silence audio every `interval` seconds
-        to prevent Deepgram from closing the connection.
-        """
-        while not self._is_closed and self.dg_connection is not None:
-            if self._last_sent_at + self._keep_alive_interval <= time.time():
-                logger.debug("Sending keepalive packet to Deepgram...")
-                # Send audio silence to keep the connection open
-                await self._send_audio(self._keep_alive_data)
-                # Send keep-alive message as well
-                await self.dg_connection.send_control(
-                    ListenV1ControlMessage(type="KeepAlive")
-                )
-
-            # Sleep max for 1s to avoid missing the keep-alive schedule
-            timeout = min(self._keep_alive_interval, 1.0)
-            await asyncio.sleep(timeout)
--- vision_agents_plugins_deepgram-0.1.9/vision_agents/plugins/deepgram/utils.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import numpy as np
-
-
-def generate_silence(sample_rate: int, duration_ms: int) -> bytes:
-    """
-    Generate a silence of the given sample_rate and duration_ms.
-    """
-    # Audio parameters
-    channels = 1
-    sample_format = np.int16  # 16-bit signed PCM
-
-    # Number of samples = sample_rate * duration_seconds
-    num_samples = int(sample_rate * (duration_ms / 1000.0))
-
-    # Create silence raw bytes (s16 mono PCM)
-    pcm_bytes = np.zeros((num_samples, channels), dtype=sample_format).tobytes()
-    return pcm_bytes
-
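For readers comparing the two revisions, a hedged before/after sketch of the construction changes implied by this diff (the 0.1.9 call is shown commented out; parameter values are illustrative):

```python
import asyncio

from vision_agents.plugins import deepgram


async def migrate() -> None:
    # 0.1.9 (Listen v1, nova-2): the caller declared the inbound sample rate,
    # and the connection was opened from __init__ via asyncio.create_task(self.start()).
    # stt = deepgram.STT(sample_rate=48000, language="en-US", interim_results=True)

    # 0.1.12 (Listen v2, Flux): start() is awaited explicitly, audio is resampled
    # to 16 kHz inside process_audio, and turn detection moves into the Flux model,
    # so the keep-alive loop and the generate_silence() helper are gone.
    stt = deepgram.STT(model="flux-general-en", eot_threshold=0.7)
    await stt.start()
    await stt.close()


asyncio.run(migrate())
```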