vision-agents-plugins-deepgram 0.1.3__tar.gz → 0.1.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vision-agents-plugins-deepgram might be problematic. Click here for more details.
- {vision_agents_plugins_deepgram-0.1.3 → vision_agents_plugins_deepgram-0.1.6}/PKG-INFO +15 -11
- vision_agents_plugins_deepgram-0.1.6/README.md +54 -0
- {vision_agents_plugins_deepgram-0.1.3 → vision_agents_plugins_deepgram-0.1.6}/pyproject.toml +1 -1
- vision_agents_plugins_deepgram-0.1.6/vision_agents/plugins/deepgram/stt.py +349 -0
- vision_agents_plugins_deepgram-0.1.6/vision_agents/plugins/deepgram/utils.py +18 -0
- vision_agents_plugins_deepgram-0.1.3/README.md +0 -50
- vision_agents_plugins_deepgram-0.1.3/vision_agents/plugins/deepgram/stt.py +0 -276
- {vision_agents_plugins_deepgram-0.1.3 → vision_agents_plugins_deepgram-0.1.6}/.gitignore +0 -0
- {vision_agents_plugins_deepgram-0.1.3 → vision_agents_plugins_deepgram-0.1.6}/vision_agents/plugins/deepgram/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vision-agents-plugins-deepgram
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Deepgram STT integration for Vision Agents
|
|
5
5
|
Project-URL: Documentation, https://visionagents.ai/
|
|
6
6
|
Project-URL: Website, https://visionagents.ai/
|
|
@@ -8,31 +8,32 @@ Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
|
8
8
|
License-Expression: MIT
|
|
9
9
|
Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
|
|
10
10
|
Requires-Python: >=3.10
|
|
11
|
-
Requires-Dist: deepgram-sdk
|
|
11
|
+
Requires-Dist: deepgram-sdk<5.1,>=5.0.0
|
|
12
12
|
Requires-Dist: numpy<2.3,>=2.2.6
|
|
13
13
|
Requires-Dist: vision-agents
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
|
|
16
16
|
# Deepgram Speech-to-Text Plugin
|
|
17
17
|
|
|
18
|
-
A high-quality Speech-to-Text (STT) plugin for
|
|
18
|
+
A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
|
|
19
19
|
|
|
20
20
|
## Installation
|
|
21
21
|
|
|
22
22
|
```bash
|
|
23
|
-
|
|
23
|
+
uv add vision-agents-plugins-deepgram
|
|
24
24
|
```
|
|
25
25
|
|
|
26
26
|
## Usage
|
|
27
27
|
|
|
28
28
|
```python
|
|
29
|
-
from
|
|
29
|
+
from vision_agents.plugins import deepgram
|
|
30
|
+
from getstream.video.rtc.track_util import PcmData
|
|
30
31
|
|
|
31
32
|
# Initialize with API key from environment variable
|
|
32
|
-
stt =
|
|
33
|
+
stt = deepgram.STT()
|
|
33
34
|
|
|
34
35
|
# Or specify API key directly
|
|
35
|
-
stt =
|
|
36
|
+
stt = deepgram.STT(api_key="your_deepgram_api_key")
|
|
36
37
|
|
|
37
38
|
# Register event handlers
|
|
38
39
|
@stt.on("transcript")
|
|
@@ -44,6 +45,7 @@ def on_partial(text, user, metadata):
|
|
|
44
45
|
print(f"Partial transcript from {user}: {text}")
|
|
45
46
|
|
|
46
47
|
# Process audio
|
|
48
|
+
pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
|
|
47
49
|
await stt.process_audio(pcm_data)
|
|
48
50
|
|
|
49
51
|
# When done
|
|
@@ -52,14 +54,16 @@ await stt.close()
|
|
|
52
54
|
|
|
53
55
|
## Configuration Options
|
|
54
56
|
|
|
55
|
-
- `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
|
|
56
|
-
- `options`: Deepgram
|
|
57
|
+
- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
|
|
58
|
+
- `options`: Deepgram options for configuring the transcription.
|
|
59
|
+
See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
|
|
57
60
|
- `sample_rate`: Sample rate of the audio in Hz (default: 48000)
|
|
58
61
|
- `language`: Language code for transcription (default: "en-US")
|
|
59
|
-
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default:
|
|
62
|
+
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
|
|
63
|
+
- `connection_timeout`: Timeout in seconds to wait for the Deepgram connection to be established before skipping an audio packet (default: 15.0s)
|
|
60
64
|
|
|
61
65
|
## Requirements
|
|
62
66
|
|
|
63
67
|
- Python 3.10+
|
|
64
|
-
- deepgram-sdk>=
|
|
68
|
+
- deepgram-sdk>=5.0.0,<5.1
|
|
65
69
|
- numpy>=2.2.6,<2.3
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# Deepgram Speech-to-Text Plugin
|
|
2
|
+
|
|
3
|
+
A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
uv add vision-agents-plugins-deepgram
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from vision_agents.plugins import deepgram
|
|
15
|
+
from getstream.video.rtc.track_util import PcmData
|
|
16
|
+
|
|
17
|
+
# Initialize with API key from environment variable
|
|
18
|
+
stt = deepgram.STT()
|
|
19
|
+
|
|
20
|
+
# Or specify API key directly
|
|
21
|
+
stt = deepgram.STT(api_key="your_deepgram_api_key")
|
|
22
|
+
|
|
23
|
+
# Register event handlers
|
|
24
|
+
@stt.on("transcript")
|
|
25
|
+
def on_transcript(text, user, metadata):
|
|
26
|
+
print(f"Final transcript from {user}: {text}")
|
|
27
|
+
|
|
28
|
+
@stt.on("partial_transcript")
|
|
29
|
+
def on_partial(text, user, metadata):
|
|
30
|
+
print(f"Partial transcript from {user}: {text}")
|
|
31
|
+
|
|
32
|
+
# Process audio
|
|
33
|
+
pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
|
|
34
|
+
await stt.process_audio(pcm_data)
|
|
35
|
+
|
|
36
|
+
# When done
|
|
37
|
+
await stt.close()
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Configuration Options
|
|
41
|
+
|
|
42
|
+
- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
|
|
43
|
+
- `options`: Deepgram options for configuring the transcription.
|
|
44
|
+
See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
|
|
45
|
+
- `sample_rate`: Sample rate of the audio in Hz (default: 48000)
|
|
46
|
+
- `language`: Language code for transcription (default: "en-US")
|
|
47
|
+
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
|
|
48
|
+
- `connection_timeout`: Timeout in seconds to wait for the Deepgram connection to be established before skipping an audio packet (default: 15.0s)
|
|
49
|
+
|
|
50
|
+
## Requirements
|
|
51
|
+
|
|
52
|
+
- Python 3.10+
|
|
53
|
+
- deepgram-sdk>=5.0.0,<5.1
|
|
54
|
+
- numpy>=2.2.6,<2.3
|
|
@@ -0,0 +1,349 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import contextlib
|
|
3
|
+
import logging
|
|
4
|
+
import os
|
|
5
|
+
import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import websockets
|
|
10
|
+
from deepgram import AsyncDeepgramClient
|
|
11
|
+
from deepgram.core.events import EventType
|
|
12
|
+
from deepgram.extensions.types.sockets import (
|
|
13
|
+
ListenV1ControlMessage,
|
|
14
|
+
ListenV1MetadataEvent,
|
|
15
|
+
ListenV1ResultsEvent,
|
|
16
|
+
ListenV1SpeechStartedEvent,
|
|
17
|
+
ListenV1UtteranceEndEvent,
|
|
18
|
+
)
|
|
19
|
+
from deepgram.listen.v1.socket_client import AsyncV1SocketClient
|
|
20
|
+
from getstream.video.rtc.track_util import PcmData
|
|
21
|
+
|
|
22
|
+
from vision_agents.core import stt
|
|
23
|
+
|
|
24
|
+
from .utils import generate_silence
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from vision_agents.core.edge.types import Participant
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class STT(stt.STT):
    """
    Deepgram-based Speech-to-Text implementation.

    This implementation operates in asynchronous mode - it receives streaming transcripts
    from Deepgram's WebSocket connection and emits events immediately as they arrive,
    providing real-time responsiveness for live transcription scenarios.

    Events:
        - transcript: Emitted when a complete transcript is available.
          Args: text (str), user_metadata (dict), metadata (dict)
        - partial_transcript: Emitted when a partial transcript is available.
          Args: text (str), user_metadata (dict), metadata (dict)
        - error: Emitted when an error occurs during transcription.
          Args: error (Exception)
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        options: Optional[dict] = None,
        sample_rate: int = 48000,
        language: str = "en-US",
        interim_results: bool = True,
        client: Optional[AsyncDeepgramClient] = None,
        keep_alive_interval: float = 1.0,
        connection_timeout: float = 15.0,
    ):
        """
        Initialize the Deepgram STT service.

        Args:
            api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY
                environment variable will be used automatically.
            options: Deepgram live transcription options
            sample_rate: Sample rate of the audio in Hz (default: 48000)
            language: Language code for transcription
            interim_results: Whether to emit interim results (partial transcripts
                with the partial_transcript event).
            client: Optional pre-configured AsyncDeepgramClient (useful for tests).
            keep_alive_interval: Interval in seconds between keep-alive messages.
            connection_timeout: Time in seconds to wait for the Deepgram connection
                to be established before skipping an audio packet.
        """
        super().__init__(sample_rate=sample_rate)

        # If no API key was provided, check for DEEPGRAM_API_KEY in environment
        if api_key is None:
            api_key = os.environ.get("DEEPGRAM_API_KEY")
            if not api_key:
                logger.warning(
                    "No API key provided and DEEPGRAM_API_KEY environment variable not found."
                )

        # Initialize DeepgramClient with the API key
        logger.info("Initializing Deepgram client")
        self.deepgram = (
            client if client is not None else AsyncDeepgramClient(api_key=api_key)
        )
        self.dg_connection: Optional[AsyncV1SocketClient] = None

        self.options = options or {
            "model": "nova-2",
            "language": language,
            "encoding": "linear16",
            "sample_rate": sample_rate,
            "channels": 1,
            "interim_results": interim_results,
        }

        # Track current user context for associating transcripts with users
        self._current_user: Optional[Dict[str, Any]] = None

        # Generate a silence audio to use as keep-alive message
        self._keep_alive_data = generate_silence(
            sample_rate=sample_rate, duration_ms=10
        )
        self._keep_alive_interval = keep_alive_interval

        self._stack = contextlib.AsyncExitStack()
        # An event to detect that the connection was established once.
        self._connected_once = asyncio.Event()
        # Time to wait for connection to be established before sending the event
        self._connection_timeout = connection_timeout
        self._last_sent_at = float("-inf")
        # Lock to prevent concurrent connection opening
        self._connect_lock = asyncio.Lock()

        # Hold strong references to background tasks: asyncio only keeps weak
        # references to tasks, so an unreferenced task may be garbage-collected
        # before it finishes (see asyncio.create_task documentation).
        self._keepalive_task: Optional[asyncio.Task] = None
        # Start the listener loop in the background
        self._listen_task: asyncio.Task = asyncio.create_task(self.start())

    async def start(self):
        """
        Start the main task establishing the Deepgram connection and processing the events.
        """
        if self._is_closed:
            logger.warning("Cannot setup connection - Deepgram instance is closed")
            return None

        # Establish a Deepgram connection.
        # Use a lock to make sure it's established only once
        async with self._connect_lock:
            if self.dg_connection is not None:
                logger.debug("Connection already set up, skipping initialization")
                return None

            try:
                logger.info(
                    "Creating a Deepgram connection with options %s", self.options
                )
                dg_connection = await self._stack.enter_async_context(
                    self.deepgram.listen.v1.connect(**self.options)
                )
            except Exception as e:
                # Log the error and set connection to None
                logger.exception("Error setting up Deepgram connection")
                self.dg_connection = None
                # Emit error immediately
                self._emit_error_event(e, "Deepgram connection setup")
                raise
            finally:
                # Unblock started() waiters even if the connection attempt failed,
                # so callers time out on the event only once.
                self._connected_once.set()

            self.dg_connection = dg_connection
            # Start the keep-alive loop to keep the connection open.
            # Keep a reference so the task cannot be garbage-collected.
            self._keepalive_task = asyncio.create_task(self._keepalive_loop())

            # Register event handlers
            self.dg_connection.on(
                EventType.OPEN,
                lambda msg: logger.debug(f"Deepgram connection opened. message={msg}"),
            )
            self.dg_connection.on(EventType.CLOSE, self._on_connection_close)
            self.dg_connection.on(EventType.ERROR, self._on_connection_error)
            self.dg_connection.on(EventType.MESSAGE, self._on_message)

        # Start processing the events from Deepgram.
        # This is a blocking call - kept outside the lock so that concurrent
        # start() calls can return early instead of waiting on the lock forever.
        logger.debug("Listening to the events from a Deepgram connection")
        await self.dg_connection.start_listening()
        return None

    async def started(self):
        """
        Wait until the Deepgram connection is established.

        Raises:
            asyncio.TimeoutError: if the connection is not established within
                ``connection_timeout`` seconds.
        """
        if self._connected_once.is_set():
            return

        await asyncio.wait_for(
            self._connected_once.wait(), timeout=self._connection_timeout
        )

    async def close(self):
        """Close the Deepgram connection and clean up resources."""
        if self._is_closed:
            logger.debug("Deepgram STT service already closed")
            return

        logger.info("Closing Deepgram STT service")
        self._is_closed = True

        # Close the Deepgram connection if it exists
        if self.dg_connection:
            logger.debug("Closing Deepgram connection")
            try:
                await self.dg_connection.send_control(
                    ListenV1ControlMessage(type="CloseStream")
                )
                await self._stack.aclose()
            except Exception:
                logger.exception("Error closing Deepgram connection")
            finally:
                # Always drop the reference, even on failure, so the keep-alive
                # loop terminates instead of spinning on a dead connection.
                self.dg_connection = None

    async def _on_message(
        self,
        message: ListenV1ResultsEvent
        | ListenV1MetadataEvent
        | ListenV1UtteranceEndEvent
        | ListenV1SpeechStartedEvent,
    ):
        """Handle an incoming Deepgram socket event; emits transcript events."""
        if message.type != "Results":
            logger.debug(
                "Received non-transcript message, skip processing. message=%s", message
            )
            return

        transcript = message.dict()

        # Get the transcript text from the response
        alternatives = transcript.get("channel", {}).get("alternatives", [])
        if not alternatives:
            return

        transcript_text = alternatives[0].get("transcript", "")
        if not transcript_text:
            return

        # Check if this is a final result
        is_final = transcript.get("is_final", False)

        # Create metadata with useful information
        metadata = {
            "confidence": alternatives[0].get("confidence", 0),
            "words": alternatives[0].get("words", []),
            "is_final": is_final,
            "channel_index": transcript.get("channel_index", 0),
        }

        # Emit immediately for real-time responsiveness
        if is_final:
            self._emit_transcript_event(transcript_text, self._current_user, metadata)
        else:
            self._emit_partial_transcript_event(
                transcript_text, self._current_user, metadata
            )

        logger.debug(
            "Received transcript",
            extra={
                "is_final": is_final,
                "text_length": len(transcript_text),
                "confidence": metadata["confidence"],
            },
        )

    async def _on_connection_error(self, error: websockets.WebSocketException):
        """Log a connection-level error and re-emit it as an ``error`` event."""
        error_text = str(error) if error is not None else "Unknown error"
        logger.error("Deepgram error received: %s", error_text)
        # Emit error immediately
        error_obj = Exception(f"Deepgram error: {error_text}")
        self._emit_error_event(error_obj, "Deepgram connection")

    async def _on_connection_close(self, message: Any):
        """Handle the remote side closing the socket by shutting down this service."""
        logger.warning(f"Deepgram connection closed. message={message}")
        await self.close()

    async def _process_audio_impl(
        self,
        pcm_data: PcmData,
        user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None,
    ) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
        """
        Process audio data through Deepgram for transcription.

        Args:
            pcm_data: The PCM audio data to process.
            user_metadata: Additional metadata about the user or session.

        Returns:
            None - Deepgram operates in asynchronous mode and emits events directly
            when transcripts arrive from the streaming service.
        """
        if self._is_closed:
            logger.warning("Deepgram connection is closed, ignoring audio")
            return None

        # Store the current user context for transcript events
        self._current_user = user_metadata  # type: ignore[assignment]

        # Check if the input sample rate matches the expected sample rate
        if pcm_data.sample_rate != self.sample_rate:
            logger.warning(
                "Input audio sample rate (%s Hz) does not match the expected sample rate (%s Hz). "
                "This may result in incorrect transcriptions. Consider resampling the audio.",
                pcm_data.sample_rate,
                self.sample_rate,
            )

        # Convert PCM data to bytes if needed
        audio_data = pcm_data.samples
        if not isinstance(audio_data, bytes):
            # Convert numpy array to bytes
            audio_data = audio_data.astype(np.int16).tobytes()

        # Wait for the attempt to establish the connection
        try:
            await self.started()
        except asyncio.TimeoutError:
            logger.error(
                f"Deepgram connection is not established within {self._connection_timeout} seconds. "
                f"Skipping the audio package."
            )
            return None

        # Send the audio data to Deepgram
        logger.debug(
            "Sending audio data to Deepgram",
            extra={"audio_bytes": len(audio_data)},
        )
        await self._send_audio(audio_data)
        return None

    async def _send_audio(self, data: bytes):
        """Send raw PCM bytes over the socket, tracking the last-sent timestamp."""
        if self.dg_connection is None:
            logger.warning("Deepgram connection is not established")
            return

        try:
            await self.dg_connection.send_media(data)
            self._last_sent_at = time.time()
        except Exception as e:
            # Raise exception to be handled by base class
            raise Exception(f"Deepgram audio transmission error: {e}") from e

    async def _keepalive_loop(self):
        """
        Send the silence audio every `interval` seconds
        to prevent Deepgram from closing the connection.
        """
        while not self._is_closed and self.dg_connection is not None:
            if self._last_sent_at + self._keep_alive_interval <= time.time():
                logger.debug("Sending keepalive packet to Deepgram...")
                # Send audio silence to keep the connection open
                await self._send_audio(self._keep_alive_data)
                # Send keep-alive message as well. Re-check the connection: it
                # may have been torn down while awaiting the audio send above.
                if self.dg_connection is not None:
                    await self.dg_connection.send_control(
                        ListenV1ControlMessage(type="KeepAlive")
                    )

            # Sleep max for 1s to avoid missing the keep-alive schedule
            timeout = min(self._keep_alive_interval, 1.0)
            await asyncio.sleep(timeout)
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def generate_silence(sample_rate: int, duration_ms: int) -> bytes:
    """
    Generate raw PCM silence (16-bit signed, mono) of the given duration.

    Args:
        sample_rate: Sample rate of the audio in Hz.
        duration_ms: Duration of the silence in milliseconds.

    Returns:
        Raw s16 mono PCM bytes consisting entirely of zeros
        (2 bytes per sample).
    """
    # Number of samples = sample_rate * duration_seconds
    num_samples = int(sample_rate * (duration_ms / 1000.0))

    # bytes(n) yields n zero bytes - byte-identical to the int16 zeros buffer,
    # without allocating an intermediate numpy array just to emit zeros.
    return bytes(num_samples * 2)
|
|
18
|
+
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
# Deepgram Speech-to-Text Plugin
|
|
2
|
-
|
|
3
|
-
A high-quality Speech-to-Text (STT) plugin for GetStream that uses the Deepgram API.
|
|
4
|
-
|
|
5
|
-
## Installation
|
|
6
|
-
|
|
7
|
-
```bash
|
|
8
|
-
pip install getstream-plugins-deepgram
|
|
9
|
-
```
|
|
10
|
-
|
|
11
|
-
## Usage
|
|
12
|
-
|
|
13
|
-
```python
|
|
14
|
-
from getstream.plugins.deepgram import DeepgramSTT
|
|
15
|
-
|
|
16
|
-
# Initialize with API key from environment variable
|
|
17
|
-
stt = DeepgramSTT()
|
|
18
|
-
|
|
19
|
-
# Or specify API key directly
|
|
20
|
-
stt = DeepgramSTT(api_key="your_deepgram_api_key")
|
|
21
|
-
|
|
22
|
-
# Register event handlers
|
|
23
|
-
@stt.on("transcript")
|
|
24
|
-
def on_transcript(text, user, metadata):
|
|
25
|
-
print(f"Final transcript from {user}: {text}")
|
|
26
|
-
|
|
27
|
-
@stt.on("partial_transcript")
|
|
28
|
-
def on_partial(text, user, metadata):
|
|
29
|
-
print(f"Partial transcript from {user}: {text}")
|
|
30
|
-
|
|
31
|
-
# Process audio
|
|
32
|
-
await stt.process_audio(pcm_data)
|
|
33
|
-
|
|
34
|
-
# When done
|
|
35
|
-
await stt.close()
|
|
36
|
-
```
|
|
37
|
-
|
|
38
|
-
## Configuration Options
|
|
39
|
-
|
|
40
|
-
- `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
|
|
41
|
-
- `options`: Deepgram LiveOptions for configuring the transcription
|
|
42
|
-
- `sample_rate`: Sample rate of the audio in Hz (default: 16000)
|
|
43
|
-
- `language`: Language code for transcription (default: "en-US")
|
|
44
|
-
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 5.0)
|
|
45
|
-
|
|
46
|
-
## Requirements
|
|
47
|
-
|
|
48
|
-
- Python 3.10+
|
|
49
|
-
- deepgram-sdk>=4.5.0
|
|
50
|
-
- numpy>=2.2.6,<2.3
|
|
@@ -1,276 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import logging
|
|
3
|
-
from typing import Dict, Any, Optional, Tuple, List, Union, TYPE_CHECKING
|
|
4
|
-
|
|
5
|
-
if TYPE_CHECKING:
|
|
6
|
-
from vision_agents.core.edge.types import Participant
|
|
7
|
-
import numpy as np
|
|
8
|
-
import os
|
|
9
|
-
import time
|
|
10
|
-
|
|
11
|
-
from deepgram import DeepgramClient, LiveTranscriptionEvents, LiveOptions, DeepgramClientOptions
|
|
12
|
-
from vision_agents.core import stt
|
|
13
|
-
from getstream.video.rtc.track_util import PcmData
|
|
14
|
-
|
|
15
|
-
logger = logging.getLogger(__name__)
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
class STT(stt.STT):
|
|
19
|
-
"""
|
|
20
|
-
Deepgram-based Speech-to-Text implementation.
|
|
21
|
-
|
|
22
|
-
This implementation operates in asynchronous mode - it receives streaming transcripts
|
|
23
|
-
from Deepgram's WebSocket connection and emits events immediately as they arrive,
|
|
24
|
-
providing real-time responsiveness for live transcription scenarios.
|
|
25
|
-
|
|
26
|
-
Events:
|
|
27
|
-
- transcript: Emitted when a complete transcript is available.
|
|
28
|
-
Args: text (str), user_metadata (dict), metadata (dict)
|
|
29
|
-
- partial_transcript: Emitted when a partial transcript is available.
|
|
30
|
-
Args: text (str), user_metadata (dict), metadata (dict)
|
|
31
|
-
- error: Emitted when an error occurs during transcription.
|
|
32
|
-
Args: error (Exception)
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
def __init__(
|
|
36
|
-
self,
|
|
37
|
-
api_key: Optional[str] = None,
|
|
38
|
-
options: Optional[LiveOptions] = None, # type: ignore
|
|
39
|
-
sample_rate: int = 48000,
|
|
40
|
-
language: str = "en-US",
|
|
41
|
-
interim_results: bool = True,
|
|
42
|
-
client: Optional[DeepgramClient] = None,
|
|
43
|
-
):
|
|
44
|
-
"""
|
|
45
|
-
Initialize the Deepgram STT service.
|
|
46
|
-
|
|
47
|
-
Args:
|
|
48
|
-
api_key: Deepgram API key. If not provided, the DEEPGRAM_API_KEY
|
|
49
|
-
environment variable will be used automatically.
|
|
50
|
-
options: Deepgram live transcription options
|
|
51
|
-
sample_rate: Sample rate of the audio in Hz (default: 48000)
|
|
52
|
-
language: Language code for transcription
|
|
53
|
-
interim_results: Whether to emit interim results (partial transcripts with the partial_transcript event).
|
|
54
|
-
"""
|
|
55
|
-
super().__init__(sample_rate=sample_rate)
|
|
56
|
-
|
|
57
|
-
# If no API key was provided, check for DEEPGRAM_API_KEY in environment
|
|
58
|
-
if api_key is None:
|
|
59
|
-
api_key = os.environ.get("DEEPGRAM_API_KEY")
|
|
60
|
-
if not api_key:
|
|
61
|
-
logger.warning(
|
|
62
|
-
"No API key provided and DEEPGRAM_API_KEY environment variable not found."
|
|
63
|
-
)
|
|
64
|
-
|
|
65
|
-
# Initialize DeepgramClient with the API key
|
|
66
|
-
logger.info("Initializing Deepgram client")
|
|
67
|
-
config = DeepgramClientOptions(
|
|
68
|
-
options={"keepalive": "true"} # Comment this out to see the effect of not using keepalive
|
|
69
|
-
)
|
|
70
|
-
self.deepgram = client if client is not None else DeepgramClient(api_key, config)
|
|
71
|
-
self.dg_connection: Optional[Any] = None
|
|
72
|
-
self.options = options or LiveOptions(
|
|
73
|
-
model="nova-2",
|
|
74
|
-
language=language,
|
|
75
|
-
encoding="linear16",
|
|
76
|
-
sample_rate=sample_rate,
|
|
77
|
-
channels=1,
|
|
78
|
-
interim_results=interim_results,
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
# Track current user context for associating transcripts with users
|
|
82
|
-
self._current_user: Optional[Dict[str, Any]] = None
|
|
83
|
-
|
|
84
|
-
self._setup_connection()
|
|
85
|
-
|
|
86
|
-
def _handle_transcript_result(
|
|
87
|
-
self, is_final: bool, text: str, metadata: Dict[str, Any]
|
|
88
|
-
):
|
|
89
|
-
"""
|
|
90
|
-
Handle a transcript result by emitting it immediately.
|
|
91
|
-
"""
|
|
92
|
-
# Emit immediately for real-time responsiveness
|
|
93
|
-
if is_final:
|
|
94
|
-
self._emit_transcript_event(text, self._current_user, metadata)
|
|
95
|
-
else:
|
|
96
|
-
self._emit_partial_transcript_event(text, self._current_user, metadata)
|
|
97
|
-
|
|
98
|
-
logger.debug(
|
|
99
|
-
"Handled transcript result",
|
|
100
|
-
extra={
|
|
101
|
-
"is_final": is_final,
|
|
102
|
-
"text_length": len(text),
|
|
103
|
-
},
|
|
104
|
-
)
|
|
105
|
-
|
|
106
|
-
def _setup_connection(self):
|
|
107
|
-
"""Set up the Deepgram connection with event handlers."""
|
|
108
|
-
if self._is_closed:
|
|
109
|
-
logger.warning("Cannot setup connection - Deepgram instance is closed")
|
|
110
|
-
return
|
|
111
|
-
|
|
112
|
-
if self.dg_connection is not None:
|
|
113
|
-
logger.debug("Connection already set up, skipping initialization")
|
|
114
|
-
return
|
|
115
|
-
|
|
116
|
-
try:
|
|
117
|
-
# Use the newer websocket interface instead of deprecated live
|
|
118
|
-
logger.debug("Setting up Deepgram WebSocket connection")
|
|
119
|
-
self.dg_connection = self.deepgram.listen.websocket.v("1")
|
|
120
|
-
assert self.dg_connection is not None
|
|
121
|
-
|
|
122
|
-
# Handler for transcript results
|
|
123
|
-
def handle_transcript(conn, result=None):
|
|
124
|
-
try:
|
|
125
|
-
# Update the last activity time
|
|
126
|
-
self.last_activity_time = time.time()
|
|
127
|
-
|
|
128
|
-
# Check if result is already a dict (from LiveResultResponse or test mocks)
|
|
129
|
-
if isinstance(result, dict):
|
|
130
|
-
transcript = result
|
|
131
|
-
elif hasattr(result, "to_dict"):
|
|
132
|
-
transcript = result.to_dict()
|
|
133
|
-
elif hasattr(result, "to_json"):
|
|
134
|
-
transcript = json.loads(result.to_json())
|
|
135
|
-
elif isinstance(result, (str, bytes, bytearray)):
|
|
136
|
-
transcript = json.loads(result)
|
|
137
|
-
else:
|
|
138
|
-
logger.warning(
|
|
139
|
-
"Unrecognized transcript format: %s", type(result)
|
|
140
|
-
)
|
|
141
|
-
return
|
|
142
|
-
|
|
143
|
-
# Get the transcript text from the response
|
|
144
|
-
alternatives = transcript.get("channel", {}).get("alternatives", [])
|
|
145
|
-
if not alternatives:
|
|
146
|
-
return
|
|
147
|
-
|
|
148
|
-
transcript_text = alternatives[0].get("transcript", "")
|
|
149
|
-
if not transcript_text:
|
|
150
|
-
return
|
|
151
|
-
|
|
152
|
-
# Check if this is a final result
|
|
153
|
-
is_final = transcript.get("is_final", False)
|
|
154
|
-
|
|
155
|
-
# Create metadata with useful information
|
|
156
|
-
metadata = {
|
|
157
|
-
"confidence": alternatives[0].get("confidence", 0),
|
|
158
|
-
"words": alternatives[0].get("words", []),
|
|
159
|
-
"is_final": is_final,
|
|
160
|
-
"channel_index": transcript.get("channel_index", 0),
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
# Handle the result (both collect and emit)
|
|
164
|
-
self._handle_transcript_result(is_final, transcript_text, metadata)
|
|
165
|
-
|
|
166
|
-
logger.debug(
|
|
167
|
-
"Received transcript",
|
|
168
|
-
extra={
|
|
169
|
-
"is_final": is_final,
|
|
170
|
-
"text_length": len(transcript_text),
|
|
171
|
-
"confidence": metadata["confidence"],
|
|
172
|
-
},
|
|
173
|
-
)
|
|
174
|
-
except Exception as e:
|
|
175
|
-
logger.error("Error processing transcript", exc_info=e)
|
|
176
|
-
# Emit error immediately
|
|
177
|
-
self._emit_error_event(e, "Deepgram transcript processing")
|
|
178
|
-
|
|
179
|
-
# Handler for errors
|
|
180
|
-
def handle_error(conn, error=None):
|
|
181
|
-
# Update the last activity time
|
|
182
|
-
self.last_activity_time = time.time()
|
|
183
|
-
|
|
184
|
-
error_text = str(error) if error is not None else "Unknown error"
|
|
185
|
-
logger.error("Deepgram error received: %s", error_text)
|
|
186
|
-
|
|
187
|
-
# Emit error immediately
|
|
188
|
-
error_obj = Exception(f"Deepgram error: {error_text}")
|
|
189
|
-
self._emit_error_event(error_obj, "Deepgram connection")
|
|
190
|
-
|
|
191
|
-
# Register event handlers directly
|
|
192
|
-
self.dg_connection.on(LiveTranscriptionEvents.Transcript, handle_transcript)
|
|
193
|
-
self.dg_connection.on(LiveTranscriptionEvents.Error, handle_error)
|
|
194
|
-
|
|
195
|
-
# Start the connection
|
|
196
|
-
logger.info("Starting Deepgram connection with options %s", self.options)
|
|
197
|
-
self.dg_connection.start(self.options)
|
|
198
|
-
|
|
199
|
-
except Exception as e:
|
|
200
|
-
# Log the error and set connection to None
|
|
201
|
-
logger.error("Error setting up Deepgram connection", exc_info=e)
|
|
202
|
-
self.dg_connection = None
|
|
203
|
-
# Emit error immediately
|
|
204
|
-
self._emit_error_event(e, "Deepgram connection setup")
|
|
205
|
-
|
|
206
|
-
async def _process_audio_impl(
|
|
207
|
-
self, pcm_data: PcmData, user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None
|
|
208
|
-
) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
|
|
209
|
-
"""
|
|
210
|
-
Process audio data through Deepgram for transcription.
|
|
211
|
-
|
|
212
|
-
Args:
|
|
213
|
-
pcm_data: The PCM audio data to process.
|
|
214
|
-
user_metadata: Additional metadata about the user or session.
|
|
215
|
-
|
|
216
|
-
Returns:
|
|
217
|
-
None - Deepgram operates in asynchronous mode and emits events directly
|
|
218
|
-
when transcripts arrive from the streaming service.
|
|
219
|
-
"""
|
|
220
|
-
if self._is_closed:
|
|
221
|
-
logger.warning("Deepgram connection is closed, ignoring audio")
|
|
222
|
-
return None
|
|
223
|
-
|
|
224
|
-
# Store the current user context for transcript events
|
|
225
|
-
self._current_user = user_metadata # type: ignore[assignment]
|
|
226
|
-
|
|
227
|
-
# Check if the input sample rate matches the expected sample rate
|
|
228
|
-
if pcm_data.sample_rate != self.sample_rate:
|
|
229
|
-
logger.warning(
|
|
230
|
-
"Input audio sample rate (%s Hz) does not match the expected sample rate (%s Hz). "
|
|
231
|
-
"This may result in incorrect transcriptions. Consider resampling the audio.",
|
|
232
|
-
pcm_data.sample_rate,
|
|
233
|
-
self.sample_rate,
|
|
234
|
-
)
|
|
235
|
-
|
|
236
|
-
# Update the last activity time
|
|
237
|
-
self.last_activity_time = time.time()
|
|
238
|
-
|
|
239
|
-
# Convert PCM data to bytes if needed
|
|
240
|
-
audio_data = pcm_data.samples
|
|
241
|
-
if not isinstance(audio_data, bytes):
|
|
242
|
-
# Convert numpy array to bytes
|
|
243
|
-
audio_data = audio_data.astype(np.int16).tobytes()
|
|
244
|
-
|
|
245
|
-
# Send the audio data to Deepgram
|
|
246
|
-
try:
|
|
247
|
-
logger.debug(
|
|
248
|
-
"Sending audio data to Deepgram",
|
|
249
|
-
extra={"audio_bytes": len(audio_data)},
|
|
250
|
-
)
|
|
251
|
-
assert self.dg_connection is not None
|
|
252
|
-
self.dg_connection.send(audio_data)
|
|
253
|
-
except Exception as e:
|
|
254
|
-
# Raise exception to be handled by base class
|
|
255
|
-
raise Exception(f"Deepgram audio transmission error: {e}")
|
|
256
|
-
|
|
257
|
-
# Return None for asynchronous mode - events are emitted when they arrive
|
|
258
|
-
return None
|
|
259
|
-
|
|
260
|
-
async def close(self):
|
|
261
|
-
"""Close the Deepgram connection and clean up resources."""
|
|
262
|
-
if self._is_closed:
|
|
263
|
-
logger.debug("Deepgram STT service already closed")
|
|
264
|
-
return
|
|
265
|
-
|
|
266
|
-
logger.info("Closing Deepgram STT service")
|
|
267
|
-
self._is_closed = True
|
|
268
|
-
|
|
269
|
-
# Close the Deepgram connection if it exists
|
|
270
|
-
if self.dg_connection:
|
|
271
|
-
logger.debug("Closing Deepgram connection")
|
|
272
|
-
try:
|
|
273
|
-
self.dg_connection.finish()
|
|
274
|
-
self.dg_connection = None
|
|
275
|
-
except Exception as e:
|
|
276
|
-
logger.error("Error closing Deepgram connection", exc_info=e)
|
|
File without changes
|