vision-agents-plugins-deepgram 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vision-agents-plugins-deepgram might be problematic. Click here for more details.
- PKG-INFO +15 -11
- README.md +13 -9
- pyproject.toml +1 -1
- vision_agents/plugins/deepgram/stt.py +229 -156
- vision_agents/plugins/deepgram/utils.py +18 -0
- {vision_agents_plugins_deepgram-0.1.5.dist-info → vision_agents_plugins_deepgram-0.1.6.dist-info}/METADATA +15 -11
- vision_agents_plugins_deepgram-0.1.6.dist-info/RECORD +13 -0
- vision_agents_plugins_deepgram-0.1.5.dist-info/RECORD +0 -11
- {vision_agents_plugins_deepgram-0.1.5.dist-info → vision_agents_plugins_deepgram-0.1.6.dist-info}/WHEEL +0 -0
PKG-INFO
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vision-agents-plugins-deepgram
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Deepgram STT integration for Vision Agents
|
|
5
5
|
Project-URL: Documentation, https://visionagents.ai/
|
|
6
6
|
Project-URL: Website, https://visionagents.ai/
|
|
@@ -8,31 +8,32 @@ Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
|
8
8
|
License-Expression: MIT
|
|
9
9
|
Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
|
|
10
10
|
Requires-Python: >=3.10
|
|
11
|
-
Requires-Dist: deepgram-sdk
|
|
11
|
+
Requires-Dist: deepgram-sdk<5.1,>=5.0.0
|
|
12
12
|
Requires-Dist: numpy<2.3,>=2.2.6
|
|
13
13
|
Requires-Dist: vision-agents
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
|
|
16
16
|
# Deepgram Speech-to-Text Plugin
|
|
17
17
|
|
|
18
|
-
A high-quality Speech-to-Text (STT) plugin for
|
|
18
|
+
A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
|
|
19
19
|
|
|
20
20
|
## Installation
|
|
21
21
|
|
|
22
22
|
```bash
|
|
23
|
-
|
|
23
|
+
uv add vision-agents-plugins-deepgram
|
|
24
24
|
```
|
|
25
25
|
|
|
26
26
|
## Usage
|
|
27
27
|
|
|
28
28
|
```python
|
|
29
|
-
from
|
|
29
|
+
from vision_agents.plugins import deepgram
|
|
30
|
+
from getstream.video.rtc.track_util import PcmData
|
|
30
31
|
|
|
31
32
|
# Initialize with API key from environment variable
|
|
32
|
-
stt =
|
|
33
|
+
stt = deepgram.STT()
|
|
33
34
|
|
|
34
35
|
# Or specify API key directly
|
|
35
|
-
stt =
|
|
36
|
+
stt = deepgram.STT(api_key="your_deepgram_api_key")
|
|
36
37
|
|
|
37
38
|
# Register event handlers
|
|
38
39
|
@stt.on("transcript")
|
|
@@ -44,6 +45,7 @@ def on_partial(text, user, metadata):
|
|
|
44
45
|
print(f"Partial transcript from {user}: {text}")
|
|
45
46
|
|
|
46
47
|
# Process audio
|
|
48
|
+
pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
|
|
47
49
|
await stt.process_audio(pcm_data)
|
|
48
50
|
|
|
49
51
|
# When done
|
|
@@ -52,14 +54,16 @@ await stt.close()
|
|
|
52
54
|
|
|
53
55
|
## Configuration Options
|
|
54
56
|
|
|
55
|
-
- `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
|
|
56
|
-
- `options`: Deepgram
|
|
57
|
+
- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
|
|
58
|
+
- `options`: Deepgram options for configuring the transcription.
|
|
59
|
+
See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
|
|
57
60
|
- `sample_rate`: Sample rate of the audio in Hz (default: 48000)
|
|
58
61
|
- `language`: Language code for transcription (default: "en-US")
|
|
59
|
-
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default:
|
|
62
|
+
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
|
|
63
|
+
- `connection_timeout`: Time in seconds to wait for the Deepgram connection to be established; if it is not established in time, the audio packet is skipped (default: 15.0s)
|
|
60
64
|
|
|
61
65
|
## Requirements
|
|
62
66
|
|
|
63
67
|
- Python 3.10+
|
|
64
|
-
- deepgram-sdk>=
|
|
68
|
+
- deepgram-sdk>=5.0.0,<5.1
|
|
65
69
|
- numpy>=2.2.6,<2.3
|
README.md
CHANGED
|
@@ -1,23 +1,24 @@
|
|
|
1
1
|
# Deepgram Speech-to-Text Plugin
|
|
2
2
|
|
|
3
|
-
A high-quality Speech-to-Text (STT) plugin for
|
|
3
|
+
A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
|
|
4
4
|
|
|
5
5
|
## Installation
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
|
-
|
|
8
|
+
uv add vision-agents-plugins-deepgram
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
## Usage
|
|
12
12
|
|
|
13
13
|
```python
|
|
14
|
-
from
|
|
14
|
+
from vision_agents.plugins import deepgram
|
|
15
|
+
from getstream.video.rtc.track_util import PcmData
|
|
15
16
|
|
|
16
17
|
# Initialize with API key from environment variable
|
|
17
|
-
stt =
|
|
18
|
+
stt = deepgram.STT()
|
|
18
19
|
|
|
19
20
|
# Or specify API key directly
|
|
20
|
-
stt =
|
|
21
|
+
stt = deepgram.STT(api_key="your_deepgram_api_key")
|
|
21
22
|
|
|
22
23
|
# Register event handlers
|
|
23
24
|
@stt.on("transcript")
|
|
@@ -29,6 +30,7 @@ def on_partial(text, user, metadata):
|
|
|
29
30
|
print(f"Partial transcript from {user}: {text}")
|
|
30
31
|
|
|
31
32
|
# Process audio
|
|
33
|
+
pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
|
|
32
34
|
await stt.process_audio(pcm_data)
|
|
33
35
|
|
|
34
36
|
# When done
|
|
@@ -37,14 +39,16 @@ await stt.close()
|
|
|
37
39
|
|
|
38
40
|
## Configuration Options
|
|
39
41
|
|
|
40
|
-
- `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
|
|
41
|
-
- `options`: Deepgram
|
|
42
|
+
- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
|
|
43
|
+
- `options`: Deepgram options for configuring the transcription.
|
|
44
|
+
See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
|
|
42
45
|
- `sample_rate`: Sample rate of the audio in Hz (default: 48000)
|
|
43
46
|
- `language`: Language code for transcription (default: "en-US")
|
|
44
|
-
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default:
|
|
47
|
+
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
|
|
48
|
+
- `connection_timeout`: Time in seconds to wait for the Deepgram connection to be established; if it is not established in time, the audio packet is skipped (default: 15.0s)
|
|
45
49
|
|
|
46
50
|
## Requirements
|
|
47
51
|
|
|
48
52
|
- Python 3.10+
|
|
49
|
-
- deepgram-sdk>=
|
|
53
|
+
- deepgram-sdk>=5.0.0,<5.1
|
|
50
54
|
- numpy>=2.2.6,<2.3
|
pyproject.toml
CHANGED
|
@@ -1,17 +1,31 @@
|
|
|
1
|
-
import
|
|
1
|
+
import asyncio
|
|
2
|
+
import contextlib
|
|
2
3
|
import logging
|
|
3
|
-
from typing import Dict, Any, Optional, Tuple, List, Union, TYPE_CHECKING
|
|
4
|
-
|
|
5
|
-
if TYPE_CHECKING:
|
|
6
|
-
from vision_agents.core.edge.types import Participant
|
|
7
|
-
import numpy as np
|
|
8
4
|
import os
|
|
9
5
|
import time
|
|
6
|
+
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
|
10
7
|
|
|
11
|
-
|
|
12
|
-
|
|
8
|
+
import numpy as np
|
|
9
|
+
import websockets
|
|
10
|
+
from deepgram import AsyncDeepgramClient
|
|
11
|
+
from deepgram.core.events import EventType
|
|
12
|
+
from deepgram.extensions.types.sockets import (
|
|
13
|
+
ListenV1ControlMessage,
|
|
14
|
+
ListenV1MetadataEvent,
|
|
15
|
+
ListenV1ResultsEvent,
|
|
16
|
+
ListenV1SpeechStartedEvent,
|
|
17
|
+
ListenV1UtteranceEndEvent,
|
|
18
|
+
)
|
|
19
|
+
from deepgram.listen.v1.socket_client import AsyncV1SocketClient
|
|
13
20
|
from getstream.video.rtc.track_util import PcmData
|
|
14
21
|
|
|
22
|
+
from vision_agents.core import stt
|
|
23
|
+
|
|
24
|
+
from .utils import generate_silence
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
from vision_agents.core.edge.types import Participant
|
|
28
|
+
|
|
15
29
|
logger = logging.getLogger(__name__)
|
|
16
30
|
|
|
17
31
|
|
|
@@ -35,11 +49,13 @@ class STT(stt.STT):
|
|
|
35
49
|
def __init__(
|
|
36
50
|
self,
|
|
37
51
|
api_key: Optional[str] = None,
|
|
38
|
-
options: Optional[
|
|
52
|
+
options: Optional[dict] = None,
|
|
39
53
|
sample_rate: int = 48000,
|
|
40
54
|
language: str = "en-US",
|
|
41
55
|
interim_results: bool = True,
|
|
42
|
-
client: Optional[
|
|
56
|
+
client: Optional[AsyncDeepgramClient] = None,
|
|
57
|
+
keep_alive_interval: float = 1.0,
|
|
58
|
+
connection_timeout: float = 15.0,
|
|
43
59
|
):
|
|
44
60
|
"""
|
|
45
61
|
Initialize the Deepgram STT service.
|
|
@@ -51,6 +67,8 @@ class STT(stt.STT):
|
|
|
51
67
|
sample_rate: Sample rate of the audio in Hz (default: 48000)
|
|
52
68
|
language: Language code for transcription
|
|
53
69
|
interim_results: Whether to emit interim results (partial transcripts with the partial_transcript event).
|
|
70
|
+
connection_timeout: Time to wait for the Deepgram connection to be established.
|
|
71
|
+
|
|
54
72
|
"""
|
|
55
73
|
super().__init__(sample_rate=sample_rate)
|
|
56
74
|
|
|
@@ -64,147 +82,189 @@ class STT(stt.STT):
|
|
|
64
82
|
|
|
65
83
|
# Initialize DeepgramClient with the API key
|
|
66
84
|
logger.info("Initializing Deepgram client")
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
)
|
|
70
|
-
self.deepgram = client if client is not None else DeepgramClient(api_key, config)
|
|
71
|
-
self.dg_connection: Optional[Any] = None
|
|
72
|
-
self.options = options or LiveOptions(
|
|
73
|
-
model="nova-2",
|
|
74
|
-
language=language,
|
|
75
|
-
encoding="linear16",
|
|
76
|
-
sample_rate=sample_rate,
|
|
77
|
-
channels=1,
|
|
78
|
-
interim_results=interim_results,
|
|
85
|
+
self.deepgram = (
|
|
86
|
+
client if client is not None else AsyncDeepgramClient(api_key=api_key)
|
|
79
87
|
)
|
|
88
|
+
self.dg_connection: Optional[AsyncV1SocketClient] = None
|
|
89
|
+
|
|
90
|
+
self.options = options or {
|
|
91
|
+
"model": "nova-2",
|
|
92
|
+
"language": language,
|
|
93
|
+
"encoding": "linear16",
|
|
94
|
+
"sample_rate": sample_rate,
|
|
95
|
+
"channels": 1,
|
|
96
|
+
"interim_results": interim_results,
|
|
97
|
+
}
|
|
80
98
|
|
|
81
99
|
# Track current user context for associating transcripts with users
|
|
82
100
|
self._current_user: Optional[Dict[str, Any]] = None
|
|
83
101
|
|
|
84
|
-
|
|
102
|
+
# Generate a silence audio to use as keep-alive message
|
|
103
|
+
self._keep_alive_data = generate_silence(
|
|
104
|
+
sample_rate=sample_rate, duration_ms=10
|
|
105
|
+
)
|
|
106
|
+
self._keep_alive_interval = keep_alive_interval
|
|
107
|
+
|
|
108
|
+
self._stack = contextlib.AsyncExitStack()
|
|
109
|
+
# An event to detect that the connection was established once.
|
|
110
|
+
self._connected_once = asyncio.Event()
|
|
111
|
+
# Time to wait for connection to be established before sending the event
|
|
112
|
+
self._connection_timeout = connection_timeout
|
|
113
|
+
self._last_sent_at = float("-inf")
|
|
114
|
+
# Lock to prevent concurrent connection opening
|
|
115
|
+
self._connect_lock = asyncio.Lock()
|
|
85
116
|
|
|
86
|
-
|
|
87
|
-
self
|
|
88
|
-
|
|
117
|
+
# Start the listener loop in the background
|
|
118
|
+
asyncio.create_task(self.start())
|
|
119
|
+
|
|
120
|
+
async def start(self):
|
|
89
121
|
"""
|
|
90
|
-
|
|
122
|
+
Start the main task establishing the Deepgram connection and processing the events.
|
|
91
123
|
"""
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
else:
|
|
96
|
-
self._emit_partial_transcript_event(text, self._current_user, metadata)
|
|
124
|
+
if self._is_closed:
|
|
125
|
+
logger.warning("Cannot setup connection - Deepgram instance is closed")
|
|
126
|
+
return None
|
|
97
127
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
"
|
|
103
|
-
|
|
128
|
+
# Establish a Deepgram connection.
|
|
129
|
+
# Use a lock to make sure it's established only once
|
|
130
|
+
async with self._connect_lock:
|
|
131
|
+
if self.dg_connection is not None:
|
|
132
|
+
logger.debug("Connection already set up, skipping initialization")
|
|
133
|
+
return None
|
|
134
|
+
|
|
135
|
+
try:
|
|
136
|
+
logger.info("Creating a Deepgram connection with options %s", self.options)
|
|
137
|
+
dg_connection = await self._stack.enter_async_context(
|
|
138
|
+
self.deepgram.listen.v1.connect(**self.options)
|
|
139
|
+
)
|
|
140
|
+
except Exception as e:
|
|
141
|
+
# Log the error and set connection to None
|
|
142
|
+
logger.exception("Error setting up Deepgram connection")
|
|
143
|
+
self.dg_connection = None
|
|
144
|
+
# Emit error immediately
|
|
145
|
+
self._emit_error_event(e, "Deepgram connection setup")
|
|
146
|
+
raise
|
|
147
|
+
finally:
|
|
148
|
+
self._connected_once.set()
|
|
149
|
+
|
|
150
|
+
self.dg_connection = dg_connection
|
|
151
|
+
# Start the keep-alive loop to keep the connection open
|
|
152
|
+
asyncio.create_task(self._keepalive_loop())
|
|
153
|
+
|
|
154
|
+
# Register event handlers
|
|
155
|
+
self.dg_connection.on(
|
|
156
|
+
EventType.OPEN,
|
|
157
|
+
lambda msg: logger.debug(f"Deepgram connection opened. message={msg}"),
|
|
104
158
|
)
|
|
159
|
+
self.dg_connection.on(EventType.CLOSE, self._on_connection_close)
|
|
160
|
+
self.dg_connection.on(EventType.ERROR, self._on_connection_error)
|
|
161
|
+
self.dg_connection.on(EventType.MESSAGE, self._on_message)
|
|
162
|
+
|
|
163
|
+
# Start processing the events from Deepgram.
|
|
164
|
+
# This is a blocking call.
|
|
165
|
+
logger.debug("Listening to the events from a Deepgram connection")
|
|
166
|
+
await self.dg_connection.start_listening()
|
|
167
|
+
return None
|
|
105
168
|
|
|
106
|
-
def
|
|
107
|
-
"""
|
|
169
|
+
async def started(self):
|
|
170
|
+
"""
|
|
171
|
+
Wait until the Deepgram connection is established.
|
|
172
|
+
"""
|
|
173
|
+
if self._connected_once.is_set():
|
|
174
|
+
return
|
|
175
|
+
|
|
176
|
+
await asyncio.wait_for(
|
|
177
|
+
self._connected_once.wait(), timeout=self._connection_timeout
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
async def close(self):
|
|
181
|
+
"""Close the Deepgram connection and clean up resources."""
|
|
108
182
|
if self._is_closed:
|
|
109
|
-
logger.
|
|
183
|
+
logger.debug("Deepgram STT service already closed")
|
|
110
184
|
return
|
|
111
185
|
|
|
112
|
-
|
|
113
|
-
|
|
186
|
+
logger.info("Closing Deepgram STT service")
|
|
187
|
+
self._is_closed = True
|
|
188
|
+
|
|
189
|
+
# Close the Deepgram connection if it exists
|
|
190
|
+
if self.dg_connection:
|
|
191
|
+
logger.debug("Closing Deepgram connection")
|
|
192
|
+
try:
|
|
193
|
+
await self.dg_connection.send_control(
|
|
194
|
+
ListenV1ControlMessage(type="CloseStream")
|
|
195
|
+
)
|
|
196
|
+
await self._stack.aclose()
|
|
197
|
+
self.dg_connection = None
|
|
198
|
+
except Exception:
|
|
199
|
+
logger.exception("Error closing Deepgram connection")
|
|
200
|
+
|
|
201
|
+
async def _on_message(
|
|
202
|
+
self,
|
|
203
|
+
message: ListenV1ResultsEvent
|
|
204
|
+
| ListenV1MetadataEvent
|
|
205
|
+
| ListenV1UtteranceEndEvent
|
|
206
|
+
| ListenV1SpeechStartedEvent,
|
|
207
|
+
):
|
|
208
|
+
if message.type != "Results":
|
|
209
|
+
logger.debug(
|
|
210
|
+
"Received non-transcript message, skip processing. message=%s", message
|
|
211
|
+
)
|
|
114
212
|
return
|
|
115
213
|
|
|
116
|
-
|
|
117
|
-
# Use the newer websocket interface instead of deprecated live
|
|
118
|
-
logger.debug("Setting up Deepgram WebSocket connection")
|
|
119
|
-
self.dg_connection = self.deepgram.listen.websocket.v("1")
|
|
120
|
-
assert self.dg_connection is not None
|
|
121
|
-
|
|
122
|
-
# Handler for transcript results
|
|
123
|
-
def handle_transcript(conn, result=None):
|
|
124
|
-
try:
|
|
125
|
-
# Update the last activity time
|
|
126
|
-
self.last_activity_time = time.time()
|
|
127
|
-
|
|
128
|
-
# Check if result is already a dict (from LiveResultResponse or test mocks)
|
|
129
|
-
if isinstance(result, dict):
|
|
130
|
-
transcript = result
|
|
131
|
-
elif hasattr(result, "to_dict"):
|
|
132
|
-
transcript = result.to_dict()
|
|
133
|
-
elif hasattr(result, "to_json"):
|
|
134
|
-
transcript = json.loads(result.to_json())
|
|
135
|
-
elif isinstance(result, (str, bytes, bytearray)):
|
|
136
|
-
transcript = json.loads(result)
|
|
137
|
-
else:
|
|
138
|
-
logger.warning(
|
|
139
|
-
"Unrecognized transcript format: %s", type(result)
|
|
140
|
-
)
|
|
141
|
-
return
|
|
142
|
-
|
|
143
|
-
# Get the transcript text from the response
|
|
144
|
-
alternatives = transcript.get("channel", {}).get("alternatives", [])
|
|
145
|
-
if not alternatives:
|
|
146
|
-
return
|
|
147
|
-
|
|
148
|
-
transcript_text = alternatives[0].get("transcript", "")
|
|
149
|
-
if not transcript_text:
|
|
150
|
-
return
|
|
151
|
-
|
|
152
|
-
# Check if this is a final result
|
|
153
|
-
is_final = transcript.get("is_final", False)
|
|
154
|
-
|
|
155
|
-
# Create metadata with useful information
|
|
156
|
-
metadata = {
|
|
157
|
-
"confidence": alternatives[0].get("confidence", 0),
|
|
158
|
-
"words": alternatives[0].get("words", []),
|
|
159
|
-
"is_final": is_final,
|
|
160
|
-
"channel_index": transcript.get("channel_index", 0),
|
|
161
|
-
}
|
|
162
|
-
|
|
163
|
-
# Handle the result (both collect and emit)
|
|
164
|
-
self._handle_transcript_result(is_final, transcript_text, metadata)
|
|
165
|
-
|
|
166
|
-
logger.debug(
|
|
167
|
-
"Received transcript",
|
|
168
|
-
extra={
|
|
169
|
-
"is_final": is_final,
|
|
170
|
-
"text_length": len(transcript_text),
|
|
171
|
-
"confidence": metadata["confidence"],
|
|
172
|
-
},
|
|
173
|
-
)
|
|
174
|
-
except Exception as e:
|
|
175
|
-
logger.error("Error processing transcript", exc_info=e)
|
|
176
|
-
# Emit error immediately
|
|
177
|
-
self._emit_error_event(e, "Deepgram transcript processing")
|
|
178
|
-
|
|
179
|
-
# Handler for errors
|
|
180
|
-
def handle_error(conn, error=None):
|
|
181
|
-
# Update the last activity time
|
|
182
|
-
self.last_activity_time = time.time()
|
|
183
|
-
|
|
184
|
-
error_text = str(error) if error is not None else "Unknown error"
|
|
185
|
-
logger.error("Deepgram error received: %s", error_text)
|
|
214
|
+
transcript = message.dict()
|
|
186
215
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
216
|
+
# Get the transcript text from the response
|
|
217
|
+
alternatives = transcript.get("channel", {}).get("alternatives", [])
|
|
218
|
+
if not alternatives:
|
|
219
|
+
return
|
|
190
220
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
221
|
+
transcript_text = alternatives[0].get("transcript", "")
|
|
222
|
+
if not transcript_text:
|
|
223
|
+
return
|
|
194
224
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
self.dg_connection.start(self.options)
|
|
225
|
+
# Check if this is a final result
|
|
226
|
+
is_final = transcript.get("is_final", False)
|
|
198
227
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
228
|
+
# Create metadata with useful information
|
|
229
|
+
metadata = {
|
|
230
|
+
"confidence": alternatives[0].get("confidence", 0),
|
|
231
|
+
"words": alternatives[0].get("words", []),
|
|
232
|
+
"is_final": is_final,
|
|
233
|
+
"channel_index": transcript.get("channel_index", 0),
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
# Emit immediately for real-time responsiveness
|
|
237
|
+
if is_final:
|
|
238
|
+
self._emit_transcript_event(transcript_text, self._current_user, metadata)
|
|
239
|
+
else:
|
|
240
|
+
self._emit_partial_transcript_event(
|
|
241
|
+
transcript_text, self._current_user, metadata
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
logger.debug(
|
|
245
|
+
"Received transcript",
|
|
246
|
+
extra={
|
|
247
|
+
"is_final": is_final,
|
|
248
|
+
"text_length": len(transcript_text),
|
|
249
|
+
"confidence": metadata["confidence"],
|
|
250
|
+
},
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
async def _on_connection_error(self, error: websockets.WebSocketException):
|
|
254
|
+
error_text = str(error) if error is not None else "Unknown error"
|
|
255
|
+
logger.error("Deepgram error received: %s", error_text)
|
|
256
|
+
# Emit error immediately
|
|
257
|
+
error_obj = Exception(f"Deepgram error: {error_text}")
|
|
258
|
+
self._emit_error_event(error_obj, "Deepgram connection")
|
|
259
|
+
|
|
260
|
+
async def _on_connection_close(self, message: Any):
|
|
261
|
+
logger.warning(f"Deepgram connection closed. message={message}")
|
|
262
|
+
await self.close()
|
|
205
263
|
|
|
206
264
|
async def _process_audio_impl(
|
|
207
|
-
self,
|
|
265
|
+
self,
|
|
266
|
+
pcm_data: PcmData,
|
|
267
|
+
user_metadata: Optional[Union[Dict[str, Any], "Participant"]] = None,
|
|
208
268
|
) -> Optional[List[Tuple[bool, str, Dict[str, Any]]]]:
|
|
209
269
|
"""
|
|
210
270
|
Process audio data through Deepgram for transcription.
|
|
@@ -233,44 +293,57 @@ class STT(stt.STT):
|
|
|
233
293
|
self.sample_rate,
|
|
234
294
|
)
|
|
235
295
|
|
|
236
|
-
# Update the last activity time
|
|
237
|
-
self.last_activity_time = time.time()
|
|
238
|
-
|
|
239
296
|
# Convert PCM data to bytes if needed
|
|
240
297
|
audio_data = pcm_data.samples
|
|
241
298
|
if not isinstance(audio_data, bytes):
|
|
242
299
|
# Convert numpy array to bytes
|
|
243
300
|
audio_data = audio_data.astype(np.int16).tobytes()
|
|
244
301
|
|
|
245
|
-
#
|
|
302
|
+
# Wait for the attempt to establish the connection
|
|
246
303
|
try:
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
304
|
+
await self.started()
|
|
305
|
+
except asyncio.TimeoutError:
|
|
306
|
+
logger.error(
|
|
307
|
+
f"Deepgram connection is not established within {self._connection_timeout} seconds. "
|
|
308
|
+
f"Skipping the audio package."
|
|
250
309
|
)
|
|
251
|
-
|
|
252
|
-
self.dg_connection.send(audio_data)
|
|
253
|
-
except Exception as e:
|
|
254
|
-
# Raise exception to be handled by base class
|
|
255
|
-
raise Exception(f"Deepgram audio transmission error: {e}")
|
|
310
|
+
return None
|
|
256
311
|
|
|
257
|
-
#
|
|
312
|
+
# Send the audio data to Deepgram
|
|
313
|
+
logger.debug(
|
|
314
|
+
"Sending audio data to Deepgram",
|
|
315
|
+
extra={"audio_bytes": len(audio_data)},
|
|
316
|
+
)
|
|
317
|
+
await self._send_audio(audio_data)
|
|
258
318
|
return None
|
|
259
319
|
|
|
260
|
-
async def
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
logger.debug("Deepgram STT service already closed")
|
|
320
|
+
async def _send_audio(self, data: bytes):
|
|
321
|
+
if self.dg_connection is None:
|
|
322
|
+
logger.warning("Deepgram connection is not established")
|
|
264
323
|
return
|
|
265
324
|
|
|
266
|
-
|
|
267
|
-
|
|
325
|
+
try:
|
|
326
|
+
await self.dg_connection.send_media(data)
|
|
327
|
+
self._last_sent_at = time.time()
|
|
328
|
+
except Exception as e:
|
|
329
|
+
# Raise exception to be handled by base class
|
|
330
|
+
raise Exception(f"Deepgram audio transmission error: {e}") from e
|
|
268
331
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
logger.
|
|
332
|
+
async def _keepalive_loop(self):
|
|
333
|
+
"""
|
|
334
|
+
Send the silence audio every `interval` seconds
|
|
335
|
+
to prevent Deepgram from closing the connection.
|
|
336
|
+
"""
|
|
337
|
+
while not self._is_closed and self.dg_connection is not None:
|
|
338
|
+
if self._last_sent_at + self._keep_alive_interval <= time.time():
|
|
339
|
+
logger.debug("Sending keepalive packet to Deepgram...")
|
|
340
|
+
# Send audio silence to keep the connection open
|
|
341
|
+
await self._send_audio(self._keep_alive_data)
|
|
342
|
+
# Send keep-alive message as well
|
|
343
|
+
await self.dg_connection.send_control(
|
|
344
|
+
ListenV1ControlMessage(type="KeepAlive")
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
# Sleep max for 1s to avoid missing the keep-alive schedule
|
|
348
|
+
timeout = min(self._keep_alive_interval, 1.0)
|
|
349
|
+
await asyncio.sleep(timeout)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def generate_silence(sample_rate: int, duration_ms: int) -> bytes:
|
|
5
|
+
"""
|
|
6
|
+
Generate a silence of the given sample_rate and duration_ms.
|
|
7
|
+
"""
|
|
8
|
+
# Audio parameters
|
|
9
|
+
channels = 1
|
|
10
|
+
sample_format = np.int16 # 16-bit signed PCM
|
|
11
|
+
|
|
12
|
+
# Number of samples = sample_rate * duration_seconds
|
|
13
|
+
num_samples = int(sample_rate * (duration_ms / 1000.0))
|
|
14
|
+
|
|
15
|
+
# Create silence raw bytes (s16 mono PCM)
|
|
16
|
+
pcm_bytes = np.zeros((num_samples, channels), dtype=sample_format).tobytes()
|
|
17
|
+
return pcm_bytes
|
|
18
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: vision-agents-plugins-deepgram
|
|
3
|
-
Version: 0.1.
|
|
3
|
+
Version: 0.1.6
|
|
4
4
|
Summary: Deepgram STT integration for Vision Agents
|
|
5
5
|
Project-URL: Documentation, https://visionagents.ai/
|
|
6
6
|
Project-URL: Website, https://visionagents.ai/
|
|
@@ -8,31 +8,32 @@ Project-URL: Source, https://github.com/GetStream/Vision-Agents
|
|
|
8
8
|
License-Expression: MIT
|
|
9
9
|
Keywords: AI,STT,agents,deepgram,speech-to-text,transcription,voice agents
|
|
10
10
|
Requires-Python: >=3.10
|
|
11
|
-
Requires-Dist: deepgram-sdk
|
|
11
|
+
Requires-Dist: deepgram-sdk<5.1,>=5.0.0
|
|
12
12
|
Requires-Dist: numpy<2.3,>=2.2.6
|
|
13
13
|
Requires-Dist: vision-agents
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
|
|
16
16
|
# Deepgram Speech-to-Text Plugin
|
|
17
17
|
|
|
18
|
-
A high-quality Speech-to-Text (STT) plugin for
|
|
18
|
+
A high-quality Speech-to-Text (STT) plugin for Vision agents that uses the Deepgram API.
|
|
19
19
|
|
|
20
20
|
## Installation
|
|
21
21
|
|
|
22
22
|
```bash
|
|
23
|
-
|
|
23
|
+
uv add vision-agents-plugins-deepgram
|
|
24
24
|
```
|
|
25
25
|
|
|
26
26
|
## Usage
|
|
27
27
|
|
|
28
28
|
```python
|
|
29
|
-
from
|
|
29
|
+
from vision_agents.plugins import deepgram
|
|
30
|
+
from getstream.video.rtc.track_util import PcmData
|
|
30
31
|
|
|
31
32
|
# Initialize with API key from environment variable
|
|
32
|
-
stt =
|
|
33
|
+
stt = deepgram.STT()
|
|
33
34
|
|
|
34
35
|
# Or specify API key directly
|
|
35
|
-
stt =
|
|
36
|
+
stt = deepgram.STT(api_key="your_deepgram_api_key")
|
|
36
37
|
|
|
37
38
|
# Register event handlers
|
|
38
39
|
@stt.on("transcript")
|
|
@@ -44,6 +45,7 @@ def on_partial(text, user, metadata):
|
|
|
44
45
|
print(f"Partial transcript from {user}: {text}")
|
|
45
46
|
|
|
46
47
|
# Process audio
|
|
48
|
+
pcm_data = PcmData(samples=b"\x00\x00" * 1000, sample_rate=48000, format="s16")
|
|
47
49
|
await stt.process_audio(pcm_data)
|
|
48
50
|
|
|
49
51
|
# When done
|
|
@@ -52,14 +54,16 @@ await stt.close()
|
|
|
52
54
|
|
|
53
55
|
## Configuration Options
|
|
54
56
|
|
|
55
|
-
- `api_key`: Deepgram API key (default: reads from DEEPGRAM_API_KEY environment variable)
|
|
56
|
-
- `options`: Deepgram
|
|
57
|
+
- `api_key`: Deepgram API key (default: reads from `DEEPGRAM_API_KEY` environment variable)
|
|
58
|
+
- `options`: Deepgram options for configuring the transcription.
|
|
59
|
+
See the [Deepgram Listen V1 Connect API documentation](https://github.com/deepgram/deepgram-python-sdk/blob/main/websockets-reference.md#%EF%B8%8F-parameters) for more details.
|
|
57
60
|
- `sample_rate`: Sample rate of the audio in Hz (default: 48000)
|
|
58
61
|
- `language`: Language code for transcription (default: "en-US")
|
|
59
|
-
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default:
|
|
62
|
+
- `keep_alive_interval`: Interval in seconds to send keep-alive messages (default: 1.0s)
|
|
63
|
+
- `connection_timeout`: Time in seconds to wait for the Deepgram connection to be established; if it is not established in time, the audio packet is skipped (default: 15.0s)
|
|
60
64
|
|
|
61
65
|
## Requirements
|
|
62
66
|
|
|
63
67
|
- Python 3.10+
|
|
64
|
-
- deepgram-sdk>=
|
|
68
|
+
- deepgram-sdk>=5.0.0,<5.1
|
|
65
69
|
- numpy>=2.2.6,<2.3
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
./.gitignore,sha256=S6wPCu4rBDB_yyTYoXbMIR-pn4OPv6b3Ulnx1n5RWvo,916
|
|
2
|
+
./PKG-INFO,sha256=Dk3w-R0OZAg3vtpBAH04f7XZLFDGVbGHopiZUjngTiQ,2273
|
|
3
|
+
./README.md,sha256=CX3wmR5ztY0crI5VSmBt2K0vBVjFvEhBr-SNuycL1Uc,1717
|
|
4
|
+
./pyproject.toml,sha256=W6nptgCD5B-Nmob7_af6knTNrXDRWAT-BPaGKzHVXHY,1102
|
|
5
|
+
./vision_agents/plugins/deepgram/__init__.py,sha256=iBBsZvcyd4KfkcUHsi1QiVVQnPEKvAweGZ40eHeENs4,159
|
|
6
|
+
./vision_agents/plugins/deepgram/stt.py,sha256=I2eNU_O_xAX5rDJufm-ooVvF4kYxOrPh0_F2i8diYWY,13124
|
|
7
|
+
./vision_agents/plugins/deepgram/utils.py,sha256=7xcGxnhcuVpqHIp1F_d1ARTq6y0jQGZsPx_2hwBifZ0,527
|
|
8
|
+
vision_agents/plugins/deepgram/__init__.py,sha256=iBBsZvcyd4KfkcUHsi1QiVVQnPEKvAweGZ40eHeENs4,159
|
|
9
|
+
vision_agents/plugins/deepgram/stt.py,sha256=I2eNU_O_xAX5rDJufm-ooVvF4kYxOrPh0_F2i8diYWY,13124
|
|
10
|
+
vision_agents/plugins/deepgram/utils.py,sha256=7xcGxnhcuVpqHIp1F_d1ARTq6y0jQGZsPx_2hwBifZ0,527
|
|
11
|
+
vision_agents_plugins_deepgram-0.1.6.dist-info/METADATA,sha256=Dk3w-R0OZAg3vtpBAH04f7XZLFDGVbGHopiZUjngTiQ,2273
|
|
12
|
+
vision_agents_plugins_deepgram-0.1.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
13
|
+
vision_agents_plugins_deepgram-0.1.6.dist-info/RECORD,,
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
./.gitignore,sha256=S6wPCu4rBDB_yyTYoXbMIR-pn4OPv6b3Ulnx1n5RWvo,916
|
|
2
|
-
./PKG-INFO,sha256=KWHYHyxCwhi8_0YAo0-QYJcLARlbvrck6g0WsmeFtlQ,1793
|
|
3
|
-
./README.md,sha256=RQMD14Xdhof5KIHFkJe0GK4lomoyijCDiBpdt9RG5Bk,1242
|
|
4
|
-
./pyproject.toml,sha256=lWrmuNRybdSuN1cKoRDwW20J4gQ-FPnrSs0AUt3z5Dk,1097
|
|
5
|
-
./vision_agents/plugins/deepgram/__init__.py,sha256=iBBsZvcyd4KfkcUHsi1QiVVQnPEKvAweGZ40eHeENs4,159
|
|
6
|
-
./vision_agents/plugins/deepgram/stt.py,sha256=jMMIAG8NkBB5CkH-MmJX1KwlUTbmapOcdDBiS4jddCI,11151
|
|
7
|
-
vision_agents/plugins/deepgram/__init__.py,sha256=iBBsZvcyd4KfkcUHsi1QiVVQnPEKvAweGZ40eHeENs4,159
|
|
8
|
-
vision_agents/plugins/deepgram/stt.py,sha256=jMMIAG8NkBB5CkH-MmJX1KwlUTbmapOcdDBiS4jddCI,11151
|
|
9
|
-
vision_agents_plugins_deepgram-0.1.5.dist-info/METADATA,sha256=KWHYHyxCwhi8_0YAo0-QYJcLARlbvrck6g0WsmeFtlQ,1793
|
|
10
|
-
vision_agents_plugins_deepgram-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
11
|
-
vision_agents_plugins_deepgram-0.1.5.dist-info/RECORD,,
|
|
File without changes
|