videosdk-plugins-deepgram 0.0.42__tar.gz → 0.0.44__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-deepgram might be problematic.
- {videosdk_plugins_deepgram-0.0.42 → videosdk_plugins_deepgram-0.0.44}/PKG-INFO +2 -2
- {videosdk_plugins_deepgram-0.0.42 → videosdk_plugins_deepgram-0.0.44}/pyproject.toml +1 -1
- videosdk_plugins_deepgram-0.0.44/videosdk/plugins/deepgram/__init__.py +4 -0
- videosdk_plugins_deepgram-0.0.44/videosdk/plugins/deepgram/stt_v2.py +280 -0
- videosdk_plugins_deepgram-0.0.44/videosdk/plugins/deepgram/version.py +1 -0
- videosdk_plugins_deepgram-0.0.42/videosdk/plugins/deepgram/__init__.py +0 -3
- videosdk_plugins_deepgram-0.0.42/videosdk/plugins/deepgram/version.py +0 -1
- {videosdk_plugins_deepgram-0.0.42 → videosdk_plugins_deepgram-0.0.44}/.gitignore +0 -0
- {videosdk_plugins_deepgram-0.0.42 → videosdk_plugins_deepgram-0.0.44}/README.md +0 -0
- {videosdk_plugins_deepgram-0.0.42 → videosdk_plugins_deepgram-0.0.44}/videosdk/plugins/deepgram/stt.py +0 -0
- {videosdk_plugins_deepgram-0.0.42 → videosdk_plugins_deepgram-0.0.44}/videosdk/plugins/deepgram/tts.py +0 -0
```diff
--- videosdk_plugins_deepgram-0.0.42/PKG-INFO
+++ videosdk_plugins_deepgram-0.0.44/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-deepgram
-Version: 0.0.42
+Version: 0.0.44
 Summary: VideoSDK Agent Framework plugin for Deepgram
 Author: videosdk
 License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
-Requires-Dist: videosdk-agents>=0.0.42
+Requires-Dist: videosdk-agents>=0.0.44
 Description-Content-Type: text/markdown
 
 # VideoSDK Deepgram Plugin
```
```diff
--- videosdk_plugins_deepgram-0.0.42/pyproject.toml
+++ videosdk_plugins_deepgram-0.0.44/pyproject.toml
@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Multimedia :: Video",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
-dependencies = ["videosdk-agents>=0.0.42"]
+dependencies = ["videosdk-agents>=0.0.44"]
 
 [tool.hatch.version]
 path = "videosdk/plugins/deepgram/version.py"
```
```diff
--- /dev/null
+++ videosdk_plugins_deepgram-0.0.44/videosdk/plugins/deepgram/stt_v2.py
@@ -0,0 +1,280 @@
+from __future__ import annotations
+
+import asyncio
+import json
+import numpy as np
+from typing import Any, Optional
+import os
+from urllib.parse import urlencode
+import aiohttp
+from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
+import logging
+
+try:
+    from scipy import signal
+    SCIPY_AVAILABLE = True
+except ImportError:
+    SCIPY_AVAILABLE = False
+logger = logging.getLogger(__name__)
+
+
+class DeepgramSTTV2(BaseSTT):
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        model: str = "flux-general-en",
+        input_sample_rate: int = 48000,
+        target_sample_rate: int = 16000,
+        eager_eot_threshold: float = 0.6,
+        eot_threshold: float = 0.8,
+        eot_timeout_ms: int = 7000,
+        base_url: str = "wss://api.deepgram.com/v2/listen",
+    ) -> None:
+        """Initialize the Deepgram STT plugin
+
+        Args:
+            api_key (str | None, optional): Deepgram API key. Uses DEEPGRAM_API_KEY environment variable if not provided. Defaults to None.
+            model (str): The model to use for the STT plugin. Defaults to "flux-general-en".
+            input_sample_rate (int): The input sample rate to use for the STT plugin. Defaults to 48000.
+            target_sample_rate (int): The target sample rate to use for the STT plugin. Defaults to 16000.
+            eager_eot_threshold (float): Eager end-of-turn threshold. Defaults to 0.6.
+            eot_threshold (float): End-of-turn threshold. Defaults to 0.8.
+            eot_timeout_ms (int): End-of-turn timeout in milliseconds. Defaults to 7000.
+            base_url (str): The base URL to use for the STT plugin. Defaults to "wss://api.deepgram.com/v2/listen".
+        """
+        super().__init__()
+
+        self.api_key = api_key or os.getenv("DEEPGRAM_API_KEY")
+        if not self.api_key:
+            raise ValueError(
+                "Deepgram API key must be provided either through api_key parameter or DEEPGRAM_API_KEY environment variable")
+
+        self.model = model
+        self.input_sample_rate = input_sample_rate
+        self.target_sample_rate = target_sample_rate
+        self.eager_eot_threshold = eager_eot_threshold
+        self.eot_threshold = eot_threshold
+        self.eot_timeout_ms = eot_timeout_ms
+        self.base_url = base_url
+
+        self._stream_buffer = bytearray()
+        self._target_chunk_size = int(0.1 * self.target_sample_rate * 2)
+        self._min_chunk_size = int(0.05 * self.target_sample_rate * 2)
+
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._ws_task: Optional[asyncio.Task] = None
+        self._last_transcript: str = ""
+        self._ws_task = None
+
+
+    async def process_audio(
+        self,
+        audio_frames: bytes,
+        **kwargs: Any
+    ) -> None:
+        """Process audio frames and send to Deepgram's Flux API"""
+
+        if not self._ws:
+            await self._connect_ws()
+            self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        try:
+            resampled_audio = self._resample_audio(audio_frames)
+            if not resampled_audio:
+                return
+
+            self._stream_buffer.extend(resampled_audio)
+            # chunk size 100ms
+            while len(self._stream_buffer) >= self._target_chunk_size:
+                chunk_to_send = bytes(self._stream_buffer[:self._target_chunk_size])
+                self._stream_buffer = self._stream_buffer[self._target_chunk_size:]
+
+                await self._ws.send_bytes(chunk_to_send)
+
+        except Exception as e:
+            logger.error(f"Error in process_audio: {str(e)}")
+            self.emit("error", str(e))
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+            if self._ws_task:
+                self._ws_task.cancel()
+                self._ws_task = None
+
+    async def _listen_for_responses(self) -> None:
+        """Background task to listen for WebSocket responses"""
+        if not self._ws:
+            return
+
+        try:
+            async for msg in self._ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = msg.json()
+                    responses = self._handle_ws_message(data)
+                    for response in responses:
+                        if self._transcript_callback:
+                            await self._transcript_callback(response)
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    logger.error(f"WebSocket error: {self._ws.exception()}")
+                    self.emit(
+                        "error", f"WebSocket error: {self._ws.exception()}")
+                    break
+        except Exception as e:
+            logger.error(f"Error in WebSocket listener: {str(e)}")
+            self.emit("error", f"Error in WebSocket listener: {str(e)}")
+        finally:
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+
+    async def _connect_ws(self) -> None:
+        """Establish WebSocket connection with Deepgram's Streaming API"""
+        if not self._session:
+            self._session = aiohttp.ClientSession()
+
+        query_params = {
+            "model": self.model,
+            "encoding": "linear16",
+            "sample_rate": self.target_sample_rate,
+            "eot_threshold": self.eot_threshold,
+            "eot_timeout_ms": self.eot_timeout_ms,
+            "eager_eot_threshold": self.eager_eot_threshold,
+        }
+        headers = {"Authorization": f"Token {self.api_key}"}
+        ws_url = f"{self.base_url}?{urlencode(query_params)}"
+
+        try:
+            self._ws = await self._session.ws_connect(ws_url, headers=headers)
+            logger.info("Connected to Deepgram V2 WebSocket.")
+        except Exception as e:
+            logger.error(f"Error connecting to WebSocket: {str(e)}")
+            raise
+
+    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
+        """Handle incoming WebSocket messages and generate STT responses"""
+        responses = []
+
+        try:
+            if msg.get("type") != "TurnInfo":
+                return responses
+
+            event = msg.get("event")
+            transcript = msg.get("transcript", "")
+            start_time = msg.get("audio_window_start", 0.0)
+            end_time = msg.get("audio_window_end", 0.0)
+            confidence = msg.get("end_of_turn_confidence", 0.0)
+
+            self._last_transcript = transcript
+
+            # Emit turn-related events
+            if event == "StartOfTurn":
+                global_event_emitter.emit("speech_started")
+            elif event == "EagerEndOfTurn":
+                # TODO
+                # global_event_emitter.emit("speech_eager_end")
+                pass
+            elif event == "EndOfTurn":
+                global_event_emitter.emit("speech_stopped")
+                if transcript and self._transcript_callback:
+                    responses.append(
+                        STTResponse(
+                            event_type=SpeechEventType.FINAL,
+                            data=SpeechData(
+                                text=transcript,
+                                confidence=confidence,
+                                start_time=start_time,
+                                end_time=end_time,
+                            ),
+                            metadata={"model": self.model},
+                        )
+                    )
+            elif event == "TurnResumed":
+                # TODO
+                # global_event_emitter.emit("speech_resumed")
+                pass
+
+            # Send interim transcript for ongoing turn
+            if transcript and event not in ("EndOfTurn",):
+                responses.append(
+                    STTResponse(
+                        event_type=SpeechEventType.INTERIM,
+                        data=SpeechData(
+                            text=transcript,
+                            confidence=confidence,
+                            start_time=start_time,
+                            end_time=end_time,
+                        ),
+                        metadata={"model": self.model},
+                    )
+                )
+
+        except Exception as e:
+            logger.error(f"Error handling WebSocket message: {str(e)}")
+
+        return responses
+
+    def _resample_audio(self, audio_bytes: bytes) -> bytes:
+        """Resample audio from input sample rate to target sample rate and convert to mono."""
+        try:
+            if not audio_bytes:
+                return b''
+
+            raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
+            if raw_audio.size == 0:
+                return b''
+
+            if raw_audio.size % 2 == 0:
+                stereo_audio = raw_audio.reshape(-1, 2)
+                mono_audio = stereo_audio.astype(np.float32).mean(axis=1)
+            else:
+                mono_audio = raw_audio.astype(np.float32)
+
+            if self.input_sample_rate != self.target_sample_rate:
+                target_length = int(len(mono_audio) * self.target_sample_rate / self.input_sample_rate)
+                resampled_data = signal.resample(mono_audio, target_length)
+            else:
+                resampled_data = mono_audio
+
+            resampled_data = np.clip(resampled_data, -32767, 32767)
+            return resampled_data.astype(np.int16).tobytes()
+
+        except Exception as e:
+            logger.error(f"Error resampling audio: {e}")
+            return b''
+
+
+    async def aclose(self) -> None:
+        """Cleanup resources"""
+
+        if len(self._stream_buffer) >= self._min_chunk_size and self._ws:
+            try:
+                final_chunk = bytes(self._stream_buffer)
+                await self._ws.send_bytes(final_chunk)
+            except Exception as e:
+                logger.error(f"Error sending final audio: {e}")
+
+        if self._ws:
+            try:
+                await self._ws.send_str(json.dumps({"type": "Terminate"}))
+                await asyncio.sleep(0.5)
+            except Exception as e:
+                logger.error(f"Error sending termination: {e}")
+
+        if self._ws_task:
+            self._ws_task.cancel()
+            try:
+                await self._ws_task
+            except asyncio.CancelledError:
+                pass
+            self._ws_task = None
+
+        if self._ws:
+            await self._ws.close()
+            self._ws = None
+
+        if self._session:
+            await self._session.close()
+            self._session = None
+        await super().aclose()
```
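For orientation, here is a minimal usage sketch of the new `DeepgramSTTV2` class, based only on the constructor and methods shown in the diff above. The `from videosdk.plugins.deepgram import DeepgramSTTV2` import assumes the new four-line `__init__.py` re-exports the class (its contents are not shown in this diff), and the frame source is hypothetical; in practice the videosdk-agents pipeline drives `process_audio`.

```python
# Minimal sketch, not part of the package. Assumes DEEPGRAM_API_KEY is set
# and that the new __init__.py re-exports DeepgramSTTV2.
import asyncio

from videosdk.plugins.deepgram import DeepgramSTTV2


async def main() -> None:
    stt = DeepgramSTTV2(
        model="flux-general-en",    # Deepgram Flux model (default in the diff)
        input_sample_rate=48000,    # rate of the incoming PCM frames
        target_sample_rate=16000,   # rate resampled to before sending
        eager_eot_threshold=0.6,    # eager end-of-turn confidence
        eot_threshold=0.8,          # end-of-turn confidence
        eot_timeout_ms=7000,        # end-of-turn timeout
    )
    try:
        # The agent pipeline normally calls process_audio() with 16-bit PCM
        # frames; an empty placeholder frame source is used here.
        for frame in []:  # e.g. frames from a microphone capture loop
            await stt.process_audio(frame)
    finally:
        # Flushes any buffered audio and sends {"type": "Terminate"}.
        await stt.aclose()


if __name__ == "__main__":
    asyncio.run(main())
```

Note that the three end-of-turn knobs (`eager_eot_threshold`, `eot_threshold`, `eot_timeout_ms`) are passed straight through as Flux query parameters in `_connect_ws`, so tuning them changes when the server emits `EagerEndOfTurn` and `EndOfTurn` events.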
```diff
--- /dev/null
+++ videosdk_plugins_deepgram-0.0.44/videosdk/plugins/deepgram/version.py
@@ -0,0 +1 @@
+__version__ = "0.0.44"

--- videosdk_plugins_deepgram-0.0.42/videosdk/plugins/deepgram/version.py
+++ /dev/null
@@ -1 +0,0 @@
-__version__ = "0.0.42"
```
The remaining files (.gitignore, README.md, stt.py, tts.py) are unchanged between 0.0.42 and 0.0.44.