videosdk-plugins-deepgram 0.0.42__py3-none-any.whl → 0.0.44__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-deepgram might be problematic. Click here for more details.

@@ -1,3 +1,4 @@
1
1
  from .stt import DeepgramSTT
2
+ from .stt_v2 import DeepgramSTTV2
2
3
  from .tts import DeepgramTTS
3
- __all__ = ["DeepgramSTT", "DeepgramTTS"]
4
# Public API of the package. Every entry must be its own string literal:
# adjacent literals without a comma are silently concatenated by Python.
__all__ = ["DeepgramSTT", "DeepgramSTTV2", "DeepgramTTS"]
@@ -0,0 +1,280 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import numpy as np
6
+ from typing import Any, Optional
7
+ import os
8
+ from urllib.parse import urlencode
9
+ import aiohttp
10
+ from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
11
+ import logging
12
+
13
+ try:
14
+ from scipy import signal
15
+ SCIPY_AVAILABLE = True
16
+ except ImportError:
17
+ SCIPY_AVAILABLE = False
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class DeepgramSTTV2(BaseSTT):
    """Streaming speech-to-text plugin for Deepgram's ``/v2/listen`` WebSocket API.

    Audio passed to :meth:`process_audio` is down-mixed and resampled to
    ``target_sample_rate``, buffered, and sent to the WebSocket in fixed-size
    chunks. ``TurnInfo`` messages received on the socket are converted into
    ``STTResponse`` objects and awaited through ``self._transcript_callback``
    (presumably installed by the base class -- it is not assigned in this
    class).
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        model: str = "flux-general-en",
        input_sample_rate: int = 48000,
        target_sample_rate: int = 16000,
        eager_eot_threshold: float = 0.6,
        eot_threshold: float = 0.8,
        eot_timeout_ms: int = 7000,
        base_url: str = "wss://api.deepgram.com/v2/listen",
    ) -> None:
        """Initialize the Deepgram STT (v2) plugin.

        Args:
            api_key (str | None, optional): Deepgram API key. Uses the DEEPGRAM_API_KEY environment variable if not provided. Defaults to None.
            model (str): The model to use for the STT plugin. Defaults to "flux-general-en".
            input_sample_rate (int): Sample rate of the audio handed to ``process_audio``. Defaults to 48000.
            target_sample_rate (int): Sample rate the audio is resampled to before sending. Defaults to 16000.
            eager_eot_threshold (float): Eager end-of-turn threshold. Defaults to 0.6.
            eot_threshold (float): End-of-turn threshold. Defaults to 0.8.
            eot_timeout_ms (int): End-of-turn timeout in milliseconds. Defaults to 7000.
            base_url (str): WebSocket endpoint. Defaults to "wss://api.deepgram.com/v2/listen".

        Raises:
            ValueError: If no API key is given and DEEPGRAM_API_KEY is unset.
            ImportError: If the sample rates differ (resampling is required)
                but scipy is not installed.
        """
        super().__init__()

        self.api_key = api_key or os.getenv("DEEPGRAM_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Deepgram API key must be provided either through api_key parameter or DEEPGRAM_API_KEY environment variable")

        # Fail fast: without scipy, _resample_audio would hit a NameError on
        # every frame, log it, and silently send no audio at all.
        if input_sample_rate != target_sample_rate and not SCIPY_AVAILABLE:
            raise ImportError(
                f"scipy is required to resample audio from {input_sample_rate} Hz "
                f"to {target_sample_rate} Hz. Install it with 'pip install scipy'.")

        self.model = model
        self.input_sample_rate = input_sample_rate
        self.target_sample_rate = target_sample_rate
        self.eager_eot_threshold = eager_eot_threshold
        self.eot_threshold = eot_threshold
        self.eot_timeout_ms = eot_timeout_ms
        self.base_url = base_url

        # Resampled 16-bit mono PCM waiting to be sent (2 bytes per sample).
        self._stream_buffer = bytearray()
        # 100 ms of audio per WebSocket frame.
        self._target_chunk_size = int(0.1 * self.target_sample_rate * 2)
        # 50 ms: smallest leftover worth flushing when closing.
        self._min_chunk_size = int(0.05 * self.target_sample_rate * 2)

        self._session: Optional[aiohttp.ClientSession] = None
        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
        self._ws_task: Optional[asyncio.Task] = None
        self._last_transcript: str = ""

    async def process_audio(
        self,
        audio_frames: bytes,
        **kwargs: Any
    ) -> None:
        """Process audio frames and send them to Deepgram's Flux API.

        Opens the WebSocket (and starts the listener task) on first use or
        after the connection was dropped. Connection errors raised by
        ``_connect_ws`` propagate to the caller; errors while resampling or
        sending are logged, emitted as an ``"error"`` event, and tear down the
        socket and listener task so the next call reconnects.

        Args:
            audio_frames: Raw 16-bit PCM bytes at ``input_sample_rate``.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if not self._ws:
            await self._connect_ws()
            self._ws_task = asyncio.create_task(self._listen_for_responses())

        try:
            resampled_audio = self._resample_audio(audio_frames)
            if not resampled_audio:
                return

            self._stream_buffer.extend(resampled_audio)
            # chunk size 100ms
            chunk_size = self._target_chunk_size
            while len(self._stream_buffer) >= chunk_size:
                chunk_to_send = bytes(self._stream_buffer[:chunk_size])
                # Drop the sent prefix in place instead of copying the tail.
                del self._stream_buffer[:chunk_size]

                await self._ws.send_bytes(chunk_to_send)

        except Exception as e:
            logger.error(f"Error in process_audio: {str(e)}")
            self.emit("error", str(e))
            if self._ws:
                await self._ws.close()
                self._ws = None
            if self._ws_task:
                self._ws_task.cancel()
                self._ws_task = None

    async def _listen_for_responses(self) -> None:
        """Background task: read WebSocket messages and dispatch transcripts.

        Runs until the socket closes, an ERROR frame arrives, an exception is
        raised, or the task is cancelled. On exit it closes the socket it was
        listening on.
        """
        # Bind the socket once: self._ws may be cleared or replaced by
        # process_audio/aclose while this task is suspended.
        ws = self._ws
        if not ws:
            return

        try:
            async for msg in ws:
                if msg.type == aiohttp.WSMsgType.TEXT:
                    data = msg.json()
                    for response in self._handle_ws_message(data):
                        if self._transcript_callback:
                            await self._transcript_callback(response)
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    error_message = f"WebSocket error: {ws.exception()}"
                    logger.error(error_message)
                    self.emit("error", error_message)
                    break
        except Exception as e:
            logger.error(f"Error in WebSocket listener: {str(e)}")
            self.emit("error", f"Error in WebSocket listener: {str(e)}")
        finally:
            if not ws.closed:
                await ws.close()
            # Only clear the shared reference if it still points at our
            # socket, so a connection opened after a reconnect is left alone.
            if self._ws is ws:
                self._ws = None

    async def _connect_ws(self) -> None:
        """Establish the WebSocket connection with Deepgram's streaming API.

        Creates the aiohttp session on first use and stores the connected
        socket in ``self._ws``.

        Raises:
            Exception: Whatever ``ws_connect`` raises; it is logged and
                re-raised unchanged.
        """
        if not self._session:
            self._session = aiohttp.ClientSession()

        query_params = {
            "model": self.model,
            "encoding": "linear16",
            "sample_rate": self.target_sample_rate,
            "eot_threshold": self.eot_threshold,
            "eot_timeout_ms": self.eot_timeout_ms,
            "eager_eot_threshold": self.eager_eot_threshold,
        }
        headers = {"Authorization": f"Token {self.api_key}"}
        ws_url = f"{self.base_url}?{urlencode(query_params)}"

        try:
            self._ws = await self._session.ws_connect(ws_url, headers=headers)
            logger.info("Connected to Deepgram V2 WebSocket.")
        except Exception as e:
            logger.error(f"Error connecting to WebSocket: {str(e)}")
            raise

    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
        """Translate one decoded WebSocket message into STT responses.

        Only messages whose ``"type"`` is ``"TurnInfo"`` are processed.
        ``StartOfTurn`` / ``EndOfTurn`` events additionally emit
        ``"speech_started"`` / ``"speech_stopped"`` on the global event
        emitter.

        Args:
            msg: The JSON-decoded message.

        Returns:
            A list with at most one response: FINAL for ``EndOfTurn`` (only
            when a transcript and a transcript callback are present), INTERIM
            for any other event carrying a non-empty transcript. Empty on
            unrelated messages or if handling fails (the error is logged).
        """
        responses: list[STTResponse] = []

        try:
            if msg.get("type") != "TurnInfo":
                return responses

            event = msg.get("event")
            transcript = msg.get("transcript", "")
            start_time = msg.get("audio_window_start", 0.0)
            end_time = msg.get("audio_window_end", 0.0)
            confidence = msg.get("end_of_turn_confidence", 0.0)

            self._last_transcript = transcript

            def build_response(event_type: SpeechEventType) -> STTResponse:
                # FINAL and INTERIM responses differ only in their event type.
                return STTResponse(
                    event_type=event_type,
                    data=SpeechData(
                        text=transcript,
                        confidence=confidence,
                        start_time=start_time,
                        end_time=end_time,
                    ),
                    metadata={"model": self.model},
                )

            # Emit turn-related events
            if event == "StartOfTurn":
                global_event_emitter.emit("speech_started")
            elif event == "EndOfTurn":
                global_event_emitter.emit("speech_stopped")
                if transcript and self._transcript_callback:
                    responses.append(build_response(SpeechEventType.FINAL))
            # TODO: "EagerEndOfTurn" and "TurnResumed" emit no dedicated
            # events yet ("speech_eager_end" / "speech_resumed").

            # Send interim transcript for ongoing turn
            if transcript and event != "EndOfTurn":
                responses.append(build_response(SpeechEventType.INTERIM))

        except Exception as e:
            logger.error(f"Error handling WebSocket message: {str(e)}")

        return responses

    def _resample_audio(self, audio_bytes: bytes) -> bytes:
        """Resample audio from input sample rate to target sample rate and convert to mono.

        Args:
            audio_bytes: Raw int16 PCM samples.

        Returns:
            int16 mono PCM bytes at ``target_sample_rate``; ``b''`` for empty
            input or if conversion fails (the error is logged).
        """
        try:
            if not audio_bytes:
                return b''

            raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
            if raw_audio.size == 0:
                return b''

            # NOTE(review): an even sample count is taken to mean interleaved
            # stereo. A mono frame with an even number of samples would be
            # averaged pairwise too -- confirm callers always supply stereo.
            if raw_audio.size % 2 == 0:
                stereo_audio = raw_audio.reshape(-1, 2)
                mono_audio = stereo_audio.astype(np.float32).mean(axis=1)
            else:
                mono_audio = raw_audio.astype(np.float32)

            if self.input_sample_rate != self.target_sample_rate:
                target_length = int(len(mono_audio) * self.target_sample_rate / self.input_sample_rate)
                resampled_data = signal.resample(mono_audio, target_length)
            else:
                resampled_data = mono_audio

            # Keep values inside the int16 range before the cast below.
            resampled_data = np.clip(resampled_data, -32767, 32767)
            return resampled_data.astype(np.int16).tobytes()

        except Exception as e:
            logger.error(f"Error resampling audio: {e}")
            return b''

    async def aclose(self) -> None:
        """Flush pending audio, stop the listener and release all resources.

        Order: send any buffered audio of at least ``_min_chunk_size`` bytes,
        send the termination message, cancel the listener task, close the
        socket and the HTTP session, then delegate to the base class.
        """
        if self._ws and len(self._stream_buffer) >= self._min_chunk_size:
            try:
                final_chunk = bytes(self._stream_buffer)
                await self._ws.send_bytes(final_chunk)
            except Exception as e:
                logger.error(f"Error sending final audio: {e}")
        # Never carry stale audio past shutdown.
        self._stream_buffer.clear()

        if self._ws:
            try:
                # NOTE(review): confirm "Terminate" is the control message the
                # Deepgram v2 listen endpoint expects for ending a stream.
                await self._ws.send_str(json.dumps({"type": "Terminate"}))
                # Presumably leaves time for trailing server messages before
                # the listener is cancelled below.
                await asyncio.sleep(0.5)
            except Exception as e:
                logger.error(f"Error sending termination: {e}")

        if self._ws_task:
            self._ws_task.cancel()
            try:
                await self._ws_task
            except asyncio.CancelledError:
                pass
            self._ws_task = None

        if self._ws:
            await self._ws.close()
            self._ws = None

        if self._session:
            await self._session.close()
            self._session = None
        await super().aclose()
@@ -1 +1 @@
1
- __version__ = "0.0.42"
1
+ __version__ = "0.0.44"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videosdk-plugins-deepgram
3
- Version: 0.0.42
3
+ Version: 0.0.44
4
4
  Summary: VideoSDK Agent Framework plugin for Deepgram
5
5
  Author: videosdk
6
6
  License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
12
12
  Classifier: Topic :: Multimedia :: Video
13
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
14
  Requires-Python: >=3.11
15
- Requires-Dist: videosdk-agents>=0.0.42
15
+ Requires-Dist: videosdk-agents>=0.0.44
16
16
  Description-Content-Type: text/markdown
17
17
 
18
18
  # VideoSDK Deepgram Plugin
@@ -0,0 +1,8 @@
1
+ videosdk/plugins/deepgram/__init__.py,sha256=3Bz_6gDJ7C2SHXHqZEmjpiwspGtZx7JT8ig9S8TSNDY,147
2
+ videosdk/plugins/deepgram/stt.py,sha256=FrhLg-57kqySa07Zo8yFJmMjcDcmdkihWcBMfFcxBRY,8309
3
+ videosdk/plugins/deepgram/stt_v2.py,sha256=Q9GL_zunKnExXYoaMSbqzSLzOdiFXjNtM3sOOlssYLw,10735
4
+ videosdk/plugins/deepgram/tts.py,sha256=hd4oifQ3lRV-Ry57EGf-8VrcBm2bM9Fj3VUKSAfmgh8,6884
5
+ videosdk/plugins/deepgram/version.py,sha256=BBkwy5ixyfFpzkdcy0ZL3_AIHWjaIidRK81NCFvqzCg,23
6
+ videosdk_plugins_deepgram-0.0.44.dist-info/METADATA,sha256=BzpR_0dQKUfNy_glax7IWkDOUK1naa5nnvjwLKH-MdY,767
7
+ videosdk_plugins_deepgram-0.0.44.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
8
+ videosdk_plugins_deepgram-0.0.44.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- videosdk/plugins/deepgram/__init__.py,sha256=gzJRVqNliLxQHAbCSiki_YiFqicxRCSKgTk1cCLRkFg,98
2
- videosdk/plugins/deepgram/stt.py,sha256=FrhLg-57kqySa07Zo8yFJmMjcDcmdkihWcBMfFcxBRY,8309
3
- videosdk/plugins/deepgram/tts.py,sha256=hd4oifQ3lRV-Ry57EGf-8VrcBm2bM9Fj3VUKSAfmgh8,6884
4
- videosdk/plugins/deepgram/version.py,sha256=8xjpb8W03gbMhwi-AuZS56wu4p52tvJab-FgIbCxfag,23
5
- videosdk_plugins_deepgram-0.0.42.dist-info/METADATA,sha256=4AoZ0wWkD6lGdoRGeaS-N2Z811-2E1fZTUrCs5gnhwc,767
6
- videosdk_plugins_deepgram-0.0.42.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
- videosdk_plugins_deepgram-0.0.42.dist-info/RECORD,,