videosdk-plugins-deepgram 0.0.38__py3-none-any.whl → 0.0.55__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
1
  from .stt import DeepgramSTT
2
+ from .stt_v2 import DeepgramSTTV2
2
3
  from .tts import DeepgramTTS
3
- __all__ = ["DeepgramSTT", "DeepgramTTS"]
4
# Public API of the package. Each name needs its own comma: adjacent string
# literals are implicitly concatenated, which would silently merge two names.
__all__ = ["DeepgramSTT", "DeepgramSTTV2", "DeepgramTTS"]
@@ -0,0 +1,291 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import numpy as np
6
+ from typing import Any, Optional
7
+ import os
8
+ from urllib.parse import urlencode
9
+ import aiohttp
10
+ from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
11
+ import logging
12
+
13
+ try:
14
+ from scipy import signal
15
+ SCIPY_AVAILABLE = True
16
+ except ImportError:
17
+ SCIPY_AVAILABLE = False
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class DeepgramSTTV2(BaseSTT):
    """Streaming speech-to-text plugin for Deepgram's v2 listen WebSocket API.

    Incoming PCM audio is down-mixed, resampled to ``target_sample_rate``,
    buffered into fixed-size chunks and streamed over a WebSocket. A
    background task reads ``TurnInfo`` messages from the socket and turns
    them into ``STTResponse`` objects delivered through the transcript
    callback provided by the base class.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        model: str = "flux-general-en",
        input_sample_rate: int = 48000,
        target_sample_rate: int = 16000,
        eager_eot_threshold: float = 0.6,
        eot_threshold: float = 0.8,
        eot_timeout_ms: int = 7000,
        base_url: str = "wss://api.deepgram.com/v2/listen",
        enable_preemptive_generation: bool = False,
    ) -> None:
        """Initialize the Deepgram STT plugin.

        Args:
            api_key (str | None, optional): Deepgram API key. Uses the DEEPGRAM_API_KEY environment variable if not provided. Defaults to None.
            model (str): The model to use for the STT plugin. Defaults to "flux-general-en".
            input_sample_rate (int): Sample rate of the audio passed to ``process_audio``. Defaults to 48000.
            target_sample_rate (int): Sample rate of the audio sent to Deepgram. Defaults to 16000.
            eager_eot_threshold (float): Eager end-of-turn threshold. Defaults to 0.6.
            eot_threshold (float): End-of-turn threshold. Defaults to 0.8.
            eot_timeout_ms (int): End-of-turn timeout in milliseconds. Defaults to 7000.
            base_url (str): WebSocket endpoint. Defaults to "wss://api.deepgram.com/v2/listen".
            enable_preemptive_generation (bool): Emit PREFLIGHT/INTERIM responses for EagerEndOfTurn/TurnResumed events. Defaults to False.

        Raises:
            ValueError: If no API key is given and DEEPGRAM_API_KEY is unset.
        """
        super().__init__()

        self.api_key = api_key or os.getenv("DEEPGRAM_API_KEY")
        if not self.api_key:
            raise ValueError(
                "Deepgram API key must be provided either through api_key parameter or DEEPGRAM_API_KEY environment variable")

        self.model = model
        self.input_sample_rate = input_sample_rate
        self.target_sample_rate = target_sample_rate
        self.eager_eot_threshold = eager_eot_threshold
        self.eot_threshold = eot_threshold
        self.eot_timeout_ms = eot_timeout_ms
        self.base_url = base_url
        self.enable_preemptive_generation = enable_preemptive_generation

        # Resampled 16-bit mono PCM waiting to be sent. Sizes are in bytes:
        # seconds * samples-per-second * 2 bytes per int16 sample.
        self._stream_buffer = bytearray()
        self._target_chunk_size = int(0.1 * self.target_sample_rate * 2)
        self._min_chunk_size = int(0.05 * self.target_sample_rate * 2)

        self._session: Optional[aiohttp.ClientSession] = None
        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
        self._ws_task: Optional[asyncio.Task] = None
        self._last_transcript: str = ""

        if not SCIPY_AVAILABLE and self.input_sample_rate != self.target_sample_rate:
            # Without this fallback every frame would fail to resample and
            # be dropped, so say up front that quality is degraded instead.
            logger.warning(
                "scipy is not installed; falling back to linear-interpolation "
                "resampling, which may reduce transcription quality")

    async def process_audio(
        self,
        audio_frames: bytes,
        **kwargs: Any
    ) -> None:
        """Resample audio frames and stream them to Deepgram's Flux API.

        Connects lazily on first use (and again after the connection was
        dropped). Connection failures propagate to the caller. Failures
        while resampling or sending are logged, emitted as an ``"error"``
        event, and cause the connection and listener task to be torn down
        so the next call reconnects.

        Args:
            audio_frames: Raw 16-bit PCM audio at ``input_sample_rate``.
            **kwargs: Accepted for interface compatibility; unused here.
        """
        if not self._ws:
            await self._connect_ws()
            self._ws_task = asyncio.create_task(self._listen_for_responses())

        try:
            resampled_audio = self._resample_audio(audio_frames)
            if not resampled_audio:
                return

            self._stream_buffer.extend(resampled_audio)

            # Send in fixed 100 ms chunks; any remainder stays buffered.
            chunk_size = self._target_chunk_size
            while len(self._stream_buffer) >= chunk_size:
                chunk_to_send = bytes(self._stream_buffer[:chunk_size])
                # Drain in place rather than re-slicing the whole buffer.
                del self._stream_buffer[:chunk_size]

                ws = self._ws
                if ws is None:
                    # The listener task may have closed the socket while we
                    # were awaiting a previous send.
                    raise ConnectionError("Deepgram WebSocket is not connected")
                await ws.send_bytes(chunk_to_send)

        except Exception as e:
            logger.error("Error in process_audio: %s", e)
            self.emit("error", str(e))
            # Detach state first so a failing close() cannot leave a dead
            # socket attached to the instance.
            ws, self._ws = self._ws, None
            task, self._ws_task = self._ws_task, None
            if ws:
                await ws.close()
            if task:
                task.cancel()

    async def _listen_for_responses(self) -> None:
        """Background task: read WebSocket messages and dispatch transcripts."""
        # Work on the socket this task was started for. ``self._ws`` may be
        # replaced by a reconnect before this task finishes unwinding, and
        # the cleanup below must not tear down a newer connection.
        ws = self._ws
        if not ws:
            return

        try:
            async for msg in ws:
                if msg.type == aiohttp.WSMsgType.TEXT:
                    data = msg.json()
                    for response in self._handle_ws_message(data):
                        if self._transcript_callback:
                            await self._transcript_callback(response)
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    logger.error("WebSocket error: %s", ws.exception())
                    self.emit(
                        "error", f"WebSocket error: {ws.exception()}")
                    break
        except Exception as e:
            logger.error("Error in WebSocket listener: %s", e)
            self.emit("error", f"Error in WebSocket listener: {str(e)}")
        finally:
            await ws.close()
            if self._ws is ws:
                self._ws = None

    async def _connect_ws(self) -> None:
        """Establish the WebSocket connection with Deepgram's streaming API.

        Creates the aiohttp session on first use.

        Raises:
            Exception: Whatever ``ws_connect`` raises; logged and re-raised.
        """
        if not self._session:
            self._session = aiohttp.ClientSession()

        query_params = {
            "model": self.model,
            "encoding": "linear16",
            "sample_rate": self.target_sample_rate,
            "eot_threshold": self.eot_threshold,
            "eot_timeout_ms": self.eot_timeout_ms,
            "eager_eot_threshold": self.eager_eot_threshold,
        }
        headers = {"Authorization": f"Token {self.api_key}"}
        ws_url = f"{self.base_url}?{urlencode(query_params)}"

        try:
            self._ws = await self._session.ws_connect(ws_url, headers=headers)
            logger.info("Connected to Deepgram V2 WebSocket.")
        except Exception as e:
            logger.error("Error connecting to WebSocket: %s", e)
            raise

    def _build_response(
        self,
        event_type: Any,
        text: str,
        confidence: float,
        start_time: float,
        end_time: float,
        **extra_metadata: Any,
    ) -> STTResponse:
        """Build an ``STTResponse`` tagged with the current model name."""
        return STTResponse(
            event_type=event_type,
            data=SpeechData(
                text=text,
                confidence=confidence,
                start_time=start_time,
                end_time=end_time,
            ),
            metadata={"model": self.model, **extra_metadata},
        )

    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
        """Translate one decoded WebSocket message into STT responses.

        Only ``TurnInfo`` messages are handled:

        * ``StartOfTurn`` emits the global ``speech_started`` event.
        * ``EagerEndOfTurn`` yields a PREFLIGHT response when preemptive
          generation is enabled.
        * ``EndOfTurn`` emits ``speech_stopped`` and yields a FINAL response.
        * ``TurnResumed`` yields an INTERIM response (preemptive mode only)
          signalling that the user kept speaking.

        Args:
            msg: The JSON-decoded message.

        Returns:
            Responses to deliver to the transcript callback; empty when the
            message produces none or handling fails (the failure is logged).
        """
        responses: list[STTResponse] = []

        try:
            if msg.get("type") != "TurnInfo":
                return responses

            event = msg.get("event")
            transcript = msg.get("transcript", "")
            start_time = msg.get("audio_window_start", 0.0)
            end_time = msg.get("audio_window_end", 0.0)
            confidence = msg.get("end_of_turn_confidence", 0.0)

            self._last_transcript = transcript

            if event == "StartOfTurn":
                global_event_emitter.emit("speech_started")
            elif event == "EagerEndOfTurn":
                if self.enable_preemptive_generation and transcript and self._transcript_callback:
                    responses.append(self._build_response(
                        SpeechEventType.PREFLIGHT,
                        transcript, confidence, start_time, end_time,
                    ))
            elif event == "EndOfTurn":
                logger.info(
                    "EndOfTurn (FINAL) Transcript: %s and Confidence: %s",
                    transcript, confidence)
                global_event_emitter.emit("speech_stopped")
                if transcript and self._transcript_callback:
                    responses.append(self._build_response(
                        SpeechEventType.FINAL,
                        transcript, confidence, start_time, end_time,
                    ))
            elif event == "TurnResumed":
                if self.enable_preemptive_generation and transcript:
                    responses.append(self._build_response(
                        SpeechEventType.INTERIM,
                        transcript, confidence, start_time, end_time,
                        turn_resumed=True,
                    ))

        except Exception as e:
            logger.error("Error handling WebSocket message: %s", e)

        return responses

    def _resample_audio(self, audio_bytes: bytes) -> bytes:
        """Down-mix to mono and resample to ``target_sample_rate``.

        Args:
            audio_bytes: Raw 16-bit PCM samples at ``input_sample_rate``.

        Returns:
            16-bit mono PCM bytes at the target rate, or ``b''`` when the
            input is empty or cannot be processed (the error is logged).
        """
        try:
            if not audio_bytes:
                return b''

            raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
            if raw_audio.size == 0:
                return b''

            # NOTE(review): an even sample count is treated as interleaved
            # stereo. Mono input with an even number of samples would be
            # averaged pairwise too - confirm callers always deliver stereo.
            if raw_audio.size % 2 == 0:
                mono_audio = raw_audio.reshape(-1, 2).astype(np.float32).mean(axis=1)
            else:
                mono_audio = raw_audio.astype(np.float32)

            if self.input_sample_rate != self.target_sample_rate:
                target_length = int(
                    len(mono_audio) * self.target_sample_rate / self.input_sample_rate)
                if target_length <= 0:
                    # Frame too short to yield a single output sample.
                    return b''
                if SCIPY_AVAILABLE:
                    resampled_data = signal.resample(mono_audio, target_length)
                else:
                    # scipy is optional: interpolate linearly instead of
                    # failing on the undefined ``signal`` name.
                    step = self.input_sample_rate / self.target_sample_rate
                    resampled_data = np.interp(
                        np.arange(target_length) * step,
                        np.arange(len(mono_audio)),
                        mono_audio,
                    )
            else:
                resampled_data = mono_audio

            # Resampling can overshoot the int16 range; clip before casting
            # so values do not wrap around.
            resampled_data = np.clip(resampled_data, -32767, 32767)
            return resampled_data.astype(np.int16).tobytes()

        except Exception as e:
            logger.error("Error resampling audio: %s", e)
            return b''

    async def aclose(self) -> None:
        """Flush pending audio, stop the listener and release all resources."""
        # Flush a trailing partial chunk only if it holds at least 50 ms.
        if len(self._stream_buffer) >= self._min_chunk_size and self._ws:
            try:
                await self._ws.send_bytes(bytes(self._stream_buffer))
            except Exception as e:
                logger.error("Error sending final audio: %s", e)
        # Either sent or too short to matter; do not keep stale audio around.
        self._stream_buffer.clear()

        if self._ws:
            try:
                # NOTE(review): confirm "Terminate" is the control message the
                # v2 listen endpoint expects for a graceful shutdown.
                await self._ws.send_str(json.dumps({"type": "Terminate"}))
                # Give the server a moment to deliver its last messages.
                await asyncio.sleep(0.5)
            except Exception as e:
                logger.error("Error sending termination: %s", e)

        if self._ws_task:
            self._ws_task.cancel()
            try:
                await self._ws_task
            except asyncio.CancelledError:
                pass
            self._ws_task = None

        # Detach before closing so the session is still released even if
        # closing the socket raises.
        ws, self._ws = self._ws, None
        session, self._session = self._session, None
        try:
            if ws is not None:
                await ws.close()
        finally:
            if session is not None:
                await session.close()
        await super().aclose()
@@ -81,12 +81,16 @@ class DeepgramTTS(TTS):
81
81
  self.emit("error", "Audio track or event loop not set")
82
82
  return
83
83
 
84
- await self.interrupt()
84
+ self._should_stop = True
85
+ if self._send_task and not self._send_task.done():
86
+ self._send_task.cancel()
87
+
85
88
  self._should_stop = False
86
89
  await self._stream_synthesis(text)
87
90
 
88
91
  except Exception as e:
89
92
  self.emit("error", f"TTS synthesis failed: {str(e)}")
93
+ raise
90
94
 
91
95
  async def _stream_synthesis(self, text: Union[AsyncIterator[str], str]) -> None:
92
96
  try:
@@ -1 +1 @@
1
- __version__ = "0.0.38"
1
+ __version__ = "0.0.55"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videosdk-plugins-deepgram
3
- Version: 0.0.38
3
+ Version: 0.0.55
4
4
  Summary: VideoSDK Agent Framework plugin for Deepgram
5
5
  Author: videosdk
6
6
  License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
12
12
  Classifier: Topic :: Multimedia :: Video
13
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
14
  Requires-Python: >=3.11
15
- Requires-Dist: videosdk-agents>=0.0.38
15
+ Requires-Dist: videosdk-agents>=0.0.55
16
16
  Description-Content-Type: text/markdown
17
17
 
18
18
  # VideoSDK Deepgram Plugin
@@ -0,0 +1,8 @@
1
+ videosdk/plugins/deepgram/__init__.py,sha256=3Bz_6gDJ7C2SHXHqZEmjpiwspGtZx7JT8ig9S8TSNDY,147
2
+ videosdk/plugins/deepgram/stt.py,sha256=FrhLg-57kqySa07Zo8yFJmMjcDcmdkihWcBMfFcxBRY,8309
3
+ videosdk/plugins/deepgram/stt_v2.py,sha256=ybC4dCOMPzc6iq8d0eyybvQaY0e_U_IkivR5kkrGRB4,11761
4
+ videosdk/plugins/deepgram/tts.py,sha256=ZcQoF50wmNnWsozo4iwOcZIxavGelLMQLqCRk3GvAvs,7025
5
+ videosdk/plugins/deepgram/version.py,sha256=jXOOiiBG8YrcSzw2nujXNZ67Lc4_BDh5cva6CjfFnJo,23
6
+ videosdk_plugins_deepgram-0.0.55.dist-info/METADATA,sha256=VZ34CjoCYKWNVRJFOJBYYm8j7SiiptfL507jqdh1llk,767
7
+ videosdk_plugins_deepgram-0.0.55.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
8
+ videosdk_plugins_deepgram-0.0.55.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,7 +0,0 @@
1
- videosdk/plugins/deepgram/__init__.py,sha256=gzJRVqNliLxQHAbCSiki_YiFqicxRCSKgTk1cCLRkFg,98
2
- videosdk/plugins/deepgram/stt.py,sha256=FrhLg-57kqySa07Zo8yFJmMjcDcmdkihWcBMfFcxBRY,8309
3
- videosdk/plugins/deepgram/tts.py,sha256=hd4oifQ3lRV-Ry57EGf-8VrcBm2bM9Fj3VUKSAfmgh8,6884
4
- videosdk/plugins/deepgram/version.py,sha256=R5QxTjVaID7odO0eBWpOnyCjNQxBZ7cpyruM_NMOoDc,23
5
- videosdk_plugins_deepgram-0.0.38.dist-info/METADATA,sha256=9mXxKWbfmNQ3BrruolL09sPjKPfeuCtZoKiven2VZfU,767
6
- videosdk_plugins_deepgram-0.0.38.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
7
- videosdk_plugins_deepgram-0.0.38.dist-info/RECORD,,