vision-agents-plugins-deepgram 0.1.9__tar.gz → 0.1.11__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of vision-agents-plugins-deepgram might be problematic. Click here for more details.
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/PKG-INFO +1 -1
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/vision_agents/plugins/deepgram/stt.py +21 -45
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/.gitignore +0 -0
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/README.md +0 -0
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/pyproject.toml +0 -0
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/vision_agents/plugins/deepgram/__init__.py +0 -0
- {vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/vision_agents/plugins/deepgram/utils.py +0 -0
|
@@ -3,7 +3,7 @@ import contextlib
|
|
|
3
3
|
import logging
|
|
4
4
|
import os
|
|
5
5
|
import time
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Any, Dict, Optional
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import websockets
|
|
@@ -20,11 +20,11 @@ from deepgram.listen.v1.socket_client import AsyncV1SocketClient
|
|
|
20
20
|
from getstream.video.rtc.track_util import PcmData
|
|
21
21
|
|
|
22
22
|
from vision_agents.core import stt
|
|
23
|
+
from vision_agents.core.stt import TranscriptResponse
|
|
23
24
|
|
|
24
25
|
from .utils import generate_silence
|
|
25
26
|
|
|
26
|
-
|
|
27
|
-
from vision_agents.core.edge.types import Participant
|
|
27
|
+
from vision_agents.core.edge.types import Participant
|
|
28
28
|
|
|
29
29
|
logger = logging.getLogger(__name__)
|
|
30
30
|
|
|
@@ -50,7 +50,6 @@ class STT(stt.STT):
|
|
|
50
50
|
self,
|
|
51
51
|
api_key: Optional[str] = None,
|
|
52
52
|
options: Optional[dict] = None,
|
|
53
|
-
sample_rate: int = 48000,
|
|
54
53
|
language: str = "en-US",
|
|
55
54
|
interim_results: bool = True,
|
|
56
55
|
client: Optional[AsyncDeepgramClient] = None,
|
|
@@ -70,7 +69,7 @@ class STT(stt.STT):
|
|
|
70
69
|
connection_timeout: Time to wait for the Deepgram connection to be established.
|
|
71
70
|
|
|
72
71
|
"""
|
|
73
|
-
super().__init__(
|
|
72
|
+
super().__init__(provider_name="deepgram")
|
|
74
73
|
|
|
75
74
|
# If no API key was provided, check for DEEPGRAM_API_KEY in environment
|
|
76
75
|
if api_key is None:
|
|
@@ -86,12 +85,13 @@ class STT(stt.STT):
|
|
|
86
85
|
client if client is not None else AsyncDeepgramClient(api_key=api_key)
|
|
87
86
|
)
|
|
88
87
|
self.dg_connection: Optional[AsyncV1SocketClient] = None
|
|
88
|
+
self.sample_rate = 48000
|
|
89
89
|
|
|
90
90
|
self.options = options or {
|
|
91
91
|
"model": "nova-2",
|
|
92
92
|
"language": language,
|
|
93
93
|
"encoding": "linear16",
|
|
94
|
-
"sample_rate": sample_rate,
|
|
94
|
+
"sample_rate": self.sample_rate,
|
|
95
95
|
"channels": 1,
|
|
96
96
|
"interim_results": interim_results,
|
|
97
97
|
}
|
|
@@ -101,7 +101,7 @@ class STT(stt.STT):
|
|
|
101
101
|
|
|
102
102
|
# Generate a silence audio to use as keep-alive message
|
|
103
103
|
self._keep_alive_data = generate_silence(
|
|
104
|
-
sample_rate=sample_rate, duration_ms=10
|
|
104
|
+
sample_rate=self.sample_rate, duration_ms=10
|
|
105
105
|
)
|
|
106
106
|
self._keep_alive_interval = keep_alive_interval
|
|
107
107
|
|
|
@@ -121,7 +121,7 @@ class STT(stt.STT):
|
|
|
121
121
|
"""
|
|
122
122
|
Start the main task establishing the Deepgram connection and processing the events.
|
|
123
123
|
"""
|
|
124
|
-
if self.
|
|
124
|
+
if self.closed:
|
|
125
125
|
logger.warning("Cannot setup connection - Deepgram instance is closed")
|
|
126
126
|
return None
|
|
127
127
|
|
|
@@ -178,15 +178,8 @@ class STT(stt.STT):
|
|
|
178
178
|
)
|
|
179
179
|
|
|
180
180
|
async def close(self):
|
|
181
|
+
await super().close()
|
|
181
182
|
"""Close the Deepgram connection and clean up resources."""
|
|
182
|
-
if self._is_closed:
|
|
183
|
-
logger.debug("Deepgram STT service already closed")
|
|
184
|
-
return
|
|
185
|
-
|
|
186
|
-
logger.info("Closing Deepgram STT service")
|
|
187
|
-
self._is_closed = True
|
|
188
|
-
|
|
189
|
-
# Close the Deepgram connection if it exists
|
|
190
183
|
if self.dg_connection:
|
|
191
184
|
logger.debug("Closing Deepgram connection")
|
|
192
185
|
try:
|
|
@@ -225,20 +218,17 @@ class STT(stt.STT):
|
|
|
225
218
|
# Check if this is a final result
|
|
226
219
|
is_final = transcript.get("is_final", False)
|
|
227
220
|
|
|
228
|
-
# Create metadata
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
"is_final": is_final,
|
|
233
|
-
"channel_index": transcript.get("channel_index", 0),
|
|
234
|
-
}
|
|
221
|
+
# Create response metadata
|
|
222
|
+
response_metadata = TranscriptResponse(
|
|
223
|
+
confidence=alternatives[0].get("confidence", 0),
|
|
224
|
+
)
|
|
235
225
|
|
|
236
226
|
# Emit immediately for real-time responsiveness
|
|
237
227
|
if is_final:
|
|
238
|
-
self._emit_transcript_event(transcript_text, self._current_user,
|
|
228
|
+
self._emit_transcript_event(transcript_text, self._current_user, response_metadata)
|
|
239
229
|
else:
|
|
240
230
|
self._emit_partial_transcript_event(
|
|
241
|
-
transcript_text, self._current_user,
|
|
231
|
+
transcript_text, self._current_user, response_metadata
|
|
242
232
|
)
|
|
243
233
|
|
|
244
234
|
logger.debug(
|
|
@@ -246,7 +236,7 @@ class STT(stt.STT):
|
|
|
246
236
|
extra={
|
|
247
237
|
"is_final": is_final,
|
|
248
238
|
"text_length": len(transcript_text),
|
|
249
|
-
"confidence":
|
|
239
|
+
"confidence": response_metadata.confidence,
|
|
250
240
|
},
|
|
251
241
|
)
|
|
252
242
|
|
|
@@ -261,29 +251,15 @@ class STT(stt.STT):
|
|
|
261
251
|
logger.warning(f"Deepgram connection closed. message={message}")
|
|
262
252
|
await self.close()
|
|
263
253
|
|
|
264
|
-
async def
|
|
254
|
+
async def process_audio(
|
|
265
255
|
self,
|
|
266
256
|
pcm_data: PcmData,
|
|
267
|
-
|
|
268
|
-
)
|
|
269
|
-
|
|
270
|
-
Process audio data through Deepgram for transcription.
|
|
271
|
-
|
|
272
|
-
Args:
|
|
273
|
-
pcm_data: The PCM audio data to process.
|
|
274
|
-
user_metadata: Additional metadata about the user or session.
|
|
275
|
-
|
|
276
|
-
Returns:
|
|
277
|
-
None - Deepgram operates in asynchronous mode and emits events directly
|
|
278
|
-
when transcripts arrive from the streaming service.
|
|
279
|
-
"""
|
|
280
|
-
if self._is_closed:
|
|
257
|
+
participant: Optional[Participant] = None,
|
|
258
|
+
):
|
|
259
|
+
if self.closed:
|
|
281
260
|
logger.warning("Deepgram connection is closed, ignoring audio")
|
|
282
261
|
return None
|
|
283
262
|
|
|
284
|
-
# Store the current user context for transcript events
|
|
285
|
-
self._current_user = user_metadata # type: ignore[assignment]
|
|
286
|
-
|
|
287
263
|
# Check if the input sample rate matches the expected sample rate
|
|
288
264
|
if pcm_data.sample_rate != self.sample_rate:
|
|
289
265
|
logger.warning(
|
|
@@ -334,7 +310,7 @@ class STT(stt.STT):
|
|
|
334
310
|
Send the silence audio every `interval` seconds
|
|
335
311
|
to prevent Deepgram from closing the connection.
|
|
336
312
|
"""
|
|
337
|
-
while not self.
|
|
313
|
+
while not self.closed and self.dg_connection is not None:
|
|
338
314
|
if self._last_sent_at + self._keep_alive_interval <= time.time():
|
|
339
315
|
logger.debug("Sending keepalive packet to Deepgram...")
|
|
340
316
|
# Send audio silence to keep the connection open
|
|
File without changes
|
|
File without changes
|
{vision_agents_plugins_deepgram-0.1.9 → vision_agents_plugins_deepgram-0.1.11}/pyproject.toml
RENAMED
|
File without changes
|
|
File without changes
|