videosdk-plugins-deepgram 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-deepgram might be problematic. Click here for more details.
- videosdk_plugins_deepgram-0.0.1/.gitignore +10 -0
- videosdk_plugins_deepgram-0.0.1/PKG-INFO +25 -0
- videosdk_plugins_deepgram-0.0.1/README.md +9 -0
- videosdk_plugins_deepgram-0.0.1/pyproject.toml +33 -0
- videosdk_plugins_deepgram-0.0.1/videosdk/plugins/deepgram/__init__.py +3 -0
- videosdk_plugins_deepgram-0.0.1/videosdk/plugins/deepgram/stt.py +191 -0
- videosdk_plugins_deepgram-0.0.1/videosdk/plugins/deepgram/version.py +1 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: videosdk-plugins-deepgram
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: VideoSDK Agent Framework plugin for Deepgram
|
|
5
|
+
Author: videosdk
|
|
6
|
+
Keywords: ai,audio,deepgram,video,videosdk
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Topic :: Communications :: Conferencing
|
|
10
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
11
|
+
Classifier: Topic :: Multimedia :: Video
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Requires-Dist: videosdk-agents>=0.0.9
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
VideoSDK Deepgram Plugin
|
|
18
|
+
|
|
19
|
+
Agent Framework plugin for speech-to-text (STT) services from Deepgram.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install videosdk-plugins-deepgram
|
|
25
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "videosdk-plugins-deepgram"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "VideoSDK Agent Framework plugin for Deepgram"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
authors = [{ name = "videosdk"}]
|
|
12
|
+
keywords = ["video", "audio", "ai", "videosdk", "deepgram"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Intended Audience :: Developers",
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Topic :: Communications :: Conferencing",
|
|
18
|
+
"Topic :: Multimedia :: Sound/Audio",
|
|
19
|
+
"Topic :: Multimedia :: Video",
|
|
20
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"videosdk-agents>=0.0.9"
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[tool.hatch.version]
|
|
27
|
+
path = "videosdk/plugins/deepgram/version.py"
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.wheel]
|
|
30
|
+
packages = ["videosdk"]
|
|
31
|
+
|
|
32
|
+
[tool.hatch.build.targets.sdist]
|
|
33
|
+
include = ["/videosdk"]
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import time
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
import os
|
|
7
|
+
from urllib.parse import urlencode
|
|
8
|
+
import aiohttp
|
|
9
|
+
from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
|
|
10
|
+
|
|
11
|
+
class DeepgramSTT(BaseSTT):
    """Streaming speech-to-text plugin that talks to Deepgram over a WebSocket.

    Audio passed to :meth:`process_audio` is forwarded as binary WebSocket
    frames. A background task reads the JSON messages coming back, converts
    ``Results`` messages into ``STTResponse`` objects and hands them to
    ``self._transcript_callback`` (not defined in this class; presumably
    provided by the ``BaseSTT`` base class -- confirm).

    The connection is opened lazily on the first ``process_audio`` call and is
    reopened on the next call after any failure. Call :meth:`aclose` to release
    the listener task, the WebSocket and the HTTP session.
    """

    def __init__(
        self,
        *,
        api_key: str | None = None,
        model: str = "nova-2",
        language: str = "en-US",
        interim_results: bool = True,
        punctuate: bool = True,
        smart_format: bool = True,
        sample_rate: int = 48000,
        endpointing: int = 50,
        filler_words: bool = True,
        base_url: str = "wss://api.deepgram.com/v1/listen",
    ) -> None:
        """Store the configuration; no network connection is opened here.

        Args:
            api_key: Deepgram API key. Falls back to the ``DEEPGRAM_API_KEY``
                environment variable when omitted or empty.
            model: Sent as the ``model`` query parameter.
            language: Sent as the ``language`` query parameter and copied into
                every ``SpeechData`` produced.
            interim_results: Sent as the ``interim_results`` query parameter.
            punctuate: Sent as the ``punctuate`` query parameter.
            smart_format: Sent as the ``smart_format`` query parameter.
            sample_rate: Sent as the ``sample_rate`` query parameter.
            endpointing: Sent as the ``endpointing`` query parameter
                (presumably milliseconds of silence -- confirm against the
                Deepgram documentation).
            filler_words: Sent as the ``filler_words`` query parameter.
            base_url: WebSocket endpoint the query string is appended to.

        Raises:
            ValueError: If no API key is given and ``DEEPGRAM_API_KEY`` is
                unset or empty.
        """
        super().__init__()

        self.api_key = api_key or os.getenv("DEEPGRAM_API_KEY")
        if not self.api_key:
            raise ValueError("Deepgram API key must be provided either through api_key parameter or DEEPGRAM_API_KEY environment variable")

        self.model = model
        self.language = language
        self.sample_rate = sample_rate
        self.interim_results = interim_results
        self.punctuate = punctuate
        self.smart_format = smart_format
        self.endpointing = endpointing
        self.filler_words = filler_words
        self.base_url = base_url

        # WebSocket session for streaming; all three are created lazily.
        self._session: Optional[aiohttp.ClientSession] = None
        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
        self._ws_task: Optional[asyncio.Task] = None

        # Wall-clock times (time.time()) of the two most recent "SpeechStarted"
        # messages. 0.0 means "none seen yet". The "previous" value is recorded
        # but not read anywhere in this class.
        self._last_speech_event_time = 0.0
        self._previous_speech_event_time = 0.0

    async def process_audio(
        self,
        audio_frames: bytes,
        language: Optional[str] = None,
        **kwargs: Any
    ) -> None:
        """Process audio frames and send to Deepgram's Streaming API.

        Opens the WebSocket and starts the listener task if there is no live
        connection. A failure while *sending* is printed, emitted as an
        ``"error"`` event and the connection is torn down so the next call
        reconnects. A failure while *connecting* propagates to the caller.

        Args:
            audio_frames: Raw audio bytes, sent unchanged as one binary frame.
                The connection declares ``linear16`` / 2 channels, so the
                frames are presumably expected in that layout -- confirm
                against the caller.
            language: Accepted for interface compatibility; not used here.
            **kwargs: Accepted for interface compatibility; not used here.
        """
        if not self._ws:
            await self._connect_ws()
            # Start listening for responses in background
            self._ws_task = asyncio.create_task(self._listen_for_responses())

        try:
            await self._ws.send_bytes(audio_frames)
        except Exception as e:
            print(f"Error in process_audio: {e}")
            self.emit("error", str(e))
            # Detach both handles *before* the first await. Otherwise a
            # process_audio call that runs while close() is pending could
            # reconnect and then have its fresh listener task cancelled here.
            failed_ws, self._ws = self._ws, None
            failed_task, self._ws_task = self._ws_task, None
            if failed_task:
                failed_task.cancel()
            if failed_ws:
                await failed_ws.close()

    async def _listen_for_responses(self) -> None:
        """Background task to listen for WebSocket responses.

        Runs until the socket is exhausted, an ERROR frame arrives, an
        exception is raised or the task is cancelled. Text frames are parsed as
        JSON and turned into responses by :meth:`_handle_ws_message`; each
        response is awaited through ``self._transcript_callback`` when set.
        """
        # Bind to the socket that was current when the task started. Using a
        # local keeps this task from touching a newer connection that
        # process_audio may have opened after a reset.
        ws = self._ws
        if not ws:
            return

        try:
            async for msg in ws:
                if msg.type == aiohttp.WSMsgType.TEXT:
                    data = msg.json()
                    for response in self._handle_ws_message(data):
                        if self._transcript_callback:
                            await self._transcript_callback(response)
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    self.emit("error", f"WebSocket error: {ws.exception()}")
                    break
        except Exception as e:
            self.emit("error", f"Error in WebSocket listener: {e}")
        finally:
            await ws.close()
            # Only clear the shared handle if it still refers to our socket.
            if self._ws is ws:
                self._ws = None

    async def _connect_ws(self) -> None:
        """Establish WebSocket connection with Deepgram's Streaming API.

        Creates the ``aiohttp.ClientSession`` on first use, builds the query
        string from the instance configuration and stores the connected socket
        in ``self._ws``.

        Raises:
            Exception: Whatever ``ws_connect`` raises is printed and re-raised
                unchanged.
        """
        if not self._session:
            self._session = aiohttp.ClientSession()

        query_params = {
            "model": self.model,
            "language": self.language,
            # Booleans are sent as lowercase "true"/"false".
            "interim_results": str(self.interim_results).lower(),
            "punctuate": str(self.punctuate).lower(),
            "smart_format": str(self.smart_format).lower(),
            "encoding": "linear16",
            "sample_rate": str(self.sample_rate),
            "channels": 2,
            "endpointing": self.endpointing,
            "filler_words": str(self.filler_words).lower(),
            # vad_events is what makes the server send the "SpeechStarted"
            # messages handled in _handle_ws_message (per Deepgram docs).
            "vad_events": "true",
            "no_delay": "true",
        }
        headers = {
            "Authorization": f"Token {self.api_key}",
        }

        ws_url = f"{self.base_url}?{urlencode(query_params)}"

        try:
            self._ws = await self._session.ws_connect(ws_url, headers=headers)
        except Exception as e:
            print(f"Error connecting to WebSocket: {e}")
            raise

    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
        """Handle incoming WebSocket messages and generate STT responses.

        * ``"SpeechStarted"``: the first such message only records its arrival
          time. Later ones emit ``"speech_started"`` on ``global_event_emitter``
          when less than one second has passed since the previous
          ``"SpeechStarted"``, then update the stored timestamps.
        * ``"Results"``: the first alternative with a non-empty transcript
          becomes one ``STTResponse`` (FINAL when ``is_final`` is true,
          INTERIM otherwise).
        * Any other type is ignored.

        Args:
            msg: Decoded JSON message from the WebSocket.

        Returns:
            A list with zero or one ``STTResponse``. Errors while handling the
            message are printed and yield an empty list; they never propagate.
        """
        responses: list[STTResponse] = []
        try:
            msg_type = msg.get("type")

            if msg_type == "SpeechStarted":
                current_time = time.time()

                if self._last_speech_event_time == 0.0:
                    self._last_speech_event_time = current_time
                    return responses

                if current_time - self._last_speech_event_time < 1.0:
                    global_event_emitter.emit("speech_started")

                self._previous_speech_event_time = self._last_speech_event_time
                self._last_speech_event_time = current_time

            elif msg_type == "Results":
                alternatives = msg["channel"]["alternatives"]
                if not alternatives:
                    return responses

                alt = alternatives[0]
                transcript = alt.get("transcript", "")
                if not transcript:
                    return responses

                # Word timings are optional: a message without them still
                # carries a usable transcript, so fall back to 0.0 instead of
                # discarding the result.
                words = alt.get("words") or []
                is_final = bool(msg.get("is_final", False))

                responses.append(
                    STTResponse(
                        event_type=SpeechEventType.FINAL if is_final else SpeechEventType.INTERIM,
                        data=SpeechData(
                            text=transcript,
                            language=self.language,
                            confidence=alt.get("confidence", 0.0),
                            start_time=words[0]["start"] if words else 0.0,
                            end_time=words[-1]["end"] if words else 0.0,
                        ),
                        metadata={"model": self.model},
                    )
                )

        except Exception as e:
            print(f"Error handling WebSocket message: {e}")

        return responses

    async def aclose(self) -> None:
        """Cleanup resources.

        Cancels and awaits the listener task, then closes the WebSocket and the
        HTTP session. Each handle is reset to ``None`` afterwards.
        """
        if self._ws_task:
            self._ws_task.cancel()
            try:
                await self._ws_task
            except asyncio.CancelledError:
                pass
            self._ws_task = None

        if self._ws:
            await self._ws.close()
            self._ws = None

        if self._session:
            await self._session.close()
            self._session = None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Package version string; pyproject.toml's [tool.hatch.version] points at this file.
__version__ = "0.0.1"
|