videosdk-plugins-deepgram 0.0.31__tar.gz → 0.0.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-deepgram might be problematic.
- {videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/.gitignore +3 -2
- {videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/PKG-INFO +2 -2
- {videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/pyproject.toml +1 -1
- {videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/videosdk/plugins/deepgram/stt.py +40 -23
- videosdk_plugins_deepgram-0.0.32/videosdk/plugins/deepgram/version.py +1 -0
- videosdk_plugins_deepgram-0.0.31/videosdk/plugins/deepgram/version.py +0 -1
- {videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/README.md +0 -0
- {videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/videosdk/plugins/deepgram/__init__.py +0 -0
{videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/.gitignore

@@ -2,13 +2,12 @@ myenv/
 venv/
 env/
 __pycache__/
-
+.venv/
 .env
 .env.local
 test_env/
 dist/
 .DS_Store
-
 node_modules/
 credentials.json
 .Python
@@ -16,3 +15,5 @@ build/
 eggs/
 sdist/
 wheels/
+docs/
+agent-sdk-reference/
{videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-deepgram
-Version: 0.0.31
+Version: 0.0.32
 Summary: VideoSDK Agent Framework plugin for Deepgram
 Author: videosdk
 License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
-Requires-Dist: videosdk-agents>=0.0.31
+Requires-Dist: videosdk-agents>=0.0.32
 Description-Content-Type: text/markdown
 
 # VideoSDK Deepgram Plugin
{videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/pyproject.toml

@@ -20,7 +20,7 @@ classifiers = [
     "Topic :: Multimedia :: Video",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
-dependencies = ["videosdk-agents>=0.0.31"]
+dependencies = ["videosdk-agents>=0.0.32"]
 
 [tool.hatch.version]
 path = "videosdk/plugins/deepgram/version.py"
{videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/videosdk/plugins/deepgram/stt.py

@@ -11,6 +11,7 @@ import logging
 
 logger = logging.getLogger(__name__)
 
+
 class DeepgramSTT(BaseSTT):
     def __init__(
         self,
@@ -26,12 +27,27 @@ class DeepgramSTT(BaseSTT):
         filler_words: bool = True,
         base_url: str = "wss://api.deepgram.com/v1/listen",
     ) -> None:
+        """Initialize the Deepgram STT plugin
+
+        Args:
+            api_key (str | None, optional): Deepgram API key. Uses DEEPGRAM_API_KEY environment variable if not provided. Defaults to None.
+            model (str): The model to use for the STT plugin. Defaults to "nova-2".
+            language (str): The language to use for the STT plugin. Defaults to "en-US".
+            interim_results (bool): Whether to return interim results. Defaults to True.
+            punctuate (bool): Whether to add punctuation. Defaults to True.
+            smart_format (bool): Whether to use smart formatting. Defaults to True.
+            sample_rate (int): Sample rate to use for the STT plugin. Defaults to 48000.
+            endpointing (int): Endpointing threshold. Defaults to 50.
+            filler_words (bool): Whether to include filler words. Defaults to True.
+            base_url (str): The base URL to use for the STT plugin. Defaults to "wss://api.deepgram.com/v1/listen".
+        """
         super().__init__()
-
+
         self.api_key = api_key or os.getenv("DEEPGRAM_API_KEY")
         if not self.api_key:
-            raise ValueError(
-                "Deepgram API key must be provided either through api_key parameter or DEEPGRAM_API_KEY environment variable")
+            raise ValueError(
+                "Deepgram API key must be provided either through api_key parameter or DEEPGRAM_API_KEY environment variable")
+
         self.model = model
         self.language = language
         self.sample_rate = sample_rate
@@ -46,7 +62,7 @@ class DeepgramSTT(BaseSTT):
         self._ws_task: Optional[asyncio.Task] = None
         self._last_speech_event_time = 0.0
         self._previous_speech_event_time = 0.0
-
+
     async def process_audio(
         self,
         audio_frames: bytes,
@@ -54,11 +70,11 @@ class DeepgramSTT(BaseSTT):
         **kwargs: Any
     ) -> None:
         """Process audio frames and send to Deepgram's Streaming API"""
-
+
         if not self._ws:
             await self._connect_ws()
             self._ws_task = asyncio.create_task(self._listen_for_responses())
-
+
         try:
             await self._ws.send_bytes(audio_frames)
         except Exception as e:
@@ -75,7 +91,7 @@ class DeepgramSTT(BaseSTT):
         """Background task to listen for WebSocket responses"""
         if not self._ws:
             return
-
+
         try:
             async for msg in self._ws:
                 if msg.type == aiohttp.WSMsgType.TEXT:
@@ -86,7 +102,8 @@ class DeepgramSTT(BaseSTT):
                         await self._transcript_callback(response)
                 elif msg.type == aiohttp.WSMsgType.ERROR:
                     logger.error(f"WebSocket error: {self._ws.exception()}")
-                    self.emit("error", f"WebSocket error: {self._ws.exception()}")
+                    self.emit(
+                        "error", f"WebSocket error: {self._ws.exception()}")
                     break
         except Exception as e:
             logger.error(f"Error in WebSocket listener: {str(e)}")
@@ -95,13 +112,13 @@ class DeepgramSTT(BaseSTT):
             if self._ws:
                 await self._ws.close()
                 self._ws = None
-
+
     async def _connect_ws(self) -> None:
         """Establish WebSocket connection with Deepgram's Streaming API"""
-
+
         if not self._session:
             self._session = aiohttp.ClientSession()
-
+
         query_params = {
             "model": self.model,
             "language": self.language,
@@ -109,7 +126,7 @@ class DeepgramSTT(BaseSTT):
             "punctuate": str(self.punctuate).lower(),
             "smart_format": str(self.smart_format).lower(),
             "encoding": "linear16",
-            "sample_rate": str(self.sample_rate),
+            "sample_rate": str(self.sample_rate),
             "channels": 2,
             "endpointing": self.endpointing,
             "filler_words": str(self.filler_words).lower(),
@@ -119,22 +136,22 @@ class DeepgramSTT(BaseSTT):
         headers = {
             "Authorization": f"Token {self.api_key}",
         }
-
+
         ws_url = f"{self.base_url}?{urlencode(query_params)}"
-
+
         try:
             self._ws = await self._session.ws_connect(ws_url, headers=headers)
         except Exception as e:
             logger.error(f"Error connecting to WebSocket: {str(e)}")
             raise
-
+
     def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
         """Handle incoming WebSocket messages and generate STT responses"""
         responses = []
         try:
             if msg["type"] == "SpeechStarted":
                 current_time = time.time()
-
+
                 if self._last_speech_event_time == 0.0:
                     self._last_speech_event_time = current_time
                     return responses
@@ -144,17 +161,17 @@ class DeepgramSTT(BaseSTT):
 
                 self._previous_speech_event_time = self._last_speech_event_time
                 self._last_speech_event_time = current_time
-
+
             if msg["type"] == "Results":
                 channel = msg["channel"]
                 alternatives = channel["alternatives"]
-
+
                 if alternatives and len(alternatives) > 0:
                     alt = alternatives[0]
                     is_final = msg["is_final"]
                     if alt["transcript"] == "":
                         return responses
-
+
                     response = STTResponse(
                         event_type=SpeechEventType.FINAL if is_final else SpeechEventType.INTERIM,
                         data=SpeechData(
@@ -167,10 +184,10 @@ class DeepgramSTT(BaseSTT):
                         metadata={"model": self.model}
                     )
                     responses.append(response)
-
+
         except Exception as e:
             logger.error(f"Error handling WebSocket message: {str(e)}")
-
+
         return responses
 
     async def aclose(self) -> None:
@@ -182,11 +199,11 @@ class DeepgramSTT(BaseSTT):
             except asyncio.CancelledError:
                 pass
             self._ws_task = None
-
+
         if self._ws:
             await self._ws.close()
             self._ws = None
-
+
         if self._session:
             await self._session.close()
             self._session = None
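The new constructor docstring documents every parameter. As a quick illustration, a minimal usage sketch based on that docstring follows; the import path assumes DeepgramSTT is re-exported from the plugin's __init__.py, which this diff does not show, so treat it as an assumption.

import os

# Assumption: the class is re-exported from videosdk/plugins/deepgram/__init__.py.
from videosdk.plugins.deepgram import DeepgramSTT

# Instantiate with the defaults documented in the new docstring; api_key falls
# back to the DEEPGRAM_API_KEY environment variable when omitted.
stt = DeepgramSTT(
    api_key=os.getenv("DEEPGRAM_API_KEY"),
    model="nova-2",
    language="en-US",
    interim_results=True,
    punctuate=True,
    smart_format=True,
    sample_rate=48000,
    endpointing=50,
    filler_words=True,
)

# Raw linear16 audio bytes would then be streamed with
#     await stt.process_audio(pcm_bytes)
# and the WebSocket and session cleaned up with
#     await stt.aclose()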
videosdk_plugins_deepgram-0.0.32/videosdk/plugins/deepgram/version.py (added)

@@ -0,0 +1 @@
+__version__ = "0.0.32"
videosdk_plugins_deepgram-0.0.31/videosdk/plugins/deepgram/version.py (removed)

@@ -1 +0,0 @@
-__version__ = "0.0.31"
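For a quick post-upgrade check, one might read the bumped version at runtime; a minimal sketch, assuming the package is installed under the module path shown in the diff above:

from videosdk.plugins.deepgram.version import __version__

# Prints "0.0.32" once the new release is installed.
print(__version__)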
{videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/README.md: file without changes
{videosdk_plugins_deepgram-0.0.31 → videosdk_plugins_deepgram-0.0.32}/videosdk/plugins/deepgram/__init__.py: file without changes