videosdk-plugins-assemblyai 0.0.30__py3-none-any.whl → 0.0.31__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-assemblyai might be problematic. Click here for more details.
- videosdk/plugins/assemblyai/stt.py +242 -103
- videosdk/plugins/assemblyai/version.py +1 -1
- {videosdk_plugins_assemblyai-0.0.30.dist-info → videosdk_plugins_assemblyai-0.0.31.dist-info}/METADATA +3 -2
- videosdk_plugins_assemblyai-0.0.31.dist-info/RECORD +6 -0
- videosdk_plugins_assemblyai-0.0.30.dist-info/RECORD +0 -6
- {videosdk_plugins_assemblyai-0.0.30.dist-info → videosdk_plugins_assemblyai-0.0.31.dist-info}/WHEEL +0 -0
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
-
import
|
|
4
|
+
import json
|
|
5
5
|
import os
|
|
6
|
-
import
|
|
7
|
-
from
|
|
6
|
+
from typing import Any, Optional
|
|
7
|
+
from urllib.parse import urlencode
|
|
8
|
+
import logging
|
|
8
9
|
|
|
9
|
-
import aiohttp
|
|
10
10
|
import numpy as np
|
|
11
|
-
|
|
11
|
+
import aiohttp
|
|
12
|
+
from videosdk.agents import STT as BaseSTT, STTResponse, SpeechData, SpeechEventType, global_event_emitter
|
|
12
13
|
|
|
13
14
|
try:
|
|
14
15
|
from scipy import signal
|
|
@@ -16,134 +17,272 @@ try:
|
|
|
16
17
|
except ImportError:
|
|
17
18
|
SCIPY_AVAILABLE = False
|
|
18
19
|
|
|
19
|
-
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
20
21
|
|
|
21
|
-
class AssemblyAISTT(
|
|
22
|
+
class AssemblyAISTT(BaseSTT):
|
|
22
23
|
"""
|
|
23
|
-
VideoSDK Agent Framework STT plugin for AssemblyAI.
|
|
24
|
+
VideoSDK Agent Framework STT plugin for AssemblyAI Streaming API.
|
|
25
|
+
Real-time speech-to-text using WebSocket connection.
|
|
24
26
|
"""
|
|
25
27
|
|
|
26
28
|
def __init__(
|
|
27
29
|
self,
|
|
28
30
|
*,
|
|
29
31
|
api_key: str | None = None,
|
|
30
|
-
language_code: str = "en_us",
|
|
31
32
|
input_sample_rate: int = 48000,
|
|
32
33
|
target_sample_rate: int = 16000,
|
|
33
|
-
|
|
34
|
-
|
|
34
|
+
format_turns: bool = True,
|
|
35
|
+
word_boost: list[str] | None = None,
|
|
36
|
+
end_of_turn_confidence_threshold: float = 0.5,
|
|
37
|
+
min_end_of_turn_silence_when_confident: int = 800,
|
|
38
|
+
max_turn_silence: int = 2000,
|
|
35
39
|
) -> None:
|
|
36
40
|
super().__init__()
|
|
41
|
+
|
|
37
42
|
if not SCIPY_AVAILABLE:
|
|
38
43
|
raise ImportError("scipy is not installed. Please install it with 'pip install scipy'")
|
|
39
44
|
|
|
40
45
|
self.api_key = api_key or os.getenv("ASSEMBLYAI_API_KEY")
|
|
41
46
|
if not self.api_key:
|
|
42
|
-
raise ValueError(
|
|
43
|
-
|
|
44
|
-
|
|
47
|
+
raise ValueError(
|
|
48
|
+
"AssemblyAI API key must be provided either through the 'api_key' parameter "
|
|
49
|
+
"or the 'ASSEMBLYAI_API_KEY' environment variable."
|
|
50
|
+
)
|
|
45
51
|
self.input_sample_rate = input_sample_rate
|
|
46
52
|
self.target_sample_rate = target_sample_rate
|
|
47
|
-
self.
|
|
48
|
-
self.
|
|
49
|
-
|
|
50
|
-
self.
|
|
51
|
-
self.
|
|
52
|
-
self._is_speaking = False
|
|
53
|
-
self._silence_frames = 0
|
|
54
|
-
self._lock = asyncio.Lock()
|
|
53
|
+
self.format_turns = format_turns
|
|
54
|
+
self.word_boost = word_boost or []
|
|
55
|
+
self.end_of_turn_confidence_threshold = end_of_turn_confidence_threshold
|
|
56
|
+
self.min_end_of_turn_silence_when_confident = min_end_of_turn_silence_when_confident
|
|
57
|
+
self.max_turn_silence = max_turn_silence
|
|
55
58
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
+
connection_params = {
|
|
60
|
+
"sample_rate": self.target_sample_rate,
|
|
61
|
+
"format_turns": self.format_turns,
|
|
62
|
+
}
|
|
63
|
+
|
|
59
64
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
self._is_speaking = False
|
|
73
|
-
self._silence_frames = 0
|
|
65
|
+
if self.end_of_turn_confidence_threshold != 0.7:
|
|
66
|
+
connection_params["end_of_turn_confidence_threshold"] = self.end_of_turn_confidence_threshold
|
|
67
|
+
if self.min_end_of_turn_silence_when_confident != 1500:
|
|
68
|
+
connection_params["min_end_of_turn_silence_when_confident"] = self.min_end_of_turn_silence_when_confident
|
|
69
|
+
if self.max_turn_silence != 3000:
|
|
70
|
+
connection_params["max_turn_silence"] = self.max_turn_silence
|
|
71
|
+
|
|
72
|
+
if self.word_boost:
|
|
73
|
+
connection_params["word_boost"] = json.dumps(self.word_boost)
|
|
74
|
+
|
|
75
|
+
self.ws_url = f"wss://streaming.assemblyai.com/v3/ws?{urlencode(connection_params)}"
|
|
76
|
+
logger.info(f"[AssemblyAI] WebSocket URL: {self.ws_url}")
|
|
74
77
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
+
self._session: Optional[aiohttp.ClientSession] = None
|
|
79
|
+
self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
|
|
80
|
+
self._ws_task: Optional[asyncio.Task] = None
|
|
81
|
+
|
|
82
|
+
self._stream_buffer = bytearray()
|
|
83
|
+
self._target_chunk_size = int(0.1 * self.target_sample_rate * 2)
|
|
84
|
+
self._min_chunk_size = int(0.05 * self.target_sample_rate * 2)
|
|
85
|
+
|
|
86
|
+
self._last_speech_event_time = 0.0
|
|
87
|
+
self._last_transcript = ""
|
|
88
|
+
self._is_speaking = False
|
|
78
89
|
|
|
79
|
-
async def
|
|
80
|
-
|
|
81
|
-
|
|
90
|
+
async def process_audio(
|
|
91
|
+
self,
|
|
92
|
+
audio_frames: bytes,
|
|
93
|
+
**kwargs: Any
|
|
94
|
+
) -> None:
|
|
95
|
+
"""Process audio frames and send to AssemblyAI's Streaming API"""
|
|
96
|
+
|
|
97
|
+
if not self._ws:
|
|
98
|
+
await self._connect_ws()
|
|
99
|
+
self._ws_task = asyncio.create_task(self._listen_for_responses())
|
|
100
|
+
|
|
101
|
+
try:
|
|
102
|
+
resampled_audio = self._resample_audio(audio_frames)
|
|
103
|
+
if not resampled_audio:
|
|
82
104
|
return
|
|
83
|
-
|
|
84
|
-
self.
|
|
105
|
+
|
|
106
|
+
self._stream_buffer.extend(resampled_audio)
|
|
107
|
+
|
|
108
|
+
while len(self._stream_buffer) >= self._target_chunk_size:
|
|
109
|
+
chunk_to_send = bytes(self._stream_buffer[:self._target_chunk_size])
|
|
110
|
+
self._stream_buffer = self._stream_buffer[self._target_chunk_size:]
|
|
111
|
+
|
|
112
|
+
await self._ws.send_bytes(chunk_to_send)
|
|
113
|
+
|
|
114
|
+
except Exception as e:
|
|
115
|
+
logger.error(f"Error in process_audio: {str(e)}")
|
|
116
|
+
self.emit("error", str(e))
|
|
117
|
+
if self._ws:
|
|
118
|
+
await self._ws.close()
|
|
119
|
+
self._ws = None
|
|
120
|
+
if self._ws_task:
|
|
121
|
+
self._ws_task.cancel()
|
|
122
|
+
self._ws_task = None
|
|
123
|
+
|
|
124
|
+
async def _listen_for_responses(self) -> None:
|
|
125
|
+
"""Background task to listen for WebSocket responses"""
|
|
126
|
+
if not self._ws:
|
|
127
|
+
return
|
|
128
|
+
|
|
129
|
+
try:
|
|
130
|
+
async for msg in self._ws:
|
|
131
|
+
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
132
|
+
data = msg.json()
|
|
133
|
+
responses = self._handle_ws_message(data)
|
|
134
|
+
for response in responses:
|
|
135
|
+
if self._transcript_callback:
|
|
136
|
+
await self._transcript_callback(response)
|
|
137
|
+
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
138
|
+
logger.error(f"WebSocket error: {self._ws.exception()}")
|
|
139
|
+
self.emit("error", f"WebSocket error: {self._ws.exception()}")
|
|
140
|
+
break
|
|
141
|
+
except Exception as e:
|
|
142
|
+
logger.error(f"Error in WebSocket listener: {str(e)}")
|
|
143
|
+
self.emit("error", f"Error in WebSocket listener: {str(e)}")
|
|
144
|
+
finally:
|
|
145
|
+
if self._ws:
|
|
146
|
+
await self._ws.close()
|
|
147
|
+
self._ws = None
|
|
148
|
+
|
|
149
|
+
async def _connect_ws(self) -> None:
|
|
150
|
+
"""Establish WebSocket connection with AssemblyAI's Streaming API"""
|
|
151
|
+
|
|
152
|
+
if not self._session:
|
|
153
|
+
self._session = aiohttp.ClientSession()
|
|
154
|
+
|
|
155
|
+
headers = {
|
|
156
|
+
"Authorization": self.api_key,
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
try:
|
|
160
|
+
self._ws = await self._session.ws_connect(self.ws_url, headers=headers)
|
|
161
|
+
logger.info("[AssemblyAI] WebSocket connection opened")
|
|
162
|
+
except Exception as e:
|
|
163
|
+
logger.error(f"Error connecting to WebSocket: {str(e)}")
|
|
164
|
+
raise
|
|
165
|
+
|
|
166
|
+
def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
|
|
167
|
+
"""Handle incoming WebSocket messages and generate STT responses"""
|
|
168
|
+
responses = []
|
|
85
169
|
|
|
86
170
|
try:
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
171
|
+
msg_type = msg.get('type')
|
|
172
|
+
logger.info(f"[AssemblyAI] Message type: {msg_type}")
|
|
173
|
+
|
|
174
|
+
if msg_type == "Begin":
|
|
175
|
+
session_id = msg.get('id')
|
|
176
|
+
logger.info(f"[AssemblyAI] Session began: ID={session_id}")
|
|
177
|
+
|
|
178
|
+
elif msg_type == "Turn":
|
|
179
|
+
transcript = msg.get('transcript', '')
|
|
180
|
+
formatted = msg.get('turn_is_formatted', False)
|
|
181
|
+
confidence = msg.get('confidence', 1.0)
|
|
182
|
+
|
|
183
|
+
if transcript and transcript.strip():
|
|
184
|
+
self._last_transcript = transcript.strip()
|
|
185
|
+
|
|
186
|
+
event_type = SpeechEventType.FINAL if formatted else SpeechEventType.INTERIM
|
|
187
|
+
|
|
188
|
+
response = STTResponse(
|
|
189
|
+
event_type=event_type,
|
|
190
|
+
data=SpeechData(
|
|
191
|
+
text=transcript.strip(),
|
|
192
|
+
confidence=confidence
|
|
193
|
+
)
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
responses.append(response)
|
|
197
|
+
|
|
198
|
+
if not self._is_speaking:
|
|
199
|
+
self._is_speaking = True
|
|
200
|
+
global_event_emitter.emit("speech_started")
|
|
201
|
+
|
|
202
|
+
if formatted:
|
|
203
|
+
self._is_speaking = False
|
|
204
|
+
self._last_transcript = ""
|
|
205
|
+
|
|
206
|
+
elif msg_type == "Termination":
|
|
207
|
+
if self._last_transcript and self._is_speaking:
|
|
208
|
+
final_response = STTResponse(
|
|
209
|
+
event_type=SpeechEventType.FINAL,
|
|
210
|
+
data=SpeechData(
|
|
211
|
+
text=self._last_transcript,
|
|
212
|
+
confidence=1.0
|
|
213
|
+
)
|
|
214
|
+
)
|
|
215
|
+
responses.append(final_response)
|
|
216
|
+
self._last_transcript = ""
|
|
217
|
+
self._is_speaking = False
|
|
218
|
+
|
|
219
|
+
elif msg_type == "Error":
|
|
220
|
+
error_msg = msg.get('error', 'Unknown error')
|
|
221
|
+
logger.error(f"AssemblyAI Error: {error_msg}")
|
|
119
222
|
|
|
120
223
|
except Exception as e:
|
|
121
|
-
|
|
122
|
-
|
|
224
|
+
logger.error(f"Error handling WebSocket message: {str(e)}")
|
|
225
|
+
|
|
226
|
+
return responses
|
|
123
227
|
|
|
124
228
|
def _resample_audio(self, audio_bytes: bytes) -> bytes:
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
229
|
+
"""Resample audio from input sample rate to target sample rate and convert to mono."""
|
|
230
|
+
try:
|
|
231
|
+
if not audio_bytes:
|
|
232
|
+
return b''
|
|
233
|
+
|
|
234
|
+
raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
|
|
235
|
+
if raw_audio.size == 0:
|
|
236
|
+
return b''
|
|
237
|
+
|
|
238
|
+
if raw_audio.size % 2 == 0:
|
|
239
|
+
stereo_audio = raw_audio.reshape(-1, 2)
|
|
240
|
+
mono_audio = stereo_audio.astype(np.float32).mean(axis=1)
|
|
241
|
+
else:
|
|
242
|
+
mono_audio = raw_audio.astype(np.float32)
|
|
243
|
+
|
|
244
|
+
if self.input_sample_rate != self.target_sample_rate:
|
|
245
|
+
target_length = int(len(mono_audio) * self.target_sample_rate / self.input_sample_rate)
|
|
246
|
+
resampled_data = signal.resample(mono_audio, target_length)
|
|
247
|
+
else:
|
|
248
|
+
resampled_data = mono_audio
|
|
249
|
+
|
|
250
|
+
resampled_data = np.clip(resampled_data, -32767, 32767)
|
|
251
|
+
return resampled_data.astype(np.int16).tobytes()
|
|
252
|
+
|
|
253
|
+
except Exception as e:
|
|
254
|
+
logger.error(f"Error resampling audio: {e}")
|
|
255
|
+
return b''
|
|
142
256
|
|
|
143
257
|
async def aclose(self) -> None:
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
258
|
+
"""Cleanup resources"""
|
|
259
|
+
|
|
260
|
+
if len(self._stream_buffer) >= self._min_chunk_size and self._ws:
|
|
261
|
+
try:
|
|
262
|
+
final_chunk = bytes(self._stream_buffer)
|
|
263
|
+
await self._ws.send_bytes(final_chunk)
|
|
264
|
+
except Exception as e:
|
|
265
|
+
logger.error(f"Error sending final audio: {e}")
|
|
266
|
+
|
|
267
|
+
if self._ws:
|
|
268
|
+
try:
|
|
269
|
+
await self._ws.send_str(json.dumps({"type": "Terminate"}))
|
|
270
|
+
await asyncio.sleep(0.5)
|
|
271
|
+
except Exception as e:
|
|
272
|
+
logger.error(f"Error sending termination: {e}")
|
|
147
273
|
|
|
148
|
-
if self.
|
|
149
|
-
|
|
274
|
+
if self._ws_task:
|
|
275
|
+
self._ws_task.cancel()
|
|
276
|
+
try:
|
|
277
|
+
await self._ws_task
|
|
278
|
+
except asyncio.CancelledError:
|
|
279
|
+
pass
|
|
280
|
+
self._ws_task = None
|
|
281
|
+
|
|
282
|
+
if self._ws:
|
|
283
|
+
await self._ws.close()
|
|
284
|
+
self._ws = None
|
|
285
|
+
|
|
286
|
+
if self._session:
|
|
287
|
+
await self._session.close()
|
|
288
|
+
self._session = None
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.
|
|
1
|
+
__version__ = "0.0.31"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videosdk-plugins-assemblyai
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.31
|
|
4
4
|
Summary: VideoSDK Agent Framework plugin for AssemblyAI
|
|
5
5
|
Author: videosdk
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,7 +12,8 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
12
12
|
Classifier: Topic :: Multimedia :: Video
|
|
13
13
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
14
|
Requires-Python: >=3.11
|
|
15
|
-
Requires-Dist:
|
|
15
|
+
Requires-Dist: aiohttp
|
|
16
|
+
Requires-Dist: videosdk-agents>=0.0.31
|
|
16
17
|
Description-Content-Type: text/markdown
|
|
17
18
|
|
|
18
19
|
# VideoSDK Assembly AI Plugin
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
videosdk/plugins/assemblyai/__init__.py,sha256=T4deawBZKrOiGrKFcfksus-wmb5rF5KY7_p6QBRd4QE,59
|
|
2
|
+
videosdk/plugins/assemblyai/stt.py,sha256=_P2IIObmaANmBDxzc7TVxXa9zOEv-Aax5ijNEUzNWvM,11005
|
|
3
|
+
videosdk/plugins/assemblyai/version.py,sha256=YRrMDApG1V6fNZdI4BA631DmsqtamSXgI1yu0tnt4h0,23
|
|
4
|
+
videosdk_plugins_assemblyai-0.0.31.dist-info/METADATA,sha256=TnqW_xMOuHWbty0ZDLM3n5dq87YdTjbBhNEnPgeV2rg,804
|
|
5
|
+
videosdk_plugins_assemblyai-0.0.31.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
+
videosdk_plugins_assemblyai-0.0.31.dist-info/RECORD,,
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
videosdk/plugins/assemblyai/__init__.py,sha256=T4deawBZKrOiGrKFcfksus-wmb5rF5KY7_p6QBRd4QE,59
|
|
2
|
-
videosdk/plugins/assemblyai/stt.py,sha256=4qBQSn0gvQ4ET0ilFW6lry7NEOd_Sc1yqG6v4op6u7M,6247
|
|
3
|
-
videosdk/plugins/assemblyai/version.py,sha256=8ZeepqkW4DvpVeNm92mx0tIzgvVevS4NKWkTXXHuXNY,23
|
|
4
|
-
videosdk_plugins_assemblyai-0.0.30.dist-info/METADATA,sha256=1pk6-Le16OuZ_8ayfLQTQYg9nlP8HHn3bmlwiukI_A0,781
|
|
5
|
-
videosdk_plugins_assemblyai-0.0.30.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
-
videosdk_plugins_assemblyai-0.0.30.dist-info/RECORD,,
|
{videosdk_plugins_assemblyai-0.0.30.dist-info → videosdk_plugins_assemblyai-0.0.31.dist-info}/WHEEL
RENAMED
|
File without changes
|