videosdk-plugins-assemblyai 0.0.30__py3-none-any.whl → 0.0.32__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of videosdk-plugins-assemblyai has been flagged as potentially problematic.

videosdk/plugins/assemblyai/stt.py

@@ -1,14 +1,15 @@
 from __future__ import annotations
 
 import asyncio
-import io
+import json
 import os
-import wave
-from typing import Any
+from typing import Any, Optional
+from urllib.parse import urlencode
+import logging
 
-import aiohttp
 import numpy as np
-from videosdk.agents import STT, STTResponse, SpeechData, SpeechEventType, global_event_emitter
+import aiohttp
+from videosdk.agents import STT as BaseSTT, STTResponse, SpeechData, SpeechEventType, global_event_emitter
 
 try:
     from scipy import signal
@@ -16,134 +17,280 @@ try:
 except ImportError:
     SCIPY_AVAILABLE = False
 
-ASSEMBLYAI_API_URL = "https://api.assemblyai.com/v2"
+logger = logging.getLogger(__name__)
 
-class AssemblyAISTT(STT):
-    """
-    VideoSDK Agent Framework STT plugin for AssemblyAI.
-    """
+class AssemblyAISTT(BaseSTT):
 
     def __init__(
         self,
         *,
         api_key: str | None = None,
-        language_code: str = "en_us",
         input_sample_rate: int = 48000,
         target_sample_rate: int = 16000,
-        silence_threshold: float = 0.015,
-        silence_duration: float = 0.8,
+        format_turns: bool = True,
+        word_boost: list[str] | None = None,
+        end_of_turn_confidence_threshold: float = 0.5,
+        min_end_of_turn_silence_when_confident: int = 800,
+        max_turn_silence: int = 2000,
     ) -> None:
+        """Initialize the AssemblyAI STT plugin.
+
+        Args:
+            api_key (str | None, optional): AssemblyAI API key. Uses ASSEMBLYAI_API_KEY environment variable if not provided. Defaults to None.
+            input_sample_rate (int): The input sample rate to use for the STT plugin. Defaults to 48000.
+            target_sample_rate (int): The target sample rate to use for the STT plugin. Defaults to 16000.
+            format_turns (bool): Whether to format turns. Defaults to True.
+            word_boost (list[str] | None, optional): The word boost to use for the STT plugin. Defaults to None.
+            end_of_turn_confidence_threshold (float): The end of turn confidence threshold to use for the STT plugin. Defaults to 0.5.
+            min_end_of_turn_silence_when_confident (int): The minimum end of turn silence when confident to use for the STT plugin. Defaults to 800.
+            max_turn_silence (int): The maximum turn silence to use for the STT plugin. Defaults to 2000.
+        """
         super().__init__()
+
         if not SCIPY_AVAILABLE:
             raise ImportError("scipy is not installed. Please install it with 'pip install scipy'")
 
         self.api_key = api_key or os.getenv("ASSEMBLYAI_API_KEY")
         if not self.api_key:
-            raise ValueError("AssemblyAI API key must be provided either through the 'api_key' parameter or the 'ASSEMBLYAI_API_KEY' environment variable.")
-
-        self.language_code = language_code
+            raise ValueError(
+                "AssemblyAI API key must be provided either through the 'api_key' parameter "
+                "or the 'ASSEMBLYAI_API_KEY' environment variable."
+            )
         self.input_sample_rate = input_sample_rate
        self.target_sample_rate = target_sample_rate
-        self.silence_threshold_bytes = int(silence_threshold * 32767)
-        self.silence_duration_frames = int(silence_duration * self.input_sample_rate)
-
-        self._session = aiohttp.ClientSession(headers={"Authorization": self.api_key})
-        self._audio_buffer = bytearray()
-        self._is_speaking = False
-        self._silence_frames = 0
-        self._lock = asyncio.Lock()
+        self.format_turns = format_turns
+        self.word_boost = word_boost or []
+        self.end_of_turn_confidence_threshold = end_of_turn_confidence_threshold
+        self.min_end_of_turn_silence_when_confident = min_end_of_turn_silence_when_confident
+        self.max_turn_silence = max_turn_silence
 
-    async def process_audio(self, audio_frames: bytes, **kwargs: Any) -> None:
-        async with self._lock:
-            is_silent_chunk = self._is_silent(audio_frames)
+        connection_params = {
+            "sample_rate": self.target_sample_rate,
+            "format_turns": self.format_turns,
+        }
+
 
-            if not is_silent_chunk:
-                if not self._is_speaking:
-                    self._is_speaking = True
-                    global_event_emitter.emit("speech_started")
-                self._audio_buffer.extend(audio_frames)
-                self._silence_frames = 0
-            else:
-                if self._is_speaking:
-                    self._silence_frames += len(audio_frames) // 4
-                    if self._silence_frames > self.silence_duration_frames:
-                        global_event_emitter.emit("speech_stopped")
-                        asyncio.create_task(self._transcribe_buffer())
-                        self._is_speaking = False
-                        self._silence_frames = 0
+        if self.end_of_turn_confidence_threshold != 0.7:
+            connection_params["end_of_turn_confidence_threshold"] = self.end_of_turn_confidence_threshold
+        if self.min_end_of_turn_silence_when_confident != 1500:
+            connection_params["min_end_of_turn_silence_when_confident"] = self.min_end_of_turn_silence_when_confident
+        if self.max_turn_silence != 3000:
+            connection_params["max_turn_silence"] = self.max_turn_silence
+
+        if self.word_boost:
+            connection_params["word_boost"] = json.dumps(self.word_boost)
 
-    def _is_silent(self, audio_chunk: bytes) -> bool:
-        audio_data = np.frombuffer(audio_chunk, dtype=np.int16)
-        return np.max(np.abs(audio_data)) < self.silence_threshold_bytes
+        self.ws_url = f"wss://streaming.assemblyai.com/v3/ws?{urlencode(connection_params)}"
+        logger.info(f"[AssemblyAI] WebSocket URL: {self.ws_url}")
 
-    async def _transcribe_buffer(self):
-        async with self._lock:
-            if not self._audio_buffer:
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._ws_task: Optional[asyncio.Task] = None
+
+        self._stream_buffer = bytearray()
+        self._target_chunk_size = int(0.1 * self.target_sample_rate * 2)
+        self._min_chunk_size = int(0.05 * self.target_sample_rate * 2)
+
+        self._last_speech_event_time = 0.0
+        self._last_transcript = ""
+        self._is_speaking = False
+
+    async def process_audio(
+        self,
+        audio_frames: bytes,
+        **kwargs: Any
+    ) -> None:
+        """Process audio frames and send to AssemblyAI's Streaming API"""
+
+        if not self._ws:
+            await self._connect_ws()
+            self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        try:
+            resampled_audio = self._resample_audio(audio_frames)
+            if not resampled_audio:
                 return
-            audio_to_send = self._audio_buffer
-            self._audio_buffer = bytearray()
+
+            self._stream_buffer.extend(resampled_audio)
+
+            while len(self._stream_buffer) >= self._target_chunk_size:
+                chunk_to_send = bytes(self._stream_buffer[:self._target_chunk_size])
+                self._stream_buffer = self._stream_buffer[self._target_chunk_size:]
+
+                await self._ws.send_bytes(chunk_to_send)
+
+        except Exception as e:
+            logger.error(f"Error in process_audio: {str(e)}")
+            self.emit("error", str(e))
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+            if self._ws_task:
+                self._ws_task.cancel()
+                self._ws_task = None
+
+    async def _listen_for_responses(self) -> None:
+        """Background task to listen for WebSocket responses"""
+        if not self._ws:
+            return
+
+        try:
+            async for msg in self._ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = msg.json()
+                    responses = self._handle_ws_message(data)
+                    for response in responses:
+                        if self._transcript_callback:
+                            await self._transcript_callback(response)
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    logger.error(f"WebSocket error: {self._ws.exception()}")
+                    self.emit("error", f"WebSocket error: {self._ws.exception()}")
+                    break
+        except Exception as e:
+            logger.error(f"Error in WebSocket listener: {str(e)}")
+            self.emit("error", f"Error in WebSocket listener: {str(e)}")
+        finally:
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+
+    async def _connect_ws(self) -> None:
+        """Establish WebSocket connection with AssemblyAI's Streaming API"""
+
+        if not self._session:
+            self._session = aiohttp.ClientSession()
+
+        headers = {
+            "Authorization": self.api_key,
+        }
+
+        try:
+            self._ws = await self._session.ws_connect(self.ws_url, headers=headers)
+            logger.info("[AssemblyAI] WebSocket connection opened")
+        except Exception as e:
+            logger.error(f"Error connecting to WebSocket: {str(e)}")
+            raise
+
+    def _handle_ws_message(self, msg: dict) -> list[STTResponse]:
+        """Handle incoming WebSocket messages and generate STT responses"""
+        responses = []
 
         try:
-            resampled_audio_bytes = self._resample_audio(audio_to_send)
-            wav_audio = self._create_wav_in_memory(resampled_audio_bytes)
-
-            upload_url = f"{ASSEMBLYAI_API_URL}/upload"
-            async with self._session.post(upload_url, data=wav_audio) as response:
-                response.raise_for_status()
-                upload_data = await response.json()
-                audio_url = upload_data["upload_url"]
-
-            transcript_url = f"{ASSEMBLYAI_API_URL}/transcript"
-            payload = {"audio_url": audio_url, "language_code": self.language_code}
-            async with self._session.post(transcript_url, json=payload) as response:
-                response.raise_for_status()
-                transcript_data = await response.json()
-                transcript_id = transcript_data["id"]
-
-            poll_url = f"{ASSEMBLYAI_API_URL}/transcript/{transcript_id}"
-            while True:
-                await asyncio.sleep(1)
-                async with self._session.get(poll_url) as response:
-                    response.raise_for_status()
-                    result = await response.json()
-                    if result["status"] == "completed":
-                        if result.get("text") and self._transcript_callback:
-                            event = STTResponse(
-                                event_type=SpeechEventType.FINAL,
-                                data=SpeechData(text=result["text"], language=self.language_code, confidence=result.get("confidence", 1.0))
-                            )
-                            await self._transcript_callback(event)
-                        break
-                    elif result["status"] == "error":
-                        raise Exception(f"AssemblyAI transcription failed: {result.get('error')}")
+            msg_type = msg.get('type')
+            logger.info(f"[AssemblyAI] Message type: {msg_type}")
+
+            if msg_type == "Begin":
+                session_id = msg.get('id')
+                logger.info(f"[AssemblyAI] Session began: ID={session_id}")
+
+            elif msg_type == "Turn":
+                transcript = msg.get('transcript', '')
+                formatted = msg.get('turn_is_formatted', False)
+                confidence = msg.get('confidence', 1.0)
+
+                if transcript and transcript.strip():
+                    self._last_transcript = transcript.strip()
+
+                    event_type = SpeechEventType.FINAL if formatted else SpeechEventType.INTERIM
+
+                    response = STTResponse(
+                        event_type=event_type,
+                        data=SpeechData(
+                            text=transcript.strip(),
+                            confidence=confidence
+                        )
+                    )
+
+                    responses.append(response)
+
+                    if not self._is_speaking:
+                        self._is_speaking = True
+                        global_event_emitter.emit("speech_started")
+
+                    if formatted:
+                        self._is_speaking = False
+                        self._last_transcript = ""
+
+            elif msg_type == "Termination":
+                if self._last_transcript and self._is_speaking:
+                    final_response = STTResponse(
+                        event_type=SpeechEventType.FINAL,
+                        data=SpeechData(
+                            text=self._last_transcript,
+                            confidence=1.0
+                        )
+                    )
+                    responses.append(final_response)
+                    self._last_transcript = ""
+                    self._is_speaking = False
+
+            elif msg_type == "Error":
+                error_msg = msg.get('error', 'Unknown error')
+                logger.error(f"AssemblyAI Error: {error_msg}")
 
         except Exception as e:
-            print(f"!!! ASSEMBLYAI PLUGIN FATAL ERROR: {e} ({type(e).__name__}) !!!")
-            self.emit("error", f"AssemblyAI transcription error: {e}")
+            logger.error(f"Error handling WebSocket message: {str(e)}")
+
+        return responses
 
     def _resample_audio(self, audio_bytes: bytes) -> bytes:
-        raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
-        if raw_audio.size == 0: return b''
-        stereo_audio = raw_audio.reshape(-1, 2)
-        mono_audio = stereo_audio.astype(np.float32).mean(axis=1)
-        resampled_data = signal.resample(mono_audio, int(len(mono_audio) * self.target_sample_rate / self.input_sample_rate))
-        return resampled_data.astype(np.int16).tobytes()
-
-    def _create_wav_in_memory(self, pcm_data: bytes) -> io.BytesIO:
-        """Creates a WAV file in memory from raw PCM data."""
-        wav_buffer = io.BytesIO()
-        with wave.open(wav_buffer, 'wb') as wf:
-            wf.setnchannels(1) # Mono
-            wf.setsampwidth(2) # 16-bit
-            wf.setframerate(self.target_sample_rate)
-            wf.writeframes(pcm_data)
-        wav_buffer.seek(0)
-        return wav_buffer
+        """Resample audio from input sample rate to target sample rate and convert to mono."""
+        try:
+            if not audio_bytes:
+                return b''
+
+            raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
+            if raw_audio.size == 0:
+                return b''
+
+            if raw_audio.size % 2 == 0:
+                stereo_audio = raw_audio.reshape(-1, 2)
+                mono_audio = stereo_audio.astype(np.float32).mean(axis=1)
+            else:
+                mono_audio = raw_audio.astype(np.float32)
+
+            if self.input_sample_rate != self.target_sample_rate:
+                target_length = int(len(mono_audio) * self.target_sample_rate / self.input_sample_rate)
+                resampled_data = signal.resample(mono_audio, target_length)
+            else:
+                resampled_data = mono_audio
+
+            resampled_data = np.clip(resampled_data, -32767, 32767)
+            return resampled_data.astype(np.int16).tobytes()
+
+        except Exception as e:
+            logger.error(f"Error resampling audio: {e}")
+            return b''
 
     async def aclose(self) -> None:
-        if self._is_speaking and self._audio_buffer:
-            await self._transcribe_buffer()
-            await asyncio.sleep(1)
+        """Cleanup resources"""
+
+        if len(self._stream_buffer) >= self._min_chunk_size and self._ws:
+            try:
+                final_chunk = bytes(self._stream_buffer)
+                await self._ws.send_bytes(final_chunk)
+            except Exception as e:
+                logger.error(f"Error sending final audio: {e}")
+
+        if self._ws:
+            try:
+                await self._ws.send_str(json.dumps({"type": "Terminate"}))
+                await asyncio.sleep(0.5)
+            except Exception as e:
+                logger.error(f"Error sending termination: {e}")
 
-        if self._session and not self._session.closed:
-            await self._session.close()
+        if self._ws_task:
+            self._ws_task.cancel()
+            try:
+                await self._ws_task
+            except asyncio.CancelledError:
+                pass
+            self._ws_task = None
+
+        if self._ws:
+            await self._ws.close()
+            self._ws = None
+
+        if self._session:
+            await self._session.close()
+            self._session = None
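
For orientation, a minimal driver sketch for the new streaming flow follows. It is not part of the package: it assumes the class is re-exported as videosdk.plugins.assemblyai.AssemblyAISTT (the wheel's __init__.py suggests this), it sets the private _transcript_callback attribute directly for illustration (in an agent pipeline the framework owns that wiring), and it feeds 100 ms chunks of silent 48 kHz stereo int16 PCM in place of real microphone audio.

import asyncio
import os

from videosdk.agents import STTResponse
from videosdk.plugins.assemblyai import AssemblyAISTT

async def main() -> None:
    # Construct the plugin; the key can also come from ASSEMBLYAI_API_KEY.
    stt = AssemblyAISTT(
        api_key=os.getenv("ASSEMBLYAI_API_KEY"),
        input_sample_rate=48000,   # stereo int16 PCM fed to process_audio
        target_sample_rate=16000,  # mono PCM streamed over the WebSocket
        format_turns=True,         # formatted "Turn" messages become FINAL responses
    )

    async def on_transcript(response: STTResponse) -> None:
        # INTERIM responses arrive per Turn message; FINAL once the turn is formatted.
        print(response.event_type, response.data.text)

    # Illustration only: normally the agent framework registers this callback.
    stt._transcript_callback = on_transcript

    # One second of silent 48 kHz stereo int16 audio, sent in 100 ms chunks.
    chunk = b"\x00\x00" * 4800 * 2
    for _ in range(10):
        await stt.process_audio(chunk)

    await stt.aclose()

asyncio.run(main())

Under these assumptions, each 100 ms input chunk resamples to exactly one 3200-byte target chunk (0.1 s * 16000 Hz * 2 bytes), so every process_audio call drains the stream buffer once.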
videosdk/plugins/assemblyai/version.py

@@ -1 +1 @@
-__version__ = "0.0.30"
+__version__ = "0.0.32"
dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-assemblyai
-Version: 0.0.30
+Version: 0.0.32
 Summary: VideoSDK Agent Framework plugin for AssemblyAI
 Author: videosdk
 License-Expression: Apache-2.0
@@ -12,7 +12,8 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
-Requires-Dist: videosdk-agents>=0.0.30
+Requires-Dist: aiohttp
+Requires-Dist: videosdk-agents>=0.0.32
 Description-Content-Type: text/markdown
 
 # VideoSDK Assembly AI Plugin
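
After upgrading, the new dependency pins can be confirmed with the standard library; a small sketch, assuming the 0.0.32 wheel is installed:

from importlib.metadata import requires, version

# Expected: 0.0.32, and a requirement list that now includes aiohttp
# alongside videosdk-agents>=0.0.32.
print(version("videosdk-plugins-assemblyai"))
print(requires("videosdk-plugins-assemblyai"))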
videosdk_plugins_assemblyai-0.0.32.dist-info/RECORD

@@ -0,0 +1,6 @@
+videosdk/plugins/assemblyai/__init__.py,sha256=T4deawBZKrOiGrKFcfksus-wmb5rF5KY7_p6QBRd4QE,59
+videosdk/plugins/assemblyai/stt.py,sha256=kdUsBJtc0utn1bpFWU2Y7szv2PdGvx5HE75mHWQc1Ao,11873
+videosdk/plugins/assemblyai/version.py,sha256=WKDnjJM7gYpD9fIwhK2qAZICJAT2ndquQ6VcOar074Y,23
+videosdk_plugins_assemblyai-0.0.32.dist-info/METADATA,sha256=xPOmbeUdbO6dokBwmAlR605XYcM_X4t2-XFR7sa8ab4,804
+videosdk_plugins_assemblyai-0.0.32.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+videosdk_plugins_assemblyai-0.0.32.dist-info/RECORD,,

videosdk_plugins_assemblyai-0.0.30.dist-info/RECORD

@@ -1,6 +0,0 @@
-videosdk/plugins/assemblyai/__init__.py,sha256=T4deawBZKrOiGrKFcfksus-wmb5rF5KY7_p6QBRd4QE,59
-videosdk/plugins/assemblyai/stt.py,sha256=4qBQSn0gvQ4ET0ilFW6lry7NEOd_Sc1yqG6v4op6u7M,6247
-videosdk/plugins/assemblyai/version.py,sha256=8ZeepqkW4DvpVeNm92mx0tIzgvVevS4NKWkTXXHuXNY,23
-videosdk_plugins_assemblyai-0.0.30.dist-info/METADATA,sha256=1pk6-Le16OuZ_8ayfLQTQYg9nlP8HHn3bmlwiukI_A0,781
-videosdk_plugins_assemblyai-0.0.30.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-videosdk_plugins_assemblyai-0.0.30.dist-info/RECORD,,