videosdk-plugins-assemblyai 0.0.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-assemblyai might be problematic. Click here for more details.
- videosdk/plugins/assemblyai/__init__.py +3 -0
- videosdk/plugins/assemblyai/stt.py +149 -0
- videosdk/plugins/assemblyai/version.py +1 -0
- videosdk_plugins_assemblyai-0.0.22.dist-info/METADATA +26 -0
- videosdk_plugins_assemblyai-0.0.22.dist-info/RECORD +6 -0
- videosdk_plugins_assemblyai-0.0.22.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
from __future__ import annotations

import asyncio
import io
import logging
import os
import wave
from typing import Any

import aiohttp
import numpy as np
from videosdk.agents import STT, STTResponse, SpeechData, SpeechEventType, global_event_emitter

try:
    from scipy import signal
    SCIPY_AVAILABLE = True
except ImportError:
    SCIPY_AVAILABLE = False
|
|
18
|
+
|
|
19
|
+
ASSEMBLYAI_API_URL = "https://api.assemblyai.com/v2"
|
|
20
|
+
|
|
21
|
+
class AssemblyAISTT(STT):
    """VideoSDK Agent Framework STT plugin for AssemblyAI.

    Incoming PCM audio is split into utterances by a peak-amplitude silence
    detector. When an utterance ends, its audio is mixed down to mono,
    resampled, wrapped in a WAV container, uploaded to AssemblyAI and the
    resulting transcript is polled until it completes. A ``FINAL``
    ``STTResponse`` is then delivered through ``self._transcript_callback``.

    Args:
        api_key: AssemblyAI API key. Falls back to the ``ASSEMBLYAI_API_KEY``
            environment variable when omitted.
        language_code: Language code sent with the transcript request.
        input_sample_rate: Sample rate (Hz) of the audio given to
            ``process_audio``.
        target_sample_rate: Sample rate (Hz) of the audio uploaded.
        silence_threshold: Peak amplitude, as a fraction of int16 full scale,
            below which a chunk counts as silence.
        silence_duration: Seconds of accumulated silence that end an utterance.
        poll_interval: Seconds to sleep between transcript status polls.
        transcription_timeout: Maximum seconds to keep polling one transcript
            before reporting an error; ``None`` polls indefinitely.

    Raises:
        ImportError: If scipy is not installed.
        ValueError: If no API key is available.
    """

    # The frame arithmetic below treats the input as interleaved 16-bit
    # stereo PCM: 2 bytes per sample * 2 channels = 4 bytes per frame.
    _SAMPLE_WIDTH_BYTES = 2
    _INPUT_CHANNELS = 2
    _BYTES_PER_FRAME = _SAMPLE_WIDTH_BYTES * _INPUT_CHANNELS

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        *,
        api_key: str | None = None,
        language_code: str = "en_us",
        input_sample_rate: int = 48000,
        target_sample_rate: int = 16000,
        silence_threshold: float = 0.015,
        silence_duration: float = 0.8,
        poll_interval: float = 1.0,
        transcription_timeout: float | None = 60.0,
    ) -> None:
        super().__init__()
        if not SCIPY_AVAILABLE:
            raise ImportError("scipy is not installed. Please install it with 'pip install scipy'")

        self.api_key = api_key or os.getenv("ASSEMBLYAI_API_KEY")
        if not self.api_key:
            raise ValueError("AssemblyAI API key must be provided either through the 'api_key' parameter or the 'ASSEMBLYAI_API_KEY' environment variable.")

        self.language_code = language_code
        self.input_sample_rate = input_sample_rate
        self.target_sample_rate = target_sample_rate
        # Threshold expressed on the int16 amplitude scale.
        self.silence_threshold_bytes = int(silence_threshold * 32767)
        # Silence duration expressed in input frames.
        self.silence_duration_frames = int(silence_duration * self.input_sample_rate)
        self.poll_interval = poll_interval
        self.transcription_timeout = transcription_timeout

        # NOTE(review): aiohttp expects ClientSession to be created while an
        # event loop is running; confirm this class is always constructed
        # from async code.
        self._session = aiohttp.ClientSession(headers={"Authorization": self.api_key})
        self._audio_buffer = bytearray()
        self._is_speaking = False
        self._silence_frames = 0
        self._lock = asyncio.Lock()
        # Strong references to in-flight transcription tasks. The event loop
        # only keeps weak references, so an unreferenced task may be
        # garbage-collected before it finishes.
        self._pending_tasks: set[asyncio.Task[None]] = set()

    async def process_audio(self, audio_frames: bytes, **kwargs: Any) -> None:
        """Feed one chunk of PCM audio into the utterance detector.

        Non-silent chunks are appended to the current utterance. Silent chunks
        are counted while an utterance is open; once the accumulated silence
        exceeds ``silence_duration_frames`` the utterance is closed and
        transcribed in a background task.

        Args:
            audio_frames: Raw audio bytes for this chunk.
            **kwargs: Accepted for interface compatibility; unused.
        """
        async with self._lock:
            if not self._is_silent(audio_frames):
                if not self._is_speaking:
                    self._is_speaking = True
                    global_event_emitter.emit("speech_started")
                self._audio_buffer.extend(audio_frames)
                self._silence_frames = 0
                return

            if not self._is_speaking:
                return

            self._silence_frames += len(audio_frames) // self._BYTES_PER_FRAME
            if self._silence_frames > self.silence_duration_frames:
                global_event_emitter.emit("speech_stopped")
                # Detach the utterance now, while the lock is held, so chunks
                # that arrive before the task starts cannot leak into it.
                utterance = self._take_buffer()
                if utterance:
                    self._spawn_transcription(utterance)
                self._is_speaking = False
                self._silence_frames = 0

    def _is_silent(self, audio_chunk: bytes) -> bool:
        """Return True when the chunk's peak int16 amplitude is below the threshold.

        An empty chunk is treated as silent. A trailing odd byte is ignored.
        """
        usable = len(audio_chunk) - len(audio_chunk) % self._SAMPLE_WIDTH_BYTES
        if usable == 0:
            return True
        samples = np.frombuffer(memoryview(audio_chunk)[:usable], dtype=np.int16)
        # Widen before abs(): abs(-32768) overflows in int16 and stays negative.
        peak = int(np.abs(samples.astype(np.int32)).max())
        return peak < self.silence_threshold_bytes

    def _take_buffer(self) -> bytes:
        """Return the buffered audio and reset the buffer (caller holds the lock)."""
        audio = bytes(self._audio_buffer)
        self._audio_buffer = bytearray()
        return audio

    def _spawn_transcription(self, audio: bytes) -> None:
        """Start a tracked background task that transcribes ``audio``."""
        task = asyncio.create_task(self._transcribe_audio(audio))
        self._pending_tasks.add(task)
        task.add_done_callback(self._pending_tasks.discard)

    async def _transcribe_buffer(self) -> None:
        """Transcribe whatever audio is currently buffered, if any."""
        async with self._lock:
            audio = self._take_buffer()
        if audio:
            await self._transcribe_audio(audio)

    async def _transcribe_audio(self, audio: bytes) -> None:
        """Upload one utterance, wait for its transcript and deliver it.

        Any failure is logged and reported through ``self.emit("error", ...)``
        rather than raised.
        """
        try:
            # Resampling and WAV encoding are CPU-bound; run them off the
            # event loop so audio ingestion is not stalled.
            wav_audio = await asyncio.to_thread(self._encode_wav, audio)
            if wav_audio is None:
                return

            audio_url = await self._upload_audio(wav_audio)
            transcript_id = await self._request_transcript(audio_url)
            result = await self._wait_for_transcript(transcript_id)

            if result.get("text") and self._transcript_callback:
                event = STTResponse(
                    event_type=SpeechEventType.FINAL,
                    data=SpeechData(
                        text=result["text"],
                        language=self.language_code,
                        confidence=result.get("confidence", 1.0),
                    ),
                )
                await self._transcript_callback(event)
        except Exception as e:
            self._logger.exception("AssemblyAI transcription failed")
            self.emit("error", f"AssemblyAI transcription error: {e}")

    def _encode_wav(self, audio: bytes) -> io.BytesIO | None:
        """Resample ``audio`` and wrap it as WAV; None when nothing remains."""
        pcm = self._resample_audio(audio)
        if not pcm:
            return None
        return self._create_wav_in_memory(pcm)

    async def _upload_audio(self, wav_audio: io.BytesIO) -> str:
        """POST the WAV data to the upload endpoint and return its ``upload_url``."""
        upload_url = f"{ASSEMBLYAI_API_URL}/upload"
        async with self._session.post(upload_url, data=wav_audio) as response:
            response.raise_for_status()
            upload_data = await response.json()
        return upload_data["upload_url"]

    async def _request_transcript(self, audio_url: str) -> str:
        """Create a transcript for ``audio_url`` and return its ``id``."""
        transcript_url = f"{ASSEMBLYAI_API_URL}/transcript"
        payload = {"audio_url": audio_url, "language_code": self.language_code}
        async with self._session.post(transcript_url, json=payload) as response:
            response.raise_for_status()
            transcript_data = await response.json()
        return transcript_data["id"]

    async def _wait_for_transcript(self, transcript_id: str) -> dict[str, Any]:
        """Poll the transcript until it completes and return the result.

        Raises:
            RuntimeError: If the transcript status is ``"error"``.
            TimeoutError: If it has not completed within
                ``transcription_timeout`` seconds of polling.
        """
        poll_url = f"{ASSEMBLYAI_API_URL}/transcript/{transcript_id}"
        loop = asyncio.get_running_loop()
        deadline = (
            None
            if self.transcription_timeout is None
            else loop.time() + self.transcription_timeout
        )
        while True:
            await asyncio.sleep(self.poll_interval)
            async with self._session.get(poll_url) as response:
                response.raise_for_status()
                result = await response.json()

            status = result.get("status")
            if status == "completed":
                return result
            if status == "error":
                raise RuntimeError(f"AssemblyAI transcription failed: {result.get('error')}")
            if deadline is not None and loop.time() >= deadline:
                raise TimeoutError(
                    f"AssemblyAI transcript {transcript_id} did not complete "
                    f"within {self.transcription_timeout} seconds"
                )

    def _resample_audio(self, audio_bytes: bytes) -> bytes:
        """Mix interleaved 16-bit stereo PCM to mono and resample it.

        Returns:
            16-bit mono PCM at ``target_sample_rate``, or ``b""`` when the
            input holds no complete frame or resamples to zero samples.
        """
        # Drop a trailing partial frame so frombuffer/reshape cannot raise.
        usable = len(audio_bytes) - len(audio_bytes) % self._BYTES_PER_FRAME
        if usable == 0:
            return b""

        raw_audio = np.frombuffer(memoryview(audio_bytes)[:usable], dtype=np.int16)
        stereo_audio = raw_audio.reshape(-1, self._INPUT_CHANNELS)
        mono_audio = stereo_audio.astype(np.float32).mean(axis=1)

        if self.target_sample_rate != self.input_sample_rate:
            target_len = int(len(mono_audio) * self.target_sample_rate / self.input_sample_rate)
            if target_len == 0:
                return b""
            mono_audio = signal.resample(mono_audio, target_len)

        # Resampling can overshoot full scale; clip so the int16 cast cannot
        # wrap around.
        return np.clip(mono_audio, -32768, 32767).astype(np.int16).tobytes()

    def _create_wav_in_memory(self, pcm_data: bytes) -> io.BytesIO:
        """Creates a WAV file in memory from raw PCM data."""
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, 'wb') as wf:
            wf.setnchannels(1)  # Mono
            wf.setsampwidth(self._SAMPLE_WIDTH_BYTES)  # 16-bit
            wf.setframerate(self.target_sample_rate)
            wf.writeframes(pcm_data)
        wav_buffer.seek(0)
        return wav_buffer

    async def aclose(self) -> None:
        """Flush buffered audio, wait for in-flight transcriptions, close the session."""
        try:
            async with self._lock:
                remaining = self._take_buffer()
                self._is_speaking = False
                self._silence_frames = 0
            if remaining:
                await self._transcribe_audio(remaining)

            # Let background transcriptions finish before the HTTP session
            # they depend on is closed.
            pending = list(self._pending_tasks)
            if pending:
                await asyncio.gather(*pending, return_exceptions=True)
        finally:
            if self._session and not self._session.closed:
                await self._session.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.22"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: videosdk-plugins-assemblyai
|
|
3
|
+
Version: 0.0.22
|
|
4
|
+
Summary: VideoSDK Agent Framework plugin for AssemblyAI
|
|
5
|
+
Author: videosdk
|
|
6
|
+
License-Expression: Apache-2.0
|
|
7
|
+
Keywords: ai,assemblyai,audio,video,videosdk
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Topic :: Communications :: Conferencing
|
|
11
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
12
|
+
Classifier: Topic :: Multimedia :: Video
|
|
13
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Requires-Dist: videosdk-agents>=0.0.22
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# VideoSDK AssemblyAI Plugin
|
|
19
|
+
|
|
20
|
+
Agent Framework plugin for speech-to-text (STT) services from AssemblyAI.
|
|
21
|
+
|
|
22
|
+
## Installation
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
pip install videosdk-plugins-assemblyai
|
|
26
|
+
```
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
videosdk/plugins/assemblyai/__init__.py,sha256=T4deawBZKrOiGrKFcfksus-wmb5rF5KY7_p6QBRd4QE,59
|
|
2
|
+
videosdk/plugins/assemblyai/stt.py,sha256=4qBQSn0gvQ4ET0ilFW6lry7NEOd_Sc1yqG6v4op6u7M,6247
|
|
3
|
+
videosdk/plugins/assemblyai/version.py,sha256=NoiGDztYD4fsDDnfSPiSzRkknkNHhFUtKZj0mhQiTYM,22
|
|
4
|
+
videosdk_plugins_assemblyai-0.0.22.dist-info/METADATA,sha256=PBFZ45WXduVeKs694MY6dmDk5muA_kwlzMi4JhwyCDE,781
|
|
5
|
+
videosdk_plugins_assemblyai-0.0.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
+
videosdk_plugins_assemblyai-0.0.22.dist-info/RECORD,,
|