videosdk-plugins-elevenlabs 0.0.3__py3-none-any.whl → 0.0.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videosdk/plugins/elevenlabs/__init__.py +2 -1
- videosdk/plugins/elevenlabs/stt.py +356 -0
- videosdk/plugins/elevenlabs/tts.py +285 -79
- videosdk/plugins/elevenlabs/version.py +1 -1
- {videosdk_plugins_elevenlabs-0.0.3.dist-info → videosdk_plugins_elevenlabs-0.0.49.dist-info}/METADATA +5 -4
- videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD +7 -0
- {videosdk_plugins_elevenlabs-0.0.3.dist-info → videosdk_plugins_elevenlabs-0.0.49.dist-info}/WHEEL +1 -1
- videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD +0 -6
videosdk/plugins/elevenlabs/stt.py
@@ -0,0 +1,356 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+import os
+import logging
+import time
+from typing import Any, Optional, List
+from urllib.parse import urlencode
+import aiohttp
+import numpy as np
+from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
+
+logger = logging.getLogger(__name__)
+
+STT_ERROR_MSGS = {"input_error", "auth_error", "quota_exceeded", "transcriber_error", "error"}
+SUPPORTED_SAMPLE_RATES = {8000, 16000, 22050, 24000, 44100, 48000}
+
+class ElevenLabsSTT(BaseSTT):
+    """
+    ElevenLabs Realtime Speech-to-Text (STT) client.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        model_id: str = "scribe_v2_realtime",
+        language_code: str = "en",
+        sample_rate: int = 48000,
+        commit_strategy: str = "vad",
+        vad_silence_threshold_secs: float = 0.8,
+        vad_threshold: float = 0.4,
+        min_speech_duration_ms: int = 50,
+        min_silence_duration_ms: int = 50,
+        base_url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+    ) -> None:
+        """
+        Initialize the ElevenLabs STT client.
+
+        Args:
+            api_key: ElevenLabs API key for authentication. Defaults to env variable ELEVENLABS_API_KEY.
+            model_id: STT model identifier.
+            language_code: Language code for transcription.
+            sample_rate: Sample rate of input audio in Hz.
+            commit_strategy: Strategy for committing transcripts (defaults to 'vad').
+            vad_silence_threshold_secs: Duration of silence to detect end-of-speech.
+            vad_threshold: Threshold for detecting voice activity.
+            min_speech_duration_ms: Minimum duration in milliseconds for a speech segment.
+            min_silence_duration_ms: Minimum duration in milliseconds of silence to consider end-of-speech.
+            base_url: WebSocket endpoint for ElevenLabs STT.
+        Raises:
+            ValueError: If required parameters are missing or invalid.
+        """
+        super().__init__()
+
+        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
+        if not self.api_key:
+            raise ValueError("ElevenLabs API key must be provided via api_key or ELEVENLABS_API_KEY env var")
+
+        self.model_id = model_id
+        self.language_code = language_code
+        self.commit_strategy = commit_strategy
+        self.base_url = base_url
+        self.sample_rate = sample_rate
+
+        if self.sample_rate not in SUPPORTED_SAMPLE_RATES:
+            raise ValueError(f"Unsupported sample_rate: {self.sample_rate}. Supported rates: {SUPPORTED_SAMPLE_RATES}")
+
+        self.vad_silence_threshold_secs = vad_silence_threshold_secs
+        self.vad_threshold = vad_threshold
+        self.min_speech_duration_ms = min_speech_duration_ms
+        self.min_silence_duration_ms = min_silence_duration_ms
+
+        self._last_final_text = ""
+        self._last_final_time = 0.0
+        self._duplicate_suppression_window = 0.75
+
+        self._stream_buffer = bytearray()
+        self._target_chunk_size = int(0.1 * self.sample_rate * 2)
+
+        self.heartbeat = 15.0
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._ws_task: Optional[asyncio.Task] = None
+
+    async def process_audio(
+        self,
+        audio_frames: bytes,
+        **kwargs: Any
+    ) -> None:
+        """
+        Process and send audio frames.
+        Converts to mono (required by ElevenLabs) and buffers 100ms chunks to reduce overhead.
+        """
+
+        if not self._ws or self._ws.closed:
+            await self._connect_ws()
+            if not self._ws_task or self._ws_task.done():
+                self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        elif self._ws_task and self._ws_task.done():
+            logger.warning("WebSocket listener stopped unexpectedly, restarting")
+            self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        try:
+            mono_audio = self._convert_to_mono(audio_frames)
+            if not mono_audio:
+                return
+
+            self._stream_buffer.extend(mono_audio)
+
+            while len(self._stream_buffer) >= self._target_chunk_size:
+                chunk = self._stream_buffer[:self._target_chunk_size]
+                await self._send_audio(chunk)
+                self._stream_buffer = self._stream_buffer[self._target_chunk_size:]
+
+        except Exception as e:
+            logger.exception("Error in process_audio: %s", e)
+            self.emit("error", str(e))
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+
+    async def _connect_ws(self) -> None:
+        if not self._session:
+            self._session = aiohttp.ClientSession()
+
+        query_params = {
+            "model_id": str(self.model_id),
+            "language_code": str(self.language_code),
+            "audio_format": f"pcm_{self.sample_rate}",
+            "commit_strategy": str(self.commit_strategy),
+            "vad_silence_threshold_secs": self.vad_silence_threshold_secs,
+            "vad_threshold": self.vad_threshold,
+            "min_speech_duration_ms": self.min_speech_duration_ms,
+            "min_silence_duration_ms": self.min_silence_duration_ms,
+        }
+
+        ws_url = f"{self.base_url}?{urlencode(query_params)}"
+        headers = {"xi-api-key": self.api_key}
+
+        try:
+            self._ws = await self._session.ws_connect(ws_url, headers=headers, heartbeat=self.heartbeat)
+            logger.info("Connected to ElevenLabs Realtime STT WebSocket.")
+        except Exception as e:
+            logger.exception("Error connecting to ElevenLabs WebSocket: %s", e)
+            raise
+
+    async def _send_audio(self, audio_bytes: bytes) -> None:
+        if not self._ws:
+            return
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio_bytes).decode(),
+            "sample_rate": self.sample_rate,
+        }
+
+        try:
+            await self._ws.send_str(json.dumps(payload))
+        except Exception as e:
+            logger.exception("Error sending audio chunk: %s", e)
+            self.emit("error", str(e))
+            await self.aclose()
+
+    def _convert_to_mono(self, audio_bytes: bytes) -> bytes:
+        """
+        Convert input audio bytes to mono.
+        """
+        if not audio_bytes:
+            return b""
+        try:
+            raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
+            if raw_audio.size == 0:
+                return b""
+
+            if raw_audio.size % 2 == 0:
+                try:
+                    stereo = raw_audio.reshape(-1, 2).astype(np.float32)
+                    mono = stereo.mean(axis=1)
+                    return mono.astype(np.int16).tobytes()
+                except ValueError:
+                    pass
+
+            return audio_bytes
+        except Exception as e:
+            logger.error("Error converting to mono: %s", e)
+            return b""
+
+    async def _listen_for_responses(self) -> None:
+        """
+        Listen for incoming WebSocket messages from ElevenLabs STT.
+        """
+        if not self._ws:
+            return
+
+        try:
+            async for msg in self._ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = None
+                    try:
+                        data = msg.json()
+                    except Exception:
+                        try:
+                            data = json.loads(msg.data)
+                        except Exception:
+                            logger.debug("Received non-json ws text message")
+                            continue
+
+                    responses = await self._handle_ws_event(data)
+                    if responses:
+                        for r in responses:
+                            if self._transcript_callback:
+                                try:
+                                    await self._transcript_callback(r)
+                                except Exception:
+                                    logger.exception("Error in transcript callback")
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    logger.error("WebSocket error: %s", self._ws.exception())
+                    self.emit("error", f"WebSocket error: {self._ws.exception()}")
+                    break
+                elif msg.type == aiohttp.WSMsgType.CLOSED:
+                    logger.info("WebSocket closed by server.")
+                    break
+        except asyncio.CancelledError:
+            logger.debug("WebSocket listener cancelled")
+        except Exception as e:
+            logger.exception("Error in WebSocket listener: %s", e)
+            self.emit("error", str(e))
+        finally:
+            if self._ws:
+                try:
+                    await self._ws.close()
+                except Exception:
+                    pass
+            self._ws = None
+            self._ws_task = None
+
+    async def _handle_ws_event(self, data: dict) -> List[STTResponse]:
+        """
+        Process a single WebSocket event from ElevenLabs STT.
+
+        Args:
+            data: JSON-decoded WebSocket message.
+
+        Returns:
+            List of STTResponse objects for this event.
+        """
+        responses: List[STTResponse] = []
+        message_type = data.get("message_type")
+        logger.debug("Received WS event: %s", message_type)
+
+        if message_type in STT_ERROR_MSGS:
+            logger.error("ElevenLabs STT error: %s", data)
+            self.emit("error", data)
+            return responses
+
+        if message_type == "session_started":
+            global_event_emitter.emit("speech_session_started")
+            return responses
+
+        if message_type == "committed_transcript":
+            logger.info("==== Received final transcript event: %s", data)
+            text = data.get("text", "")
+            clean_text = text.strip()
+            confidence = float(data.get("confidence", 0.0))
+            now = time.time()
+
+            if clean_text == "":
+                global_event_emitter.emit("speech_stopped")
+                self._last_final_text = ""
+                self._last_final_time = now
+                return responses
+
+            resp = STTResponse(
+                event_type=SpeechEventType.FINAL,
+                data=SpeechData(
+                    text=clean_text,
+                    confidence=confidence,
+                ),
+                metadata={"model": self.model_id, "raw_event": data},
+            )
+            responses.append(resp)
+
+            global_event_emitter.emit("speech_stopped")
+            self._last_final_text = clean_text
+            self._last_final_time = now
+            return responses
+
+
+        if message_type == "partial_transcript":
+            text = data.get("text", "")
+            clean_text = text.strip()
+
+            if (
+                self._last_final_text
+                and clean_text
+                and clean_text == self._last_final_text
+                and (time.time() - self._last_final_time) < self._duplicate_suppression_window
+            ):
+                logger.debug("Dropping duplicate partial matching recent final transcript")
+                return responses
+
+            resp = STTResponse(
+                event_type=SpeechEventType.INTERIM,
+                data=SpeechData(
+                    text=text,
+                    confidence=float(data.get("confidence", 0.0)),
+                ),
+                metadata={"model": self.model_id, "raw_event": data},
+            )
+            responses.append(resp)
+
+            if clean_text:
+                global_event_emitter.emit("speech_started")
+
+            return responses
+
+
+
+        logger.debug("Ignoring unrecognized message_type: %s", message_type)
+        return responses
+
+    async def aclose(self) -> None:
+        """
+        Close the WebSocket connection and cleanup session resources.
+
+        Cancels the listener task, closes WebSocket and HTTP session,
+        and calls the parent class cleanup.
+        """
+        if self._ws_task:
+            self._ws_task.cancel()
+            try:
+                await self._ws_task
+            except asyncio.CancelledError:
+                pass
+            self._ws_task = None
+
+        if self._ws:
+            try:
+                await self._ws.close()
+            except Exception:
+                pass
+            self._ws = None
+
+        if self._session:
+            try:
+                await self._session.close()
+            except Exception:
+                pass
+            finally:
+                self._session = None
+
+        await super().aclose()
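The new STT module is self-contained, so a short driver helps orient review. The sketch below is hypothetical: it assumes the two-line change to __init__.py re-exports ElevenLabsSTT and that ELEVENLABS_API_KEY is set; fake_pcm_frames and on_transcript are illustrative placeholders, and the callback is wired through the private _transcript_callback attribute that _listen_for_responses reads (the public registration hook belongs to the videosdk-agents base class, not this diff).

import asyncio

from videosdk.plugins.elevenlabs import ElevenLabsSTT  # assumed re-export


async def fake_pcm_frames():
    # Placeholder source: ten 100 ms frames of silent 48 kHz stereo 16-bit PCM;
    # _convert_to_mono downmixes interleaved stereo before sending.
    for _ in range(10):
        yield b"\x00" * (4800 * 2 * 2)
        await asyncio.sleep(0.1)


async def main() -> None:
    stt = ElevenLabsSTT(sample_rate=48000)  # api_key falls back to ELEVENLABS_API_KEY

    async def on_transcript(resp) -> None:
        # INTERIM responses come from partial_transcript events,
        # FINAL responses from committed_transcript events.
        print(resp.event_type, resp.data.text)

    stt._transcript_callback = on_transcript

    async for frame in fake_pcm_frames():
        await stt.process_audio(frame)  # buffered into 100 ms chunks internally

    await stt.aclose()


asyncio.run(main())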
videosdk/plugins/elevenlabs/tts.py
@@ -1,14 +1,16 @@
 from __future__ import annotations

-from typing import Any, AsyncIterator,
+from typing import Any, AsyncIterator, Optional, Union
 import os
 import httpx
 import asyncio
 import json
 import aiohttp
+import weakref
 from dataclasses import dataclass
-
-
+from videosdk.agents import TTS, segment_text
+import base64
+import uuid

 ELEVENLABS_SAMPLE_RATE = 24000
 ELEVENLABS_CHANNELS = 1
@@ -16,6 +18,7 @@ ELEVENLABS_CHANNELS = 1
 DEFAULT_MODEL = "eleven_flash_v2_5"
 DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
 API_BASE_URL = "https://api.elevenlabs.io/v1"
+WS_INACTIVITY_TIMEOUT = 300


 @dataclass
@@ -30,16 +33,32 @@ class ElevenLabsTTS(TTS):
     def __init__(
         self,
         *,
+        api_key: str | None = None,
         model: str = DEFAULT_MODEL,
         voice: str = DEFAULT_VOICE_ID,
         speed: float = 1.0,
-        api_key: str | None = None,
         response_format: str = "pcm_24000",
         voice_settings: VoiceSettings | None = None,
         base_url: str = API_BASE_URL,
-        enable_streaming: bool =
+        enable_streaming: bool = True,
+        inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
     ) -> None:
-
+        """Initialize the ElevenLabs TTS plugin.
+
+        Args:
+            api_key (Optional[str], optional): ElevenLabs API key. Uses ELEVENLABS_API_KEY environment variable if not provided. Defaults to None.
+            model (str): The model to use for the TTS plugin. Defaults to "eleven_flash_v2_5".
+            voice (str): The voice to use for the TTS plugin. Defaults to "EXAVITQu4vr4xnSDxMaL".
+            speed (float): The speed to use for the TTS plugin. Defaults to 1.0.
+            response_format (str): The response format to use for the TTS plugin. Defaults to "pcm_24000".
+            voice_settings (Optional[VoiceSettings], optional): The voice settings to use for the TTS plugin. Defaults to None.
+            base_url (str): The base URL to use for the TTS plugin. Defaults to "https://api.elevenlabs.io/v1".
+            enable_streaming (bool): Whether to enable streaming for the TTS plugin. Defaults to True.
+            inactivity_timeout (int): The inactivity timeout to use for the TTS plugin. Defaults to 300.
+        """
+        super().__init__(
+            sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS
+        )

         self.model = model
         self.voice = voice
@@ -50,16 +69,35 @@ class ElevenLabsTTS(TTS):
         self.base_url = base_url
         self.enable_streaming = enable_streaming
         self.voice_settings = voice_settings or VoiceSettings()
-
+        self.inactivity_timeout = inactivity_timeout
+        self._first_chunk_sent = False
+        self._ws_session = None
+        self._ws_connection = None
         self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
         if not self.api_key:
-            raise ValueError(
+            raise ValueError(
+                "ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")

         self._session = httpx.AsyncClient(
-            timeout=httpx.Timeout(connect=15.0, read=30.0,
+            timeout=httpx.Timeout(connect=15.0, read=30.0,
+                                  write=5.0, pool=5.0),
             follow_redirects=True,
         )

+        self._streams = weakref.WeakSet()
+        self._send_task: asyncio.Task | None = None
+        self._recv_task: asyncio.Task | None = None
+        self._should_stop = False
+
+        self._connection_lock = asyncio.Lock()
+        self._ws_voice_id: str | None = None
+        self._active_contexts: set[str] = set()
+        self._context_futures: dict[str, asyncio.Future[None]] = {}
+
+    def reset_first_audio_tracking(self) -> None:
+        """Reset the first audio tracking state for next TTS task"""
+        self._first_chunk_sent = False
+
     async def synthesize(
         self,
         text: AsyncIterator[str] | str,
@@ -67,23 +105,23 @@ class ElevenLabsTTS(TTS):
         **kwargs: Any,
     ) -> None:
         try:
-            if isinstance(text, AsyncIterator):
-                full_text = ""
-                async for chunk in text:
-                    full_text += chunk
-            else:
-                full_text = text
-
             if not self.audio_track or not self.loop:
                 self.emit("error", "Audio track or event loop not set")
                 return

             target_voice = voice_id or self.voice
+            self._should_stop = False

             if self.enable_streaming:
-                await self._stream_synthesis(
+                await self._stream_synthesis(text, target_voice)
             else:
-
+                if isinstance(text, AsyncIterator):
+                    async for segment in segment_text(text):
+                        if self._should_stop:
+                            break
+                        await self._chunked_synthesis(segment, target_voice)
+                else:
+                    await self._chunked_synthesis(text, target_voice)

         except Exception as e:
             self.emit("error", f"TTS synthesis failed: {str(e)}")
@@ -91,17 +129,17 @@ class ElevenLabsTTS(TTS):
     async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
         """Non-streaming synthesis using the standard API"""
         url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
-
+
         params = {
             "model_id": self.model,
             "output_format": self.response_format,
         }
-
+
         headers = {
             "xi-api-key": self.api_key,
             "Content-Type": "application/json",
         }
-
+
         payload = {
             "text": text,
             "voice_settings": {
@@ -114,83 +152,251 @@ class ElevenLabsTTS(TTS):

         try:
             async with self._session.stream(
-                "POST",
-                url,
-                headers=headers,
+                "POST",
+                url,
+                headers=headers,
                 json=payload,
                 params=params
             ) as response:
                 response.raise_for_status()
-
+
                 async for chunk in response.aiter_bytes():
+                    if self._should_stop:
+                        break
                     if chunk:
-                        self.
-
+                        await self._stream_audio_chunks(chunk)
+
         except httpx.HTTPStatusError as e:
-            self.emit(
+            self.emit(
+                "error", f"HTTP error {e.response.status_code}: {e.response.text}")
         except Exception as e:
             self.emit("error", f"Chunked synthesis failed: {str(e)}")

-    async def _stream_synthesis(self, text: str, voice_id: str) -> None:
-        """WebSocket-based streaming synthesis"""
-        ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
-
-        params = {
-            "model_id": self.model,
-            "output_format": self.response_format,
-        }
-
-        param_string = "&".join([f"{k}={v}" for k, v in params.items()])
-        full_ws_url = f"{ws_url}?{param_string}"
-
-        headers = {"xi-api-key": self.api_key}
-
+    async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
+        """WebSocket-based streaming synthesis using multi-context connection"""
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            eos_message = {"text": ""}
-            await ws.send_str(json.dumps(eos_message))
-
-            async for msg in ws:
-                if msg.type == aiohttp.WSMsgType.TEXT:
-                    data = json.loads(msg.data)
-                    if data.get("audio"):
-                        import base64
-                        audio_chunk = base64.b64decode(data["audio"])
-                        self.loop.create_task(self.audio_track.add_new_bytes(audio_chunk))
-                    elif data.get("isFinal"):
+            await self._ensure_connection(voice_id)
+
+            context_id = uuid.uuid4().hex[:12]
+            done_future: asyncio.Future[None] = asyncio.get_event_loop().create_future()
+            self.register_context(context_id, done_future)
+
+            async def _single_chunk_gen(s: str) -> AsyncIterator[str]:
+                yield s
+
+            async def _send_chunks() -> None:
+                try:
+                    first_message_sent = False
+                    if isinstance(text, str):
+                        async for segment in segment_text(_single_chunk_gen(text)):
+                            if self._should_stop:
                                 break
-
-
+                            await self.send_text(context_id, f"{segment} ",
+                                                 voice_settings=None if first_message_sent else self._voice_settings_dict(),
+                                                 flush=True)
+                            first_message_sent = True
+                    else:
+                        async for chunk in text:
+                            if self._should_stop:
                                 break
-
-
-
-
+                            await self.send_text(context_id, f"{chunk} ",
+                                                 voice_settings=None if first_message_sent else self._voice_settings_dict())
+                            first_message_sent = True
+
+                    if not self._should_stop:
+                        await self.flush_context(context_id)
+                        await self.close_context(context_id)
+                except Exception as e:
+                    if not done_future.done():
+                        done_future.set_exception(e)
+
+            sender = asyncio.create_task(_send_chunks())
+
+            await done_future
+            await sender
+
         except Exception as e:
             self.emit("error", f"Streaming synthesis failed: {str(e)}")

+            if isinstance(text, str):
+                await self._chunked_synthesis(text, voice_id)
+            else:
+                async for segment in segment_text(text):
+                    if self._should_stop:
+                        break
+                    await self._chunked_synthesis(segment, voice_id)
+
+    def _voice_settings_dict(self) -> dict[str, Any]:
+        return {
+            "stability": self.voice_settings.stability,
+            "similarity_boost": self.voice_settings.similarity_boost,
+            "style": self.voice_settings.style,
+            "use_speaker_boost": self.voice_settings.use_speaker_boost,
+        }
+
+    async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
+        if not audio_bytes or self._should_stop:
+            return
+
+        if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
+            self._first_chunk_sent = True
+            asyncio.create_task(self._first_audio_callback())
+
+        if self.audio_track and self.loop:
+            await self.audio_track.add_new_bytes(audio_bytes)
+
+    async def interrupt(self) -> None:
+        """Simple but effective interruption"""
+        self._should_stop = True
+
+        if self.audio_track:
+            self.audio_track.interrupt()
+
+        await self.close_all_contexts()
+
     async def aclose(self) -> None:
         """Cleanup resources"""
+        self._should_stop = True
+
+        for task in [self._send_task, self._recv_task]:
+            if task and not task.done():
+                task.cancel()
+
+        for stream in list(self._streams):
+            try:
+                await stream.aclose()
+            except Exception:
+                pass
+
+        self._streams.clear()
+
+        if self._ws_connection and not self._ws_connection.closed:
+            try:
+                await self._ws_connection.send_str(json.dumps({"close_socket": True}))
+            except Exception:
+                pass
+            await self._ws_connection.close()
+        if self._ws_session and not self._ws_session.closed:
+            await self._ws_session.close()
+        self._ws_connection = None
+        self._ws_session = None
         if self._session:
             await self._session.aclose()
         await super().aclose()

-    async def
-
-
-
+    async def _ensure_connection(self, voice_id: str) -> None:
+        async with self._connection_lock:
+            if self._ws_connection and not self._ws_connection.closed and self._ws_voice_id == voice_id:
+                return
+
+            if self._ws_connection and not self._ws_connection.closed:
+                try:
+                    await self._ws_connection.send_str(json.dumps({"close_socket": True}))
+                except Exception:
+                    pass
+                await self._ws_connection.close()
+            if self._ws_session and not self._ws_session.closed:
+                await self._ws_session.close()
+
+            self._ws_session = aiohttp.ClientSession()
+            self._ws_voice_id = voice_id
+
+            ws_url = f"{self.base_url}/text-to-speech/{voice_id}/multi-stream-input".replace("https://", "wss://").replace("http://", "ws://")
+            params = {
+                "model_id": self.model,
+                "output_format": self.response_format,
+                "inactivity_timeout": self.inactivity_timeout,
+            }
+            param_string = "&".join([f"{k}={v}" for k, v in params.items()])
+            full_ws_url = f"{ws_url}?{param_string}"
+            headers = {"xi-api-key": self.api_key}
+            self._ws_connection = await asyncio.wait_for(self._ws_session.ws_connect(full_ws_url, headers=headers), timeout=10.0)
+
+            if self._recv_task and not self._recv_task.done():
+                self._recv_task.cancel()
+            self._recv_task = asyncio.create_task(self._recv_loop())
+
+    def register_context(self, context_id: str, done_future: asyncio.Future[None]) -> None:
+        self._context_futures[context_id] = done_future
+
+    async def send_text(
+        self,
+        context_id: str,
+        text: str,
+        *,
+        voice_settings: Optional[dict[str, Any]] = None,
+        flush: bool = False,
+    ) -> None:
+        if not self._ws_connection or self._ws_connection.closed:
+            raise RuntimeError("WebSocket connection is closed")
+
+        if context_id not in self._active_contexts:
+            init_msg = {
+                "context_id": context_id,
+                "text": " ",
+            }
+            if voice_settings:
+                init_msg["voice_settings"] = voice_settings
+            await self._ws_connection.send_str(json.dumps(init_msg))
+            self._active_contexts.add(context_id)
+
+        pkt: dict[str, Any] = {"context_id": context_id, "text": text}
+        if flush:
+            pkt["flush"] = True
+        await self._ws_connection.send_str(json.dumps(pkt))
+
+    async def flush_context(self, context_id: str) -> None:
+        if not self._ws_connection or self._ws_connection.closed:
+            return
+        await self._ws_connection.send_str(json.dumps({"context_id": context_id, "flush": True}))
+
+    async def close_context(self, context_id: str) -> None:
+        if not self._ws_connection or self._ws_connection.closed:
+            return
+        await self._ws_connection.send_str(json.dumps({"context_id": context_id, "close_context": True}))
+
+    async def close_all_contexts(self) -> None:
+        try:
+            for context_id in list(self._active_contexts):
+                await self.close_context(context_id)
+        except Exception:
+            pass
+
+    async def _recv_loop(self) -> None:
+        try:
+            while self._ws_connection and not self._ws_connection.closed:
+                msg = await self._ws_connection.receive()
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = json.loads(msg.data)
+
+                    if data.get("error"):
+                        ctx_id = data.get("contextId")
+                        fut = self._context_futures.get(ctx_id)
+                        if fut and not fut.done():
+                            fut.set_exception(RuntimeError(data["error"]))
+                        continue
+
+                    if data.get("audio"):
+                        audio_chunk = base64.b64decode(data["audio"]) if isinstance(data["audio"], str) else None
+                        if audio_chunk:
+                            if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
+                                self._first_chunk_sent = True
+                                asyncio.create_task(self._first_audio_callback())
+                            if self.audio_track:
+                                await self.audio_track.add_new_bytes(audio_chunk)
+
+                    if data.get("is_final") or data.get("isFinal"):
+                        ctx_id = data.get("contextId")
+                        if ctx_id:
+                            fut = self._context_futures.pop(ctx_id, None)
+                            self._active_contexts.discard(ctx_id)
+                            if fut and not fut.done():
+                                fut.set_result(None)
+
+                elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
+                    break
+        except Exception:
+            for fut in self._context_futures.values():
+                if not fut.done():
+                    fut.set_exception(RuntimeError("WebSocket receive loop error"))
+            self._context_futures.clear()
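The streaming rewrite replaces the old per-request stream-input socket with ElevenLabs' multi-context protocol: one WebSocket per voice on the multi-stream-input endpoint, with each utterance isolated under its own context_id so interrupt() can close contexts without tearing down the connection. Reconstructed from send_text, flush_context, and close_context above, the JSON frames for a single utterance look roughly like the sketch below; the context ID, text, and voice-settings values are illustrative, not defaults taken from the diff.

import json

ctx = "a1b2c3d4e5f6"  # uuid.uuid4().hex[:12], as in _stream_synthesis

frames = [
    # The first send_text() for an unknown context opens it with a
    # single-space "text" message, optionally attaching voice settings.
    {"context_id": ctx, "text": " ",
     "voice_settings": {"stability": 0.5, "similarity_boost": 0.75,
                        "style": 0.0, "use_speaker_boost": True}},
    # Each text segment follows; flush=True makes synthesis start immediately.
    {"context_id": ctx, "text": "Hello there. ", "flush": True},
    # flush_context() pushes any buffered text through.
    {"context_id": ctx, "flush": True},
    # close_context() ends the utterance; _recv_loop resolves the context's
    # future when the matching is_final/isFinal event arrives.
    {"context_id": ctx, "close_context": True},
]

for frame in frames:
    print(json.dumps(frame))  # payloads as _ws_connection.send_str() would carry them

aclose() additionally sends {"close_socket": true} before closing, which is also how _ensure_connection retires a connection when the requested voice changes.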
videosdk/plugins/elevenlabs/version.py
@@ -1 +1 @@
-__version__ = "0.0.3"
+__version__ = "0.0.49"
{videosdk_plugins_elevenlabs-0.0.3.dist-info → videosdk_plugins_elevenlabs-0.0.49.dist-info}/METADATA
@@ -1,8 +1,9 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-elevenlabs
-Version: 0.0.3
+Version: 0.0.49
 Summary: VideoSDK Agent Framework plugin for ElevenLabs
 Author: videosdk
+License-Expression: Apache-2.0
 Keywords: ai,audio,elevenlabs,video,videosdk
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
@@ -11,12 +12,12 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
-Requires-Dist: videosdk-agents>=0.0.
+Requires-Dist: videosdk-agents>=0.0.49
 Description-Content-Type: text/markdown

-VideoSDK ElevenLabs Plugin
+# VideoSDK ElevenLabs Plugin

-Agent Framework plugin for
+Agent Framework plugin for TTS services from ElevenLabs.

 ## Installation

videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD
@@ -0,0 +1,7 @@
+videosdk/plugins/elevenlabs/__init__.py,sha256=g33CP7YD-GB32-U5RAkRAtoNNaRG7oVy5iqk-LKz0Aw,139
+videosdk/plugins/elevenlabs/stt.py,sha256=3Vbs_9yYROhNAbBzPEUqzdhrpdO6A6zq7TRvby617rM,12881
+videosdk/plugins/elevenlabs/tts.py,sha256=LWn5AG3lssQ1zxWfJ1GLDFZi1cCGO2FKmxy20gcm3dQ,16033
+videosdk/plugins/elevenlabs/version.py,sha256=LuIJFrM65iX-YC6KaWH9iJWJKBv1GHcHHucNCmnVUqo,23
+videosdk_plugins_elevenlabs-0.0.49.dist-info/METADATA,sha256=E30JnazHE_j1EFaTSVVv2gm5HuOI7_HofN3QAMSqEH8,779
+videosdk_plugins_elevenlabs-0.0.49.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD,,

videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD
@@ -1,6 +0,0 @@
-videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
-videosdk/plugins/elevenlabs/tts.py,sha256=l51CgdxHPgoR-Q2Q4FmSzD-Hi_Hz0MDSxbDIc8jwPck,7092
-videosdk/plugins/elevenlabs/version.py,sha256=k5tJXhBQJ4l9fKHJ76K5w98zBHoYvNk9r-UNH6eQ2-k,21
-videosdk_plugins_elevenlabs-0.0.3.dist-info/METADATA,sha256=j-KpXkh45CmJkAjh7L_Yro5-mOwVdw57panNJC9YKHg,745
-videosdk_plugins_elevenlabs-0.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD,,