videosdk-plugins-elevenlabs 0.0.34__py3-none-any.whl → 0.0.36__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.
- videosdk/plugins/elevenlabs/tts.py +175 -131
- videosdk/plugins/elevenlabs/version.py +1 -1
- {videosdk_plugins_elevenlabs-0.0.34.dist-info → videosdk_plugins_elevenlabs-0.0.36.dist-info}/METADATA +2 -2
- videosdk_plugins_elevenlabs-0.0.36.dist-info/RECORD +6 -0
- videosdk_plugins_elevenlabs-0.0.34.dist-info/RECORD +0 -6
- {videosdk_plugins_elevenlabs-0.0.34.dist-info → videosdk_plugins_elevenlabs-0.0.36.dist-info}/WHEEL +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Any, AsyncIterator,
|
|
3
|
+
from typing import Any, AsyncIterator, Optional, Union
|
|
4
4
|
import os
|
|
5
5
|
import httpx
|
|
6
6
|
import asyncio
|
|
@@ -9,6 +9,8 @@ import aiohttp
|
|
|
9
9
|
import weakref
|
|
10
10
|
from dataclasses import dataclass
|
|
11
11
|
from videosdk.agents import TTS, segment_text
|
|
12
|
+
import base64
|
|
13
|
+
import uuid
|
|
12
14
|
|
|
13
15
|
ELEVENLABS_SAMPLE_RATE = 24000
|
|
14
16
|
ELEVENLABS_CHANNELS = 1
|
|
@@ -87,6 +89,11 @@ class ElevenLabsTTS(TTS):
|
|
|
87
89
|
self._recv_task: asyncio.Task | None = None
|
|
88
90
|
self._should_stop = False
|
|
89
91
|
|
|
92
|
+
self._connection_lock = asyncio.Lock()
|
|
93
|
+
self._ws_voice_id: str | None = None
|
|
94
|
+
self._active_contexts: set[str] = set()
|
|
95
|
+
self._context_futures: dict[str, asyncio.Future[None]] = {}
|
|
96
|
+
|
|
90
97
|
def reset_first_audio_tracking(self) -> None:
|
|
91
98
|
"""Reset the first audio tracking state for next TTS task"""
|
|
92
99
|
self._first_chunk_sent = False
|
|
@@ -166,46 +173,47 @@ class ElevenLabsTTS(TTS):
|
|
|
166
173
|
self.emit("error", f"Chunked synthesis failed: {str(e)}")
|
|
167
174
|
|
|
168
175
|
async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
|
|
169
|
-
"""WebSocket-based streaming synthesis"""
|
|
170
|
-
|
|
171
|
-
ws_session = None
|
|
172
|
-
ws_connection = None
|
|
173
|
-
|
|
176
|
+
"""WebSocket-based streaming synthesis using multi-context connection"""
|
|
174
177
|
try:
|
|
175
|
-
|
|
176
|
-
params = {
|
|
177
|
-
"model_id": self.model,
|
|
178
|
-
"output_format": self.response_format,
|
|
179
|
-
"inactivity_timeout": self.inactivity_timeout,
|
|
180
|
-
}
|
|
181
|
-
param_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
182
|
-
full_ws_url = f"{ws_url}?{param_string}"
|
|
178
|
+
await self._ensure_connection(voice_id)
|
|
183
179
|
|
|
184
|
-
|
|
180
|
+
context_id = uuid.uuid4().hex[:12]
|
|
181
|
+
done_future: asyncio.Future[None] = asyncio.get_event_loop().create_future()
|
|
182
|
+
self.register_context(context_id, done_future)
|
|
185
183
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
ws_session.ws_connect(full_ws_url, headers=headers),
|
|
189
|
-
timeout=10.0
|
|
190
|
-
)
|
|
184
|
+
async def _single_chunk_gen(s: str) -> AsyncIterator[str]:
|
|
185
|
+
yield s
|
|
191
186
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
187
|
+
async def _send_chunks() -> None:
|
|
188
|
+
try:
|
|
189
|
+
first_message_sent = False
|
|
190
|
+
if isinstance(text, str):
|
|
191
|
+
async for segment in segment_text(_single_chunk_gen(text)):
|
|
192
|
+
if self._should_stop:
|
|
193
|
+
break
|
|
194
|
+
await self.send_text(context_id, f"{segment} ",
|
|
195
|
+
voice_settings=None if first_message_sent else self._voice_settings_dict(),
|
|
196
|
+
flush=True)
|
|
197
|
+
first_message_sent = True
|
|
198
|
+
else:
|
|
199
|
+
async for chunk in text:
|
|
200
|
+
if self._should_stop:
|
|
201
|
+
break
|
|
202
|
+
await self.send_text(context_id, f"{chunk} ",
|
|
203
|
+
voice_settings=None if first_message_sent else self._voice_settings_dict())
|
|
204
|
+
first_message_sent = True
|
|
205
|
+
|
|
206
|
+
if not self._should_stop:
|
|
207
|
+
await self.flush_context(context_id)
|
|
208
|
+
await self.close_context(context_id)
|
|
209
|
+
except Exception as e:
|
|
210
|
+
if not done_future.done():
|
|
211
|
+
done_future.set_exception(e)
|
|
202
212
|
|
|
203
|
-
|
|
204
|
-
self._send_text_task(ws_connection, text))
|
|
205
|
-
self._recv_task = asyncio.create_task(
|
|
206
|
-
self._receive_audio_task(ws_connection))
|
|
213
|
+
sender = asyncio.create_task(_send_chunks())
|
|
207
214
|
|
|
208
|
-
await
|
|
215
|
+
await done_future
|
|
216
|
+
await sender
|
|
209
217
|
|
|
210
218
|
except Exception as e:
|
|
211
219
|
self.emit("error", f"Streaming synthesis failed: {str(e)}")
|
|
@@ -218,95 +226,13 @@ class ElevenLabsTTS(TTS):
|
|
|
218
226
|
break
|
|
219
227
|
await self._chunked_synthesis(segment, voice_id)
|
|
220
228
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
asyncio.gather(
|
|
229
|
-
*(t for t in [self._send_task, self._recv_task] if t),
|
|
230
|
-
return_exceptions=True
|
|
231
|
-
),
|
|
232
|
-
timeout=0.3
|
|
233
|
-
)
|
|
234
|
-
except asyncio.TimeoutError:
|
|
235
|
-
pass
|
|
236
|
-
|
|
237
|
-
self._send_task = None
|
|
238
|
-
self._recv_task = None
|
|
239
|
-
|
|
240
|
-
if ws_connection and not ws_connection.closed:
|
|
241
|
-
await ws_connection.close()
|
|
242
|
-
if ws_session and not ws_session.closed:
|
|
243
|
-
await ws_session.close()
|
|
244
|
-
|
|
245
|
-
async def _send_text_task(self, ws_connection: aiohttp.ClientWebSocketResponse, text: Union[AsyncIterator[str], str]) -> None:
|
|
246
|
-
"""Task for sending text to WebSocket"""
|
|
247
|
-
try:
|
|
248
|
-
if isinstance(text, str):
|
|
249
|
-
if not self._should_stop:
|
|
250
|
-
text_message = {"text": f"{text} "}
|
|
251
|
-
await ws_connection.send_str(json.dumps(text_message))
|
|
252
|
-
else:
|
|
253
|
-
async for chunk in text:
|
|
254
|
-
if ws_connection.closed or self._should_stop:
|
|
255
|
-
break
|
|
256
|
-
|
|
257
|
-
chunk_message = {"text": f"{chunk} "}
|
|
258
|
-
await ws_connection.send_str(json.dumps(chunk_message))
|
|
259
|
-
|
|
260
|
-
if not ws_connection.closed and not self._should_stop:
|
|
261
|
-
eos_message = {"text": ""}
|
|
262
|
-
await ws_connection.send_str(json.dumps(eos_message))
|
|
263
|
-
|
|
264
|
-
except Exception as e:
|
|
265
|
-
if not self._should_stop:
|
|
266
|
-
self.emit("error", f"Send task error: {str(e)}")
|
|
267
|
-
raise
|
|
268
|
-
|
|
269
|
-
async def _receive_audio_task(self, ws_connection: aiohttp.ClientWebSocketResponse) -> None:
|
|
270
|
-
"""Task for receiving audio from WebSocket"""
|
|
271
|
-
try:
|
|
272
|
-
while not ws_connection.closed and not self._should_stop:
|
|
273
|
-
try:
|
|
274
|
-
msg = await ws_connection.receive()
|
|
275
|
-
|
|
276
|
-
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
277
|
-
data = json.loads(msg.data)
|
|
278
|
-
|
|
279
|
-
if data.get("audio"):
|
|
280
|
-
import base64
|
|
281
|
-
audio_chunk = base64.b64decode(data["audio"])
|
|
282
|
-
if not self._should_stop:
|
|
283
|
-
await self._stream_audio_chunks(audio_chunk)
|
|
284
|
-
|
|
285
|
-
elif data.get("isFinal"):
|
|
286
|
-
break
|
|
287
|
-
|
|
288
|
-
elif data.get("error"):
|
|
289
|
-
self.emit(
|
|
290
|
-
"error", f"ElevenLabs error: {data['error']}")
|
|
291
|
-
raise ValueError(
|
|
292
|
-
f"ElevenLabs error: {data['error']}")
|
|
293
|
-
|
|
294
|
-
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
295
|
-
raise ConnectionError(
|
|
296
|
-
f"WebSocket error: {ws_connection.exception()}")
|
|
297
|
-
|
|
298
|
-
elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
|
|
299
|
-
break
|
|
300
|
-
|
|
301
|
-
except asyncio.TimeoutError:
|
|
302
|
-
if not self._should_stop:
|
|
303
|
-
self.emit("error", "WebSocket receive timeout")
|
|
304
|
-
break
|
|
305
|
-
|
|
306
|
-
except Exception as e:
|
|
307
|
-
if not self._should_stop:
|
|
308
|
-
self.emit("error", f"Receive task error: {str(e)}")
|
|
309
|
-
raise
|
|
229
|
+
def _voice_settings_dict(self) -> dict[str, Any]:
|
|
230
|
+
return {
|
|
231
|
+
"stability": self.voice_settings.stability,
|
|
232
|
+
"similarity_boost": self.voice_settings.similarity_boost,
|
|
233
|
+
"style": self.voice_settings.style,
|
|
234
|
+
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
235
|
+
}
|
|
310
236
|
|
|
311
237
|
async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
|
|
312
238
|
if not audio_bytes or self._should_stop:
|
|
@@ -326,12 +252,7 @@ class ElevenLabsTTS(TTS):
|
|
|
326
252
|
if self.audio_track:
|
|
327
253
|
self.audio_track.interrupt()
|
|
328
254
|
|
|
329
|
-
|
|
330
|
-
if task and not task.done():
|
|
331
|
-
task.cancel()
|
|
332
|
-
|
|
333
|
-
if self._ws_connection and not self._ws_connection.closed:
|
|
334
|
-
await self._ws_connection.close()
|
|
255
|
+
await self.close_all_contexts()
|
|
335
256
|
|
|
336
257
|
async def aclose(self) -> None:
|
|
337
258
|
"""Cleanup resources"""
|
|
@@ -350,9 +271,132 @@ class ElevenLabsTTS(TTS):
|
|
|
350
271
|
self._streams.clear()
|
|
351
272
|
|
|
352
273
|
if self._ws_connection and not self._ws_connection.closed:
|
|
274
|
+
try:
|
|
275
|
+
await self._ws_connection.send_str(json.dumps({"close_socket": True}))
|
|
276
|
+
except Exception:
|
|
277
|
+
pass
|
|
353
278
|
await self._ws_connection.close()
|
|
354
|
-
if self._ws_session:
|
|
279
|
+
if self._ws_session and not self._ws_session.closed:
|
|
355
280
|
await self._ws_session.close()
|
|
281
|
+
self._ws_connection = None
|
|
282
|
+
self._ws_session = None
|
|
356
283
|
if self._session:
|
|
357
284
|
await self._session.aclose()
|
|
358
285
|
await super().aclose()
|
|
286
|
+
|
|
287
|
+
async def _ensure_connection(self, voice_id: str) -> None:
|
|
288
|
+
async with self._connection_lock:
|
|
289
|
+
if self._ws_connection and not self._ws_connection.closed and self._ws_voice_id == voice_id:
|
|
290
|
+
return
|
|
291
|
+
|
|
292
|
+
if self._ws_connection and not self._ws_connection.closed:
|
|
293
|
+
try:
|
|
294
|
+
await self._ws_connection.send_str(json.dumps({"close_socket": True}))
|
|
295
|
+
except Exception:
|
|
296
|
+
pass
|
|
297
|
+
await self._ws_connection.close()
|
|
298
|
+
if self._ws_session and not self._ws_session.closed:
|
|
299
|
+
await self._ws_session.close()
|
|
300
|
+
|
|
301
|
+
self._ws_session = aiohttp.ClientSession()
|
|
302
|
+
self._ws_voice_id = voice_id
|
|
303
|
+
|
|
304
|
+
ws_url = f"{self.base_url}/text-to-speech/{voice_id}/multi-stream-input".replace("https://", "wss://").replace("http://", "ws://")
|
|
305
|
+
params = {
|
|
306
|
+
"model_id": self.model,
|
|
307
|
+
"output_format": self.response_format,
|
|
308
|
+
"inactivity_timeout": self.inactivity_timeout,
|
|
309
|
+
}
|
|
310
|
+
param_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
311
|
+
full_ws_url = f"{ws_url}?{param_string}"
|
|
312
|
+
headers = {"xi-api-key": self.api_key}
|
|
313
|
+
self._ws_connection = await asyncio.wait_for(self._ws_session.ws_connect(full_ws_url, headers=headers), timeout=10.0)
|
|
314
|
+
|
|
315
|
+
if self._recv_task and not self._recv_task.done():
|
|
316
|
+
self._recv_task.cancel()
|
|
317
|
+
self._recv_task = asyncio.create_task(self._recv_loop())
|
|
318
|
+
|
|
319
|
+
def register_context(self, context_id: str, done_future: asyncio.Future[None]) -> None:
|
|
320
|
+
self._context_futures[context_id] = done_future
|
|
321
|
+
|
|
322
|
+
async def send_text(
|
|
323
|
+
self,
|
|
324
|
+
context_id: str,
|
|
325
|
+
text: str,
|
|
326
|
+
*,
|
|
327
|
+
voice_settings: Optional[dict[str, Any]] = None,
|
|
328
|
+
flush: bool = False,
|
|
329
|
+
) -> None:
|
|
330
|
+
if not self._ws_connection or self._ws_connection.closed:
|
|
331
|
+
raise RuntimeError("WebSocket connection is closed")
|
|
332
|
+
|
|
333
|
+
if context_id not in self._active_contexts:
|
|
334
|
+
init_msg = {
|
|
335
|
+
"context_id": context_id,
|
|
336
|
+
"text": " ",
|
|
337
|
+
}
|
|
338
|
+
if voice_settings:
|
|
339
|
+
init_msg["voice_settings"] = voice_settings
|
|
340
|
+
await self._ws_connection.send_str(json.dumps(init_msg))
|
|
341
|
+
self._active_contexts.add(context_id)
|
|
342
|
+
|
|
343
|
+
pkt: dict[str, Any] = {"context_id": context_id, "text": text}
|
|
344
|
+
if flush:
|
|
345
|
+
pkt["flush"] = True
|
|
346
|
+
await self._ws_connection.send_str(json.dumps(pkt))
|
|
347
|
+
|
|
348
|
+
async def flush_context(self, context_id: str) -> None:
|
|
349
|
+
if not self._ws_connection or self._ws_connection.closed:
|
|
350
|
+
return
|
|
351
|
+
await self._ws_connection.send_str(json.dumps({"context_id": context_id, "flush": True}))
|
|
352
|
+
|
|
353
|
+
async def close_context(self, context_id: str) -> None:
|
|
354
|
+
if not self._ws_connection or self._ws_connection.closed:
|
|
355
|
+
return
|
|
356
|
+
await self._ws_connection.send_str(json.dumps({"context_id": context_id, "close_context": True}))
|
|
357
|
+
|
|
358
|
+
async def close_all_contexts(self) -> None:
|
|
359
|
+
try:
|
|
360
|
+
for context_id in list(self._active_contexts):
|
|
361
|
+
await self.close_context(context_id)
|
|
362
|
+
except Exception:
|
|
363
|
+
pass
|
|
364
|
+
|
|
365
|
+
async def _recv_loop(self) -> None:
|
|
366
|
+
try:
|
|
367
|
+
while self._ws_connection and not self._ws_connection.closed:
|
|
368
|
+
msg = await self._ws_connection.receive()
|
|
369
|
+
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
370
|
+
data = json.loads(msg.data)
|
|
371
|
+
|
|
372
|
+
if data.get("error"):
|
|
373
|
+
ctx_id = data.get("contextId")
|
|
374
|
+
fut = self._context_futures.get(ctx_id)
|
|
375
|
+
if fut and not fut.done():
|
|
376
|
+
fut.set_exception(RuntimeError(data["error"]))
|
|
377
|
+
continue
|
|
378
|
+
|
|
379
|
+
if data.get("audio"):
|
|
380
|
+
audio_chunk = base64.b64decode(data["audio"]) if isinstance(data["audio"], str) else None
|
|
381
|
+
if audio_chunk:
|
|
382
|
+
if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
|
|
383
|
+
self._first_chunk_sent = True
|
|
384
|
+
asyncio.create_task(self._first_audio_callback())
|
|
385
|
+
if self.audio_track:
|
|
386
|
+
await self.audio_track.add_new_bytes(audio_chunk)
|
|
387
|
+
|
|
388
|
+
if data.get("is_final") or data.get("isFinal"):
|
|
389
|
+
ctx_id = data.get("contextId")
|
|
390
|
+
if ctx_id:
|
|
391
|
+
fut = self._context_futures.pop(ctx_id, None)
|
|
392
|
+
self._active_contexts.discard(ctx_id)
|
|
393
|
+
if fut and not fut.done():
|
|
394
|
+
fut.set_result(None)
|
|
395
|
+
|
|
396
|
+
elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
|
|
397
|
+
break
|
|
398
|
+
except Exception:
|
|
399
|
+
for fut in self._context_futures.values():
|
|
400
|
+
if not fut.done():
|
|
401
|
+
fut.set_exception(RuntimeError("WebSocket receive loop error"))
|
|
402
|
+
self._context_futures.clear()
|
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.0.34"
|
|
1
|
+
__version__ = "0.0.36"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videosdk-plugins-elevenlabs
|
|
3
|
-
Version: 0.0.34
|
|
3
|
+
Version: 0.0.36
|
|
4
4
|
Summary: VideoSDK Agent Framework plugin for ElevenLabs
|
|
5
5
|
Author: videosdk
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
12
12
|
Classifier: Topic :: Multimedia :: Video
|
|
13
13
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
14
|
Requires-Python: >=3.11
|
|
15
|
-
Requires-Dist: videosdk-agents>=0.0.
|
|
15
|
+
Requires-Dist: videosdk-agents>=0.0.36
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
|
|
18
18
|
# VideoSDK ElevenLabs Plugin
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
|
|
2
|
+
videosdk/plugins/elevenlabs/tts.py,sha256=LWn5AG3lssQ1zxWfJ1GLDFZi1cCGO2FKmxy20gcm3dQ,16033
|
|
3
|
+
videosdk/plugins/elevenlabs/version.py,sha256=oSKhQHo_8dYEVv3A19nCmQysoh4TbOzHl508xX9iHoo,23
|
|
4
|
+
videosdk_plugins_elevenlabs-0.0.36.dist-info/METADATA,sha256=MafBViAJQHvThJO7WxQYRYnQIVqQP-7oE3FdGooZblU,779
|
|
5
|
+
videosdk_plugins_elevenlabs-0.0.36.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
+
videosdk_plugins_elevenlabs-0.0.36.dist-info/RECORD,,
|
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
|
|
2
|
-
videosdk/plugins/elevenlabs/tts.py,sha256=SNaR_5PmElWBaJTsGX51zSijEJ19puIOordLOuF_uNM,13332
|
|
3
|
-
videosdk/plugins/elevenlabs/version.py,sha256=IeOAFvenopBWYv9ajaYlgGYiPXMiGT008qnpbTQ0HPU,23
|
|
4
|
-
videosdk_plugins_elevenlabs-0.0.34.dist-info/METADATA,sha256=AHYFz0ObxyDXOskdoLU9c7tP_6-u61x9KvpbOVwGqwQ,779
|
|
5
|
-
videosdk_plugins_elevenlabs-0.0.34.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
6
|
-
videosdk_plugins_elevenlabs-0.0.34.dist-info/RECORD,,
|
{videosdk_plugins_elevenlabs-0.0.34.dist-info → videosdk_plugins_elevenlabs-0.0.36.dist-info}/WHEEL
RENAMED
|
File without changes
|