videosdk-plugins-elevenlabs 0.0.34__tar.gz → 0.0.35__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videosdk-plugins-elevenlabs
3
- Version: 0.0.34
3
+ Version: 0.0.35
4
4
  Summary: VideoSDK Agent Framework plugin for ElevenLabs
5
5
  Author: videosdk
6
6
  License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
12
12
  Classifier: Topic :: Multimedia :: Video
13
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
14
  Requires-Python: >=3.11
15
- Requires-Dist: videosdk-agents>=0.0.34
15
+ Requires-Dist: videosdk-agents>=0.0.35
16
16
  Description-Content-Type: text/markdown
17
17
 
18
18
  # VideoSDK ElevenLabs Plugin
@@ -20,7 +20,7 @@ classifiers = [
20
20
  "Topic :: Multimedia :: Video",
21
21
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
22
  ]
23
- dependencies = ["videosdk-agents>=0.0.34"]
23
+ dependencies = ["videosdk-agents>=0.0.35"]
24
24
 
25
25
  [tool.hatch.version]
26
26
  path = "videosdk/plugins/elevenlabs/version.py"
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, AsyncIterator, Literal, Optional, Union
3
+ from typing import Any, AsyncIterator, Optional, Union
4
4
  import os
5
5
  import httpx
6
6
  import asyncio
@@ -9,6 +9,8 @@ import aiohttp
9
9
  import weakref
10
10
  from dataclasses import dataclass
11
11
  from videosdk.agents import TTS, segment_text
12
+ import base64
13
+ import uuid
12
14
 
13
15
  ELEVENLABS_SAMPLE_RATE = 24000
14
16
  ELEVENLABS_CHANNELS = 1
@@ -87,6 +89,11 @@ class ElevenLabsTTS(TTS):
87
89
  self._recv_task: asyncio.Task | None = None
88
90
  self._should_stop = False
89
91
 
92
+ self._connection_lock = asyncio.Lock()
93
+ self._ws_voice_id: str | None = None
94
+ self._active_contexts: set[str] = set()
95
+ self._context_futures: dict[str, asyncio.Future[None]] = {}
96
+
90
97
  def reset_first_audio_tracking(self) -> None:
91
98
  """Reset the first audio tracking state for next TTS task"""
92
99
  self._first_chunk_sent = False
@@ -166,46 +173,47 @@ class ElevenLabsTTS(TTS):
166
173
  self.emit("error", f"Chunked synthesis failed: {str(e)}")
167
174
 
168
175
  async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
169
- """WebSocket-based streaming synthesis"""
170
-
171
- ws_session = None
172
- ws_connection = None
173
-
176
+ """WebSocket-based streaming synthesis using multi-context connection"""
174
177
  try:
175
- ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
176
- params = {
177
- "model_id": self.model,
178
- "output_format": self.response_format,
179
- "inactivity_timeout": self.inactivity_timeout,
180
- }
181
- param_string = "&".join([f"{k}={v}" for k, v in params.items()])
182
- full_ws_url = f"{ws_url}?{param_string}"
178
+ await self._ensure_connection(voice_id)
183
179
 
184
- headers = {"xi-api-key": self.api_key}
180
+ context_id = uuid.uuid4().hex[:12]
181
+ done_future: asyncio.Future[None] = asyncio.get_event_loop().create_future()
182
+ self.register_context(context_id, done_future)
185
183
 
186
- ws_session = aiohttp.ClientSession()
187
- ws_connection = await asyncio.wait_for(
188
- ws_session.ws_connect(full_ws_url, headers=headers),
189
- timeout=10.0
190
- )
184
+ async def _single_chunk_gen(s: str) -> AsyncIterator[str]:
185
+ yield s
191
186
 
192
- init_message = {
193
- "text": " ",
194
- "voice_settings": {
195
- "stability": self.voice_settings.stability,
196
- "similarity_boost": self.voice_settings.similarity_boost,
197
- "style": self.voice_settings.style,
198
- "use_speaker_boost": self.voice_settings.use_speaker_boost,
199
- },
200
- }
201
- await ws_connection.send_str(json.dumps(init_message))
187
+ async def _send_chunks() -> None:
188
+ try:
189
+ first_message_sent = False
190
+ if isinstance(text, str):
191
+ async for segment in segment_text(_single_chunk_gen(text)):
192
+ if self._should_stop:
193
+ break
194
+ await self.send_text(context_id, f"{segment} ",
195
+ voice_settings=None if first_message_sent else self._voice_settings_dict(),
196
+ flush=True)
197
+ first_message_sent = True
198
+ else:
199
+ async for chunk in text:
200
+ if self._should_stop:
201
+ break
202
+ await self.send_text(context_id, f"{chunk} ",
203
+ voice_settings=None if first_message_sent else self._voice_settings_dict())
204
+ first_message_sent = True
205
+
206
+ if not self._should_stop:
207
+ await self.flush_context(context_id)
208
+ await self.close_context(context_id)
209
+ except Exception as e:
210
+ if not done_future.done():
211
+ done_future.set_exception(e)
202
212
 
203
- self._send_task = asyncio.create_task(
204
- self._send_text_task(ws_connection, text))
205
- self._recv_task = asyncio.create_task(
206
- self._receive_audio_task(ws_connection))
213
+ sender = asyncio.create_task(_send_chunks())
207
214
 
208
- await asyncio.gather(self._send_task, self._recv_task)
215
+ await done_future
216
+ await sender
209
217
 
210
218
  except Exception as e:
211
219
  self.emit("error", f"Streaming synthesis failed: {str(e)}")
@@ -218,95 +226,13 @@ class ElevenLabsTTS(TTS):
218
226
  break
219
227
  await self._chunked_synthesis(segment, voice_id)
220
228
 
221
- finally:
222
- for task in [self._send_task, self._recv_task]:
223
- if task and not task.done():
224
- task.cancel()
225
-
226
- try:
227
- await asyncio.wait_for(
228
- asyncio.gather(
229
- *(t for t in [self._send_task, self._recv_task] if t),
230
- return_exceptions=True
231
- ),
232
- timeout=0.3
233
- )
234
- except asyncio.TimeoutError:
235
- pass
236
-
237
- self._send_task = None
238
- self._recv_task = None
239
-
240
- if ws_connection and not ws_connection.closed:
241
- await ws_connection.close()
242
- if ws_session and not ws_session.closed:
243
- await ws_session.close()
244
-
245
- async def _send_text_task(self, ws_connection: aiohttp.ClientWebSocketResponse, text: Union[AsyncIterator[str], str]) -> None:
246
- """Task for sending text to WebSocket"""
247
- try:
248
- if isinstance(text, str):
249
- if not self._should_stop:
250
- text_message = {"text": f"{text} "}
251
- await ws_connection.send_str(json.dumps(text_message))
252
- else:
253
- async for chunk in text:
254
- if ws_connection.closed or self._should_stop:
255
- break
256
-
257
- chunk_message = {"text": f"{chunk} "}
258
- await ws_connection.send_str(json.dumps(chunk_message))
259
-
260
- if not ws_connection.closed and not self._should_stop:
261
- eos_message = {"text": ""}
262
- await ws_connection.send_str(json.dumps(eos_message))
263
-
264
- except Exception as e:
265
- if not self._should_stop:
266
- self.emit("error", f"Send task error: {str(e)}")
267
- raise
268
-
269
- async def _receive_audio_task(self, ws_connection: aiohttp.ClientWebSocketResponse) -> None:
270
- """Task for receiving audio from WebSocket"""
271
- try:
272
- while not ws_connection.closed and not self._should_stop:
273
- try:
274
- msg = await ws_connection.receive()
275
-
276
- if msg.type == aiohttp.WSMsgType.TEXT:
277
- data = json.loads(msg.data)
278
-
279
- if data.get("audio"):
280
- import base64
281
- audio_chunk = base64.b64decode(data["audio"])
282
- if not self._should_stop:
283
- await self._stream_audio_chunks(audio_chunk)
284
-
285
- elif data.get("isFinal"):
286
- break
287
-
288
- elif data.get("error"):
289
- self.emit(
290
- "error", f"ElevenLabs error: {data['error']}")
291
- raise ValueError(
292
- f"ElevenLabs error: {data['error']}")
293
-
294
- elif msg.type == aiohttp.WSMsgType.ERROR:
295
- raise ConnectionError(
296
- f"WebSocket error: {ws_connection.exception()}")
297
-
298
- elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
299
- break
300
-
301
- except asyncio.TimeoutError:
302
- if not self._should_stop:
303
- self.emit("error", "WebSocket receive timeout")
304
- break
305
-
306
- except Exception as e:
307
- if not self._should_stop:
308
- self.emit("error", f"Receive task error: {str(e)}")
309
- raise
229
+ def _voice_settings_dict(self) -> dict[str, Any]:
230
+ return {
231
+ "stability": self.voice_settings.stability,
232
+ "similarity_boost": self.voice_settings.similarity_boost,
233
+ "style": self.voice_settings.style,
234
+ "use_speaker_boost": self.voice_settings.use_speaker_boost,
235
+ }
310
236
 
311
237
  async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
312
238
  if not audio_bytes or self._should_stop:
@@ -326,12 +252,7 @@ class ElevenLabsTTS(TTS):
326
252
  if self.audio_track:
327
253
  self.audio_track.interrupt()
328
254
 
329
- for task in [self._send_task, self._recv_task]:
330
- if task and not task.done():
331
- task.cancel()
332
-
333
- if self._ws_connection and not self._ws_connection.closed:
334
- await self._ws_connection.close()
255
+ await self.close_all_contexts()
335
256
 
336
257
  async def aclose(self) -> None:
337
258
  """Cleanup resources"""
@@ -350,9 +271,132 @@ class ElevenLabsTTS(TTS):
350
271
  self._streams.clear()
351
272
 
352
273
  if self._ws_connection and not self._ws_connection.closed:
274
+ try:
275
+ await self._ws_connection.send_str(json.dumps({"close_socket": True}))
276
+ except Exception:
277
+ pass
353
278
  await self._ws_connection.close()
354
- if self._ws_session:
279
+ if self._ws_session and not self._ws_session.closed:
355
280
  await self._ws_session.close()
281
+ self._ws_connection = None
282
+ self._ws_session = None
356
283
  if self._session:
357
284
  await self._session.aclose()
358
285
  await super().aclose()
286
+
287
+ async def _ensure_connection(self, voice_id: str) -> None:
288
+ async with self._connection_lock:
289
+ if self._ws_connection and not self._ws_connection.closed and self._ws_voice_id == voice_id:
290
+ return
291
+
292
+ if self._ws_connection and not self._ws_connection.closed:
293
+ try:
294
+ await self._ws_connection.send_str(json.dumps({"close_socket": True}))
295
+ except Exception:
296
+ pass
297
+ await self._ws_connection.close()
298
+ if self._ws_session and not self._ws_session.closed:
299
+ await self._ws_session.close()
300
+
301
+ self._ws_session = aiohttp.ClientSession()
302
+ self._ws_voice_id = voice_id
303
+
304
+ ws_url = f"{self.base_url}/text-to-speech/{voice_id}/multi-stream-input".replace("https://", "wss://").replace("http://", "ws://")
305
+ params = {
306
+ "model_id": self.model,
307
+ "output_format": self.response_format,
308
+ "inactivity_timeout": self.inactivity_timeout,
309
+ }
310
+ param_string = "&".join([f"{k}={v}" for k, v in params.items()])
311
+ full_ws_url = f"{ws_url}?{param_string}"
312
+ headers = {"xi-api-key": self.api_key}
313
+ self._ws_connection = await asyncio.wait_for(self._ws_session.ws_connect(full_ws_url, headers=headers), timeout=10.0)
314
+
315
+ if self._recv_task and not self._recv_task.done():
316
+ self._recv_task.cancel()
317
+ self._recv_task = asyncio.create_task(self._recv_loop())
318
+
319
+ def register_context(self, context_id: str, done_future: asyncio.Future[None]) -> None:
320
+ self._context_futures[context_id] = done_future
321
+
322
+ async def send_text(
323
+ self,
324
+ context_id: str,
325
+ text: str,
326
+ *,
327
+ voice_settings: Optional[dict[str, Any]] = None,
328
+ flush: bool = False,
329
+ ) -> None:
330
+ if not self._ws_connection or self._ws_connection.closed:
331
+ raise RuntimeError("WebSocket connection is closed")
332
+
333
+ if context_id not in self._active_contexts:
334
+ init_msg = {
335
+ "context_id": context_id,
336
+ "text": " ",
337
+ }
338
+ if voice_settings:
339
+ init_msg["voice_settings"] = voice_settings
340
+ await self._ws_connection.send_str(json.dumps(init_msg))
341
+ self._active_contexts.add(context_id)
342
+
343
+ pkt: dict[str, Any] = {"context_id": context_id, "text": text}
344
+ if flush:
345
+ pkt["flush"] = True
346
+ await self._ws_connection.send_str(json.dumps(pkt))
347
+
348
+ async def flush_context(self, context_id: str) -> None:
349
+ if not self._ws_connection or self._ws_connection.closed:
350
+ return
351
+ await self._ws_connection.send_str(json.dumps({"context_id": context_id, "flush": True}))
352
+
353
+ async def close_context(self, context_id: str) -> None:
354
+ if not self._ws_connection or self._ws_connection.closed:
355
+ return
356
+ await self._ws_connection.send_str(json.dumps({"context_id": context_id, "close_context": True}))
357
+
358
+ async def close_all_contexts(self) -> None:
359
+ try:
360
+ for context_id in list(self._active_contexts):
361
+ await self.close_context(context_id)
362
+ except Exception:
363
+ pass
364
+
365
+ async def _recv_loop(self) -> None:
366
+ try:
367
+ while self._ws_connection and not self._ws_connection.closed:
368
+ msg = await self._ws_connection.receive()
369
+ if msg.type == aiohttp.WSMsgType.TEXT:
370
+ data = json.loads(msg.data)
371
+
372
+ if data.get("error"):
373
+ ctx_id = data.get("contextId")
374
+ fut = self._context_futures.get(ctx_id)
375
+ if fut and not fut.done():
376
+ fut.set_exception(RuntimeError(data["error"]))
377
+ continue
378
+
379
+ if data.get("audio"):
380
+ audio_chunk = base64.b64decode(data["audio"]) if isinstance(data["audio"], str) else None
381
+ if audio_chunk:
382
+ if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
383
+ self._first_chunk_sent = True
384
+ asyncio.create_task(self._first_audio_callback())
385
+ if self.audio_track:
386
+ await self.audio_track.add_new_bytes(audio_chunk)
387
+
388
+ if data.get("is_final") or data.get("isFinal"):
389
+ ctx_id = data.get("contextId")
390
+ if ctx_id:
391
+ fut = self._context_futures.pop(ctx_id, None)
392
+ self._active_contexts.discard(ctx_id)
393
+ if fut and not fut.done():
394
+ fut.set_result(None)
395
+
396
+ elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
397
+ break
398
+ except Exception:
399
+ for fut in self._context_futures.values():
400
+ if not fut.done():
401
+ fut.set_exception(RuntimeError("WebSocket receive loop error"))
402
+ self._context_futures.clear()
@@ -0,0 +1 @@
1
+ __version__ = "0.0.35"
@@ -1 +0,0 @@
1
- __version__ = "0.0.34"