videosdk-plugins-elevenlabs 0.0.26__py3-none-any.whl → 0.0.28__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.

@@ -6,9 +6,9 @@ import httpx
6
6
  import asyncio
7
7
  import json
8
8
  import aiohttp
9
+ import weakref
9
10
  from dataclasses import dataclass
10
-
11
- from videosdk.agents import TTS
11
+ from videosdk.agents import TTS, segment_text
12
12
 
13
13
  ELEVENLABS_SAMPLE_RATE = 24000
14
14
  ELEVENLABS_CHANNELS = 1
@@ -16,6 +16,7 @@ ELEVENLABS_CHANNELS = 1
16
16
  DEFAULT_MODEL = "eleven_flash_v2_5"
17
17
  DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
18
18
  API_BASE_URL = "https://api.elevenlabs.io/v1"
19
+ WS_INACTIVITY_TIMEOUT = 300
19
20
 
20
21
 
21
22
  @dataclass
@@ -38,8 +39,11 @@ class ElevenLabsTTS(TTS):
38
39
  voice_settings: VoiceSettings | None = None,
39
40
  base_url: str = API_BASE_URL,
40
41
  enable_streaming: bool = True,
42
+ inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
41
43
  ) -> None:
42
- super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)
44
+ super().__init__(
45
+ sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS
46
+ )
43
47
 
44
48
  self.model = model
45
49
  self.voice = voice
@@ -50,55 +54,30 @@ class ElevenLabsTTS(TTS):
50
54
  self.base_url = base_url
51
55
  self.enable_streaming = enable_streaming
52
56
  self.voice_settings = voice_settings or VoiceSettings()
57
+ self.inactivity_timeout = inactivity_timeout
53
58
  self._first_chunk_sent = False
54
59
  self._ws_session = None
55
60
  self._ws_connection = None
56
-
57
61
  self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
58
62
  if not self.api_key:
59
- raise ValueError("ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
63
+ raise ValueError(
64
+ "ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
60
65
 
61
66
  self._session = httpx.AsyncClient(
62
- timeout=httpx.Timeout(connect=15.0, read=30.0, write=5.0, pool=5.0),
67
+ timeout=httpx.Timeout(connect=15.0, read=30.0,
68
+ write=5.0, pool=5.0),
63
69
  follow_redirects=True,
64
70
  )
65
71
 
72
+ self._streams = weakref.WeakSet()
73
+ self._send_task: asyncio.Task | None = None
74
+ self._recv_task: asyncio.Task | None = None
75
+ self._should_stop = False
76
+
66
77
  def reset_first_audio_tracking(self) -> None:
67
78
  """Reset the first audio tracking state for next TTS task"""
68
79
  self._first_chunk_sent = False
69
80
 
70
- async def _ensure_ws_connection(self, voice_id: str) -> aiohttp.ClientWebSocketResponse:
71
- """Ensure WebSocket connection is established and return it"""
72
- if self._ws_connection is None or self._ws_connection.closed:
73
-
74
- ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
75
-
76
- params = {
77
- "model_id": self.model,
78
- "output_format": self.response_format,
79
- }
80
-
81
- param_string = "&".join([f"{k}={v}" for k, v in params.items()])
82
- full_ws_url = f"{ws_url}?{param_string}"
83
-
84
- headers = {"xi-api-key": self.api_key}
85
-
86
- self._ws_session = aiohttp.ClientSession()
87
- self._ws_connection = await self._ws_session.ws_connect(full_ws_url, headers=headers)
88
-
89
- init_message = {
90
- "text": " ",
91
- "voice_settings": {
92
- "stability": self.voice_settings.stability,
93
- "similarity_boost": self.voice_settings.similarity_boost,
94
- "style": self.voice_settings.style,
95
- "use_speaker_boost": self.voice_settings.use_speaker_boost,
96
- },
97
- }
98
- await self._ws_connection.send_str(json.dumps(init_message))
99
-
100
- return self._ws_connection
101
-
102
81
  async def synthesize(
103
82
  self,
104
83
  text: AsyncIterator[str] | str,
@@ -106,23 +85,23 @@ class ElevenLabsTTS(TTS):
106
85
  **kwargs: Any,
107
86
  ) -> None:
108
87
  try:
109
- if isinstance(text, AsyncIterator):
110
- full_text = ""
111
- async for chunk in text:
112
- full_text += chunk
113
- else:
114
- full_text = text
115
-
116
88
  if not self.audio_track or not self.loop:
117
89
  self.emit("error", "Audio track or event loop not set")
118
90
  return
119
91
 
120
92
  target_voice = voice_id or self.voice
93
+ self._should_stop = False
121
94
 
122
95
  if self.enable_streaming:
123
- await self._stream_synthesis(full_text, target_voice)
96
+ await self._stream_synthesis(text, target_voice)
124
97
  else:
125
- await self._chunked_synthesis(full_text, target_voice)
98
+ if isinstance(text, AsyncIterator):
99
+ async for segment in segment_text(text):
100
+ if self._should_stop:
101
+ break
102
+ await self._chunked_synthesis(segment, target_voice)
103
+ else:
104
+ await self._chunked_synthesis(text, target_voice)
126
105
 
127
106
  except Exception as e:
128
107
  self.emit("error", f"TTS synthesis failed: {str(e)}")
@@ -130,17 +109,17 @@ class ElevenLabsTTS(TTS):
130
109
  async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
131
110
  """Non-streaming synthesis using the standard API"""
132
111
  url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
133
-
112
+
134
113
  params = {
135
114
  "model_id": self.model,
136
115
  "output_format": self.response_format,
137
116
  }
138
-
117
+
139
118
  headers = {
140
119
  "xi-api-key": self.api_key,
141
120
  "Content-Type": "application/json",
142
121
  }
143
-
122
+
144
123
  payload = {
145
124
  "text": text,
146
125
  "voice_settings": {
@@ -153,94 +132,214 @@ class ElevenLabsTTS(TTS):
153
132
 
154
133
  try:
155
134
  async with self._session.stream(
156
- "POST",
157
- url,
158
- headers=headers,
135
+ "POST",
136
+ url,
137
+ headers=headers,
159
138
  json=payload,
160
139
  params=params
161
140
  ) as response:
162
141
  response.raise_for_status()
163
-
164
- audio_data = b""
142
+
165
143
  async for chunk in response.aiter_bytes():
144
+ if self._should_stop:
145
+ break
166
146
  if chunk:
167
- audio_data += chunk
147
+ await self._stream_audio_chunks(chunk)
168
148
 
169
- if audio_data:
170
- await self._stream_audio_chunks(audio_data)
171
-
172
149
  except httpx.HTTPStatusError as e:
173
- self.emit("error", f"HTTP error {e.response.status_code}: {e.response.text}")
150
+ self.emit(
151
+ "error", f"HTTP error {e.response.status_code}: {e.response.text}")
174
152
  except Exception as e:
175
153
  self.emit("error", f"Chunked synthesis failed: {str(e)}")
176
154
 
177
- async def _stream_synthesis(self, text: str, voice_id: str) -> None:
155
+ async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
178
156
  """WebSocket-based streaming synthesis"""
179
-
157
+
158
+ ws_session = None
159
+ ws_connection = None
160
+
161
+ try:
162
+ ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
163
+ params = {
164
+ "model_id": self.model,
165
+ "output_format": self.response_format,
166
+ "inactivity_timeout": self.inactivity_timeout,
167
+ }
168
+ param_string = "&".join([f"{k}={v}" for k, v in params.items()])
169
+ full_ws_url = f"{ws_url}?{param_string}"
170
+
171
+ headers = {"xi-api-key": self.api_key}
172
+
173
+ ws_session = aiohttp.ClientSession()
174
+ ws_connection = await asyncio.wait_for(
175
+ ws_session.ws_connect(full_ws_url, headers=headers),
176
+ timeout=10.0
177
+ )
178
+
179
+ init_message = {
180
+ "text": " ",
181
+ "voice_settings": {
182
+ "stability": self.voice_settings.stability,
183
+ "similarity_boost": self.voice_settings.similarity_boost,
184
+ "style": self.voice_settings.style,
185
+ "use_speaker_boost": self.voice_settings.use_speaker_boost,
186
+ },
187
+ }
188
+ await ws_connection.send_str(json.dumps(init_message))
189
+
190
+ self._send_task = asyncio.create_task(
191
+ self._send_text_task(ws_connection, text))
192
+ self._recv_task = asyncio.create_task(
193
+ self._receive_audio_task(ws_connection))
194
+
195
+ await asyncio.gather(self._send_task, self._recv_task)
196
+
197
+ except Exception as e:
198
+ self.emit("error", f"Streaming synthesis failed: {str(e)}")
199
+
200
+ if isinstance(text, str):
201
+ await self._chunked_synthesis(text, voice_id)
202
+ else:
203
+ async for segment in segment_text(text):
204
+ if self._should_stop:
205
+ break
206
+ await self._chunked_synthesis(segment, voice_id)
207
+
208
+ finally:
209
+ for task in [self._send_task, self._recv_task]:
210
+ if task and not task.done():
211
+ task.cancel()
212
+
213
+ try:
214
+ await asyncio.wait_for(
215
+ asyncio.gather(
216
+ *(t for t in [self._send_task, self._recv_task] if t),
217
+ return_exceptions=True
218
+ ),
219
+ timeout=0.3
220
+ )
221
+ except asyncio.TimeoutError:
222
+ pass
223
+
224
+ self._send_task = None
225
+ self._recv_task = None
226
+
227
+ if ws_connection and not ws_connection.closed:
228
+ await ws_connection.close()
229
+ if ws_session and not ws_session.closed:
230
+ await ws_session.close()
231
+
232
+ async def _send_text_task(self, ws_connection: aiohttp.ClientWebSocketResponse, text: Union[AsyncIterator[str], str]) -> None:
233
+ """Task for sending text to WebSocket"""
180
234
  try:
181
- ws = await self._ensure_ws_connection(voice_id)
182
-
183
- # Send text message
184
- text_message = {"text": f"{text} "}
185
- await ws.send_str(json.dumps(text_message))
186
-
187
- # Send end-of-stream message
188
- eos_message = {"text": ""}
189
- await ws.send_str(json.dumps(eos_message))
190
-
191
- audio_data = b""
192
- async for msg in ws:
193
- if msg.type == aiohttp.WSMsgType.TEXT:
194
- data = json.loads(msg.data)
195
- if data.get("audio"):
196
- import base64
197
- audio_chunk = base64.b64decode(data["audio"])
198
- audio_data += audio_chunk
199
- elif data.get("isFinal"):
235
+ if isinstance(text, str):
236
+ if not self._should_stop:
237
+ text_message = {"text": f"{text} "}
238
+ await ws_connection.send_str(json.dumps(text_message))
239
+ else:
240
+ async for chunk in text:
241
+ if ws_connection.closed or self._should_stop:
200
242
  break
201
- elif data.get("error"):
202
- self.emit("error", f"WebSocket error: {data['error']}")
243
+
244
+ chunk_message = {"text": f"{chunk} "}
245
+ await ws_connection.send_str(json.dumps(chunk_message))
246
+
247
+ if not ws_connection.closed and not self._should_stop:
248
+ eos_message = {"text": ""}
249
+ await ws_connection.send_str(json.dumps(eos_message))
250
+
251
+ except Exception as e:
252
+ if not self._should_stop:
253
+ self.emit("error", f"Send task error: {str(e)}")
254
+ raise
255
+
256
+ async def _receive_audio_task(self, ws_connection: aiohttp.ClientWebSocketResponse) -> None:
257
+ """Task for receiving audio from WebSocket"""
258
+ try:
259
+ while not ws_connection.closed and not self._should_stop:
260
+ try:
261
+ msg = await ws_connection.receive()
262
+
263
+ if msg.type == aiohttp.WSMsgType.TEXT:
264
+ data = json.loads(msg.data)
265
+
266
+ if data.get("audio"):
267
+ import base64
268
+ audio_chunk = base64.b64decode(data["audio"])
269
+ if not self._should_stop:
270
+ await self._stream_audio_chunks(audio_chunk)
271
+
272
+ elif data.get("isFinal"):
273
+ break
274
+
275
+ elif data.get("error"):
276
+ self.emit(
277
+ "error", f"ElevenLabs error: {data['error']}")
278
+ raise ValueError(
279
+ f"ElevenLabs error: {data['error']}")
280
+
281
+ elif msg.type == aiohttp.WSMsgType.ERROR:
282
+ raise ConnectionError(
283
+ f"WebSocket error: {ws_connection.exception()}")
284
+
285
+ elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
203
286
  break
204
- elif msg.type == aiohttp.WSMsgType.ERROR:
205
- self.emit("error", f"WebSocket connection error: {ws.exception()}")
287
+
288
+ except asyncio.TimeoutError:
289
+ if not self._should_stop:
290
+ self.emit("error", "WebSocket receive timeout")
206
291
  break
207
292
 
208
- if audio_data:
209
- await self._stream_audio_chunks(audio_data)
210
-
211
293
  except Exception as e:
212
- self.emit("error", f"Streaming synthesis failed: {str(e)}")
294
+ if not self._should_stop:
295
+ self.emit("error", f"Receive task error: {str(e)}")
296
+ raise
213
297
 
214
298
  async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
215
- """Stream audio data in chunks for smooth playback"""
216
- chunk_size = int(ELEVENLABS_SAMPLE_RATE * ELEVENLABS_CHANNELS * 2 * 20 / 1000)
217
-
218
- for i in range(0, len(audio_bytes), chunk_size):
219
- chunk = audio_bytes[i:i + chunk_size]
220
-
221
- if len(chunk) < chunk_size and len(chunk) > 0:
222
- padding_needed = chunk_size - len(chunk)
223
- chunk += b'\x00' * padding_needed
224
-
225
- if len(chunk) == chunk_size:
226
- if not self._first_chunk_sent and self._first_audio_callback:
227
- self._first_chunk_sent = True
228
- await self._first_audio_callback()
229
-
230
- self.loop.create_task(self.audio_track.add_new_bytes(chunk))
231
- await asyncio.sleep(0.001)
299
+ if not audio_bytes or self._should_stop:
300
+ return
301
+
302
+ if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
303
+ self._first_chunk_sent = True
304
+ asyncio.create_task(self._first_audio_callback())
305
+
306
+ if self.audio_track and self.loop:
307
+ await self.audio_track.add_new_bytes(audio_bytes)
308
+
309
+ async def interrupt(self) -> None:
310
+ """Simple but effective interruption"""
311
+ self._should_stop = True
312
+
313
+ if self.audio_track:
314
+ self.audio_track.interrupt()
315
+
316
+ for task in [self._send_task, self._recv_task]:
317
+ if task and not task.done():
318
+ task.cancel()
319
+
320
+ if self._ws_connection and not self._ws_connection.closed:
321
+ await self._ws_connection.close()
232
322
 
233
323
  async def aclose(self) -> None:
234
324
  """Cleanup resources"""
235
- if self._ws_connection:
325
+ self._should_stop = True
326
+
327
+ for task in [self._send_task, self._recv_task]:
328
+ if task and not task.done():
329
+ task.cancel()
330
+
331
+ for stream in list(self._streams):
332
+ try:
333
+ await stream.aclose()
334
+ except Exception:
335
+ pass
336
+
337
+ self._streams.clear()
338
+
339
+ if self._ws_connection and not self._ws_connection.closed:
236
340
  await self._ws_connection.close()
237
341
  if self._ws_session:
238
342
  await self._ws_session.close()
239
343
  if self._session:
240
344
  await self._session.aclose()
241
345
  await super().aclose()
242
-
243
- async def interrupt(self) -> None:
244
- """Interrupt the TTS process"""
245
- if self.audio_track:
246
- self.audio_track.interrupt()
@@ -1 +1 @@
1
- __version__ = "0.0.26"
1
+ __version__ = "0.0.28"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videosdk-plugins-elevenlabs
3
- Version: 0.0.26
3
+ Version: 0.0.28
4
4
  Summary: VideoSDK Agent Framework plugin for ElevenLabs
5
5
  Author: videosdk
6
6
  License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
12
12
  Classifier: Topic :: Multimedia :: Video
13
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
14
  Requires-Python: >=3.11
15
- Requires-Dist: videosdk-agents>=0.0.26
15
+ Requires-Dist: videosdk-agents>=0.0.28
16
16
  Description-Content-Type: text/markdown
17
17
 
18
18
  # VideoSDK ElevenLabs Plugin
@@ -0,0 +1,6 @@
1
+ videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
2
+ videosdk/plugins/elevenlabs/tts.py,sha256=pBva4O8TbzIYc-Y35VPh81pJQuXqmbZqHDepMj5OaDU,12280
3
+ videosdk/plugins/elevenlabs/version.py,sha256=OxG64Q6SDQQGNb5ggPOgDkHI0rY-RjCF92VCMUiyhOQ,23
4
+ videosdk_plugins_elevenlabs-0.0.28.dist-info/METADATA,sha256=WRluQXC1ADVzZ4myOfY5X3QP50djkD-ZMDQdI_mF-Vk,779
5
+ videosdk_plugins_elevenlabs-0.0.28.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
+ videosdk_plugins_elevenlabs-0.0.28.dist-info/RECORD,,
@@ -1,6 +0,0 @@
1
- videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
2
- videosdk/plugins/elevenlabs/tts.py,sha256=wxa2qE4Gr9AY1a0uICK4E-nMW-i5mTJIn5it3JLs4u8,8822
3
- videosdk/plugins/elevenlabs/version.py,sha256=z29JBD25DddXLFEuHGkjmgFe-J9BefGzGLo7FXpMNtM,22
4
- videosdk_plugins_elevenlabs-0.0.26.dist-info/METADATA,sha256=zshQ9Vv8pW_IZdm5-GiPaXwXmR21lj3U9KtCyCb2oL4,779
5
- videosdk_plugins_elevenlabs-0.0.26.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- videosdk_plugins_elevenlabs-0.0.26.dist-info/RECORD,,