videosdk-plugins-elevenlabs 0.0.26__tar.gz → 0.0.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videosdk-plugins-elevenlabs
3
- Version: 0.0.26
3
+ Version: 0.0.28
4
4
  Summary: VideoSDK Agent Framework plugin for ElevenLabs
5
5
  Author: videosdk
6
6
  License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
12
12
  Classifier: Topic :: Multimedia :: Video
13
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
14
  Requires-Python: >=3.11
15
- Requires-Dist: videosdk-agents>=0.0.26
15
+ Requires-Dist: videosdk-agents>=0.0.28
16
16
  Description-Content-Type: text/markdown
17
17
 
18
18
  # VideoSDK ElevenLabs Plugin
@@ -21,7 +21,7 @@ classifiers = [
21
21
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
22
  ]
23
23
  dependencies = [
24
- "videosdk-agents>=0.0.26"
24
+ "videosdk-agents>=0.0.28"
25
25
  ]
26
26
 
27
27
  [tool.hatch.version]
@@ -0,0 +1,345 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any, AsyncIterator, Literal, Optional, Union
4
+ import os
5
+ import httpx
6
+ import asyncio
7
+ import json
8
+ import aiohttp
9
+ import weakref
10
+ from dataclasses import dataclass
11
+ from videosdk.agents import TTS, segment_text
12
+
13
+ ELEVENLABS_SAMPLE_RATE = 24000
14
+ ELEVENLABS_CHANNELS = 1
15
+
16
+ DEFAULT_MODEL = "eleven_flash_v2_5"
17
+ DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
18
+ API_BASE_URL = "https://api.elevenlabs.io/v1"
19
+ WS_INACTIVITY_TIMEOUT = 300
20
+
21
+
22
+ @dataclass
23
+ class VoiceSettings:
24
+ stability: float = 0.71
25
+ similarity_boost: float = 0.5
26
+ style: float = 0.0
27
+ use_speaker_boost: bool = True
28
+
29
+
30
+ class ElevenLabsTTS(TTS):
31
+ def __init__(
32
+ self,
33
+ *,
34
+ model: str = DEFAULT_MODEL,
35
+ voice: str = DEFAULT_VOICE_ID,
36
+ speed: float = 1.0,
37
+ api_key: str | None = None,
38
+ response_format: str = "pcm_24000",
39
+ voice_settings: VoiceSettings | None = None,
40
+ base_url: str = API_BASE_URL,
41
+ enable_streaming: bool = True,
42
+ inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
43
+ ) -> None:
44
+ super().__init__(
45
+ sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS
46
+ )
47
+
48
+ self.model = model
49
+ self.voice = voice
50
+ self.speed = speed
51
+ self.audio_track = None
52
+ self.loop = None
53
+ self.response_format = response_format
54
+ self.base_url = base_url
55
+ self.enable_streaming = enable_streaming
56
+ self.voice_settings = voice_settings or VoiceSettings()
57
+ self.inactivity_timeout = inactivity_timeout
58
+ self._first_chunk_sent = False
59
+ self._ws_session = None
60
+ self._ws_connection = None
61
+ self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
62
+ if not self.api_key:
63
+ raise ValueError(
64
+ "ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
65
+
66
+ self._session = httpx.AsyncClient(
67
+ timeout=httpx.Timeout(connect=15.0, read=30.0,
68
+ write=5.0, pool=5.0),
69
+ follow_redirects=True,
70
+ )
71
+
72
+ self._streams = weakref.WeakSet()
73
+ self._send_task: asyncio.Task | None = None
74
+ self._recv_task: asyncio.Task | None = None
75
+ self._should_stop = False
76
+
77
+ def reset_first_audio_tracking(self) -> None:
78
+ """Reset the first audio tracking state for next TTS task"""
79
+ self._first_chunk_sent = False
80
+
81
+ async def synthesize(
82
+ self,
83
+ text: AsyncIterator[str] | str,
84
+ voice_id: Optional[str] = None,
85
+ **kwargs: Any,
86
+ ) -> None:
87
+ try:
88
+ if not self.audio_track or not self.loop:
89
+ self.emit("error", "Audio track or event loop not set")
90
+ return
91
+
92
+ target_voice = voice_id or self.voice
93
+ self._should_stop = False
94
+
95
+ if self.enable_streaming:
96
+ await self._stream_synthesis(text, target_voice)
97
+ else:
98
+ if isinstance(text, AsyncIterator):
99
+ async for segment in segment_text(text):
100
+ if self._should_stop:
101
+ break
102
+ await self._chunked_synthesis(segment, target_voice)
103
+ else:
104
+ await self._chunked_synthesis(text, target_voice)
105
+
106
+ except Exception as e:
107
+ self.emit("error", f"TTS synthesis failed: {str(e)}")
108
+
109
+ async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
110
+ """Non-streaming synthesis using the standard API"""
111
+ url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
112
+
113
+ params = {
114
+ "model_id": self.model,
115
+ "output_format": self.response_format,
116
+ }
117
+
118
+ headers = {
119
+ "xi-api-key": self.api_key,
120
+ "Content-Type": "application/json",
121
+ }
122
+
123
+ payload = {
124
+ "text": text,
125
+ "voice_settings": {
126
+ "stability": self.voice_settings.stability,
127
+ "similarity_boost": self.voice_settings.similarity_boost,
128
+ "style": self.voice_settings.style,
129
+ "use_speaker_boost": self.voice_settings.use_speaker_boost,
130
+ },
131
+ }
132
+
133
+ try:
134
+ async with self._session.stream(
135
+ "POST",
136
+ url,
137
+ headers=headers,
138
+ json=payload,
139
+ params=params
140
+ ) as response:
141
+ response.raise_for_status()
142
+
143
+ async for chunk in response.aiter_bytes():
144
+ if self._should_stop:
145
+ break
146
+ if chunk:
147
+ await self._stream_audio_chunks(chunk)
148
+
149
+ except httpx.HTTPStatusError as e:
150
+ self.emit(
151
+ "error", f"HTTP error {e.response.status_code}: {e.response.text}")
152
+ except Exception as e:
153
+ self.emit("error", f"Chunked synthesis failed: {str(e)}")
154
+
155
+ async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
156
+ """WebSocket-based streaming synthesis"""
157
+
158
+ ws_session = None
159
+ ws_connection = None
160
+
161
+ try:
162
+ ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
163
+ params = {
164
+ "model_id": self.model,
165
+ "output_format": self.response_format,
166
+ "inactivity_timeout": self.inactivity_timeout,
167
+ }
168
+ param_string = "&".join([f"{k}={v}" for k, v in params.items()])
169
+ full_ws_url = f"{ws_url}?{param_string}"
170
+
171
+ headers = {"xi-api-key": self.api_key}
172
+
173
+ ws_session = aiohttp.ClientSession()
174
+ ws_connection = await asyncio.wait_for(
175
+ ws_session.ws_connect(full_ws_url, headers=headers),
176
+ timeout=10.0
177
+ )
178
+
179
+ init_message = {
180
+ "text": " ",
181
+ "voice_settings": {
182
+ "stability": self.voice_settings.stability,
183
+ "similarity_boost": self.voice_settings.similarity_boost,
184
+ "style": self.voice_settings.style,
185
+ "use_speaker_boost": self.voice_settings.use_speaker_boost,
186
+ },
187
+ }
188
+ await ws_connection.send_str(json.dumps(init_message))
189
+
190
+ self._send_task = asyncio.create_task(
191
+ self._send_text_task(ws_connection, text))
192
+ self._recv_task = asyncio.create_task(
193
+ self._receive_audio_task(ws_connection))
194
+
195
+ await asyncio.gather(self._send_task, self._recv_task)
196
+
197
+ except Exception as e:
198
+ self.emit("error", f"Streaming synthesis failed: {str(e)}")
199
+
200
+ if isinstance(text, str):
201
+ await self._chunked_synthesis(text, voice_id)
202
+ else:
203
+ async for segment in segment_text(text):
204
+ if self._should_stop:
205
+ break
206
+ await self._chunked_synthesis(segment, voice_id)
207
+
208
+ finally:
209
+ for task in [self._send_task, self._recv_task]:
210
+ if task and not task.done():
211
+ task.cancel()
212
+
213
+ try:
214
+ await asyncio.wait_for(
215
+ asyncio.gather(
216
+ *(t for t in [self._send_task, self._recv_task] if t),
217
+ return_exceptions=True
218
+ ),
219
+ timeout=0.3
220
+ )
221
+ except asyncio.TimeoutError:
222
+ pass
223
+
224
+ self._send_task = None
225
+ self._recv_task = None
226
+
227
+ if ws_connection and not ws_connection.closed:
228
+ await ws_connection.close()
229
+ if ws_session and not ws_session.closed:
230
+ await ws_session.close()
231
+
232
+ async def _send_text_task(self, ws_connection: aiohttp.ClientWebSocketResponse, text: Union[AsyncIterator[str], str]) -> None:
233
+ """Task for sending text to WebSocket"""
234
+ try:
235
+ if isinstance(text, str):
236
+ if not self._should_stop:
237
+ text_message = {"text": f"{text} "}
238
+ await ws_connection.send_str(json.dumps(text_message))
239
+ else:
240
+ async for chunk in text:
241
+ if ws_connection.closed or self._should_stop:
242
+ break
243
+
244
+ chunk_message = {"text": f"{chunk} "}
245
+ await ws_connection.send_str(json.dumps(chunk_message))
246
+
247
+ if not ws_connection.closed and not self._should_stop:
248
+ eos_message = {"text": ""}
249
+ await ws_connection.send_str(json.dumps(eos_message))
250
+
251
+ except Exception as e:
252
+ if not self._should_stop:
253
+ self.emit("error", f"Send task error: {str(e)}")
254
+ raise
255
+
256
+ async def _receive_audio_task(self, ws_connection: aiohttp.ClientWebSocketResponse) -> None:
257
+ """Task for receiving audio from WebSocket"""
258
+ try:
259
+ while not ws_connection.closed and not self._should_stop:
260
+ try:
261
+ msg = await ws_connection.receive()
262
+
263
+ if msg.type == aiohttp.WSMsgType.TEXT:
264
+ data = json.loads(msg.data)
265
+
266
+ if data.get("audio"):
267
+ import base64
268
+ audio_chunk = base64.b64decode(data["audio"])
269
+ if not self._should_stop:
270
+ await self._stream_audio_chunks(audio_chunk)
271
+
272
+ elif data.get("isFinal"):
273
+ break
274
+
275
+ elif data.get("error"):
276
+ self.emit(
277
+ "error", f"ElevenLabs error: {data['error']}")
278
+ raise ValueError(
279
+ f"ElevenLabs error: {data['error']}")
280
+
281
+ elif msg.type == aiohttp.WSMsgType.ERROR:
282
+ raise ConnectionError(
283
+ f"WebSocket error: {ws_connection.exception()}")
284
+
285
+ elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
286
+ break
287
+
288
+ except asyncio.TimeoutError:
289
+ if not self._should_stop:
290
+ self.emit("error", "WebSocket receive timeout")
291
+ break
292
+
293
+ except Exception as e:
294
+ if not self._should_stop:
295
+ self.emit("error", f"Receive task error: {str(e)}")
296
+ raise
297
+
298
+ async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
299
+ if not audio_bytes or self._should_stop:
300
+ return
301
+
302
+ if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
303
+ self._first_chunk_sent = True
304
+ asyncio.create_task(self._first_audio_callback())
305
+
306
+ if self.audio_track and self.loop:
307
+ await self.audio_track.add_new_bytes(audio_bytes)
308
+
309
+ async def interrupt(self) -> None:
310
+ """Simple but effective interruption"""
311
+ self._should_stop = True
312
+
313
+ if self.audio_track:
314
+ self.audio_track.interrupt()
315
+
316
+ for task in [self._send_task, self._recv_task]:
317
+ if task and not task.done():
318
+ task.cancel()
319
+
320
+ if self._ws_connection and not self._ws_connection.closed:
321
+ await self._ws_connection.close()
322
+
323
+ async def aclose(self) -> None:
324
+ """Cleanup resources"""
325
+ self._should_stop = True
326
+
327
+ for task in [self._send_task, self._recv_task]:
328
+ if task and not task.done():
329
+ task.cancel()
330
+
331
+ for stream in list(self._streams):
332
+ try:
333
+ await stream.aclose()
334
+ except Exception:
335
+ pass
336
+
337
+ self._streams.clear()
338
+
339
+ if self._ws_connection and not self._ws_connection.closed:
340
+ await self._ws_connection.close()
341
+ if self._ws_session:
342
+ await self._ws_session.close()
343
+ if self._session:
344
+ await self._session.aclose()
345
+ await super().aclose()
@@ -0,0 +1 @@
1
+ __version__ = "0.0.28"
@@ -1,246 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from typing import Any, AsyncIterator, Literal, Optional, Union
4
- import os
5
- import httpx
6
- import asyncio
7
- import json
8
- import aiohttp
9
- from dataclasses import dataclass
10
-
11
- from videosdk.agents import TTS
12
-
13
- ELEVENLABS_SAMPLE_RATE = 24000
14
- ELEVENLABS_CHANNELS = 1
15
-
16
- DEFAULT_MODEL = "eleven_flash_v2_5"
17
- DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
18
- API_BASE_URL = "https://api.elevenlabs.io/v1"
19
-
20
-
21
- @dataclass
22
- class VoiceSettings:
23
- stability: float = 0.71
24
- similarity_boost: float = 0.5
25
- style: float = 0.0
26
- use_speaker_boost: bool = True
27
-
28
-
29
- class ElevenLabsTTS(TTS):
30
- def __init__(
31
- self,
32
- *,
33
- model: str = DEFAULT_MODEL,
34
- voice: str = DEFAULT_VOICE_ID,
35
- speed: float = 1.0,
36
- api_key: str | None = None,
37
- response_format: str = "pcm_24000",
38
- voice_settings: VoiceSettings | None = None,
39
- base_url: str = API_BASE_URL,
40
- enable_streaming: bool = True,
41
- ) -> None:
42
- super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)
43
-
44
- self.model = model
45
- self.voice = voice
46
- self.speed = speed
47
- self.audio_track = None
48
- self.loop = None
49
- self.response_format = response_format
50
- self.base_url = base_url
51
- self.enable_streaming = enable_streaming
52
- self.voice_settings = voice_settings or VoiceSettings()
53
- self._first_chunk_sent = False
54
- self._ws_session = None
55
- self._ws_connection = None
56
-
57
- self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
58
- if not self.api_key:
59
- raise ValueError("ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
60
-
61
- self._session = httpx.AsyncClient(
62
- timeout=httpx.Timeout(connect=15.0, read=30.0, write=5.0, pool=5.0),
63
- follow_redirects=True,
64
- )
65
-
66
- def reset_first_audio_tracking(self) -> None:
67
- """Reset the first audio tracking state for next TTS task"""
68
- self._first_chunk_sent = False
69
-
70
- async def _ensure_ws_connection(self, voice_id: str) -> aiohttp.ClientWebSocketResponse:
71
- """Ensure WebSocket connection is established and return it"""
72
- if self._ws_connection is None or self._ws_connection.closed:
73
-
74
- ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
75
-
76
- params = {
77
- "model_id": self.model,
78
- "output_format": self.response_format,
79
- }
80
-
81
- param_string = "&".join([f"{k}={v}" for k, v in params.items()])
82
- full_ws_url = f"{ws_url}?{param_string}"
83
-
84
- headers = {"xi-api-key": self.api_key}
85
-
86
- self._ws_session = aiohttp.ClientSession()
87
- self._ws_connection = await self._ws_session.ws_connect(full_ws_url, headers=headers)
88
-
89
- init_message = {
90
- "text": " ",
91
- "voice_settings": {
92
- "stability": self.voice_settings.stability,
93
- "similarity_boost": self.voice_settings.similarity_boost,
94
- "style": self.voice_settings.style,
95
- "use_speaker_boost": self.voice_settings.use_speaker_boost,
96
- },
97
- }
98
- await self._ws_connection.send_str(json.dumps(init_message))
99
-
100
- return self._ws_connection
101
-
102
- async def synthesize(
103
- self,
104
- text: AsyncIterator[str] | str,
105
- voice_id: Optional[str] = None,
106
- **kwargs: Any,
107
- ) -> None:
108
- try:
109
- if isinstance(text, AsyncIterator):
110
- full_text = ""
111
- async for chunk in text:
112
- full_text += chunk
113
- else:
114
- full_text = text
115
-
116
- if not self.audio_track or not self.loop:
117
- self.emit("error", "Audio track or event loop not set")
118
- return
119
-
120
- target_voice = voice_id or self.voice
121
-
122
- if self.enable_streaming:
123
- await self._stream_synthesis(full_text, target_voice)
124
- else:
125
- await self._chunked_synthesis(full_text, target_voice)
126
-
127
- except Exception as e:
128
- self.emit("error", f"TTS synthesis failed: {str(e)}")
129
-
130
- async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
131
- """Non-streaming synthesis using the standard API"""
132
- url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
133
-
134
- params = {
135
- "model_id": self.model,
136
- "output_format": self.response_format,
137
- }
138
-
139
- headers = {
140
- "xi-api-key": self.api_key,
141
- "Content-Type": "application/json",
142
- }
143
-
144
- payload = {
145
- "text": text,
146
- "voice_settings": {
147
- "stability": self.voice_settings.stability,
148
- "similarity_boost": self.voice_settings.similarity_boost,
149
- "style": self.voice_settings.style,
150
- "use_speaker_boost": self.voice_settings.use_speaker_boost,
151
- },
152
- }
153
-
154
- try:
155
- async with self._session.stream(
156
- "POST",
157
- url,
158
- headers=headers,
159
- json=payload,
160
- params=params
161
- ) as response:
162
- response.raise_for_status()
163
-
164
- audio_data = b""
165
- async for chunk in response.aiter_bytes():
166
- if chunk:
167
- audio_data += chunk
168
-
169
- if audio_data:
170
- await self._stream_audio_chunks(audio_data)
171
-
172
- except httpx.HTTPStatusError as e:
173
- self.emit("error", f"HTTP error {e.response.status_code}: {e.response.text}")
174
- except Exception as e:
175
- self.emit("error", f"Chunked synthesis failed: {str(e)}")
176
-
177
- async def _stream_synthesis(self, text: str, voice_id: str) -> None:
178
- """WebSocket-based streaming synthesis"""
179
-
180
- try:
181
- ws = await self._ensure_ws_connection(voice_id)
182
-
183
- # Send text message
184
- text_message = {"text": f"{text} "}
185
- await ws.send_str(json.dumps(text_message))
186
-
187
- # Send end-of-stream message
188
- eos_message = {"text": ""}
189
- await ws.send_str(json.dumps(eos_message))
190
-
191
- audio_data = b""
192
- async for msg in ws:
193
- if msg.type == aiohttp.WSMsgType.TEXT:
194
- data = json.loads(msg.data)
195
- if data.get("audio"):
196
- import base64
197
- audio_chunk = base64.b64decode(data["audio"])
198
- audio_data += audio_chunk
199
- elif data.get("isFinal"):
200
- break
201
- elif data.get("error"):
202
- self.emit("error", f"WebSocket error: {data['error']}")
203
- break
204
- elif msg.type == aiohttp.WSMsgType.ERROR:
205
- self.emit("error", f"WebSocket connection error: {ws.exception()}")
206
- break
207
-
208
- if audio_data:
209
- await self._stream_audio_chunks(audio_data)
210
-
211
- except Exception as e:
212
- self.emit("error", f"Streaming synthesis failed: {str(e)}")
213
-
214
- async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
215
- """Stream audio data in chunks for smooth playback"""
216
- chunk_size = int(ELEVENLABS_SAMPLE_RATE * ELEVENLABS_CHANNELS * 2 * 20 / 1000)
217
-
218
- for i in range(0, len(audio_bytes), chunk_size):
219
- chunk = audio_bytes[i:i + chunk_size]
220
-
221
- if len(chunk) < chunk_size and len(chunk) > 0:
222
- padding_needed = chunk_size - len(chunk)
223
- chunk += b'\x00' * padding_needed
224
-
225
- if len(chunk) == chunk_size:
226
- if not self._first_chunk_sent and self._first_audio_callback:
227
- self._first_chunk_sent = True
228
- await self._first_audio_callback()
229
-
230
- self.loop.create_task(self.audio_track.add_new_bytes(chunk))
231
- await asyncio.sleep(0.001)
232
-
233
- async def aclose(self) -> None:
234
- """Cleanup resources"""
235
- if self._ws_connection:
236
- await self._ws_connection.close()
237
- if self._ws_session:
238
- await self._ws_session.close()
239
- if self._session:
240
- await self._session.aclose()
241
- await super().aclose()
242
-
243
- async def interrupt(self) -> None:
244
- """Interrupt the TTS process"""
245
- if self.audio_track:
246
- self.audio_track.interrupt()
@@ -1 +0,0 @@
1
- __version__ = "0.0.26"