videosdk-plugins-elevenlabs 0.0.26__tar.gz → 0.0.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.
- {videosdk_plugins_elevenlabs-0.0.26 → videosdk_plugins_elevenlabs-0.0.28}/PKG-INFO +2 -2
- {videosdk_plugins_elevenlabs-0.0.26 → videosdk_plugins_elevenlabs-0.0.28}/pyproject.toml +1 -1
- videosdk_plugins_elevenlabs-0.0.28/videosdk/plugins/elevenlabs/tts.py +345 -0
- videosdk_plugins_elevenlabs-0.0.28/videosdk/plugins/elevenlabs/version.py +1 -0
- videosdk_plugins_elevenlabs-0.0.26/videosdk/plugins/elevenlabs/tts.py +0 -246
- videosdk_plugins_elevenlabs-0.0.26/videosdk/plugins/elevenlabs/version.py +0 -1
- {videosdk_plugins_elevenlabs-0.0.26 → videosdk_plugins_elevenlabs-0.0.28}/.gitignore +0 -0
- {videosdk_plugins_elevenlabs-0.0.26 → videosdk_plugins_elevenlabs-0.0.28}/README.md +0 -0
- {videosdk_plugins_elevenlabs-0.0.26 → videosdk_plugins_elevenlabs-0.0.28}/videosdk/plugins/elevenlabs/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videosdk-plugins-elevenlabs
|
|
3
|
-
Version: 0.0.26
|
|
3
|
+
Version: 0.0.28
|
|
4
4
|
Summary: VideoSDK Agent Framework plugin for ElevenLabs
|
|
5
5
|
Author: videosdk
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
12
12
|
Classifier: Topic :: Multimedia :: Video
|
|
13
13
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
14
|
Requires-Python: >=3.11
|
|
15
|
-
Requires-Dist: videosdk-agents>=0.0.26
|
|
15
|
+
Requires-Dist: videosdk-agents>=0.0.28
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
|
|
18
18
|
# VideoSDK ElevenLabs Plugin
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, AsyncIterator, Literal, Optional, Union
|
|
4
|
+
import os
|
|
5
|
+
import httpx
|
|
6
|
+
import asyncio
|
|
7
|
+
import json
|
|
8
|
+
import aiohttp
|
|
9
|
+
import weakref
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from videosdk.agents import TTS, segment_text
|
|
12
|
+
|
|
13
|
+
ELEVENLABS_SAMPLE_RATE = 24000
|
|
14
|
+
ELEVENLABS_CHANNELS = 1
|
|
15
|
+
|
|
16
|
+
DEFAULT_MODEL = "eleven_flash_v2_5"
|
|
17
|
+
DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
|
|
18
|
+
API_BASE_URL = "https://api.elevenlabs.io/v1"
|
|
19
|
+
WS_INACTIVITY_TIMEOUT = 300
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
|
|
23
|
+
class VoiceSettings:
|
|
24
|
+
stability: float = 0.71
|
|
25
|
+
similarity_boost: float = 0.5
|
|
26
|
+
style: float = 0.0
|
|
27
|
+
use_speaker_boost: bool = True
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class ElevenLabsTTS(TTS):
|
|
31
|
+
def __init__(
|
|
32
|
+
self,
|
|
33
|
+
*,
|
|
34
|
+
model: str = DEFAULT_MODEL,
|
|
35
|
+
voice: str = DEFAULT_VOICE_ID,
|
|
36
|
+
speed: float = 1.0,
|
|
37
|
+
api_key: str | None = None,
|
|
38
|
+
response_format: str = "pcm_24000",
|
|
39
|
+
voice_settings: VoiceSettings | None = None,
|
|
40
|
+
base_url: str = API_BASE_URL,
|
|
41
|
+
enable_streaming: bool = True,
|
|
42
|
+
inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
|
|
43
|
+
) -> None:
|
|
44
|
+
super().__init__(
|
|
45
|
+
sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
self.model = model
|
|
49
|
+
self.voice = voice
|
|
50
|
+
self.speed = speed
|
|
51
|
+
self.audio_track = None
|
|
52
|
+
self.loop = None
|
|
53
|
+
self.response_format = response_format
|
|
54
|
+
self.base_url = base_url
|
|
55
|
+
self.enable_streaming = enable_streaming
|
|
56
|
+
self.voice_settings = voice_settings or VoiceSettings()
|
|
57
|
+
self.inactivity_timeout = inactivity_timeout
|
|
58
|
+
self._first_chunk_sent = False
|
|
59
|
+
self._ws_session = None
|
|
60
|
+
self._ws_connection = None
|
|
61
|
+
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
|
62
|
+
if not self.api_key:
|
|
63
|
+
raise ValueError(
|
|
64
|
+
"ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
|
|
65
|
+
|
|
66
|
+
self._session = httpx.AsyncClient(
|
|
67
|
+
timeout=httpx.Timeout(connect=15.0, read=30.0,
|
|
68
|
+
write=5.0, pool=5.0),
|
|
69
|
+
follow_redirects=True,
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
self._streams = weakref.WeakSet()
|
|
73
|
+
self._send_task: asyncio.Task | None = None
|
|
74
|
+
self._recv_task: asyncio.Task | None = None
|
|
75
|
+
self._should_stop = False
|
|
76
|
+
|
|
77
|
+
def reset_first_audio_tracking(self) -> None:
|
|
78
|
+
"""Reset the first audio tracking state for next TTS task"""
|
|
79
|
+
self._first_chunk_sent = False
|
|
80
|
+
|
|
81
|
+
async def synthesize(
|
|
82
|
+
self,
|
|
83
|
+
text: AsyncIterator[str] | str,
|
|
84
|
+
voice_id: Optional[str] = None,
|
|
85
|
+
**kwargs: Any,
|
|
86
|
+
) -> None:
|
|
87
|
+
try:
|
|
88
|
+
if not self.audio_track or not self.loop:
|
|
89
|
+
self.emit("error", "Audio track or event loop not set")
|
|
90
|
+
return
|
|
91
|
+
|
|
92
|
+
target_voice = voice_id or self.voice
|
|
93
|
+
self._should_stop = False
|
|
94
|
+
|
|
95
|
+
if self.enable_streaming:
|
|
96
|
+
await self._stream_synthesis(text, target_voice)
|
|
97
|
+
else:
|
|
98
|
+
if isinstance(text, AsyncIterator):
|
|
99
|
+
async for segment in segment_text(text):
|
|
100
|
+
if self._should_stop:
|
|
101
|
+
break
|
|
102
|
+
await self._chunked_synthesis(segment, target_voice)
|
|
103
|
+
else:
|
|
104
|
+
await self._chunked_synthesis(text, target_voice)
|
|
105
|
+
|
|
106
|
+
except Exception as e:
|
|
107
|
+
self.emit("error", f"TTS synthesis failed: {str(e)}")
|
|
108
|
+
|
|
109
|
+
async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
|
|
110
|
+
"""Non-streaming synthesis using the standard API"""
|
|
111
|
+
url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
|
|
112
|
+
|
|
113
|
+
params = {
|
|
114
|
+
"model_id": self.model,
|
|
115
|
+
"output_format": self.response_format,
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
headers = {
|
|
119
|
+
"xi-api-key": self.api_key,
|
|
120
|
+
"Content-Type": "application/json",
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
payload = {
|
|
124
|
+
"text": text,
|
|
125
|
+
"voice_settings": {
|
|
126
|
+
"stability": self.voice_settings.stability,
|
|
127
|
+
"similarity_boost": self.voice_settings.similarity_boost,
|
|
128
|
+
"style": self.voice_settings.style,
|
|
129
|
+
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
130
|
+
},
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
try:
|
|
134
|
+
async with self._session.stream(
|
|
135
|
+
"POST",
|
|
136
|
+
url,
|
|
137
|
+
headers=headers,
|
|
138
|
+
json=payload,
|
|
139
|
+
params=params
|
|
140
|
+
) as response:
|
|
141
|
+
response.raise_for_status()
|
|
142
|
+
|
|
143
|
+
async for chunk in response.aiter_bytes():
|
|
144
|
+
if self._should_stop:
|
|
145
|
+
break
|
|
146
|
+
if chunk:
|
|
147
|
+
await self._stream_audio_chunks(chunk)
|
|
148
|
+
|
|
149
|
+
except httpx.HTTPStatusError as e:
|
|
150
|
+
self.emit(
|
|
151
|
+
"error", f"HTTP error {e.response.status_code}: {e.response.text}")
|
|
152
|
+
except Exception as e:
|
|
153
|
+
self.emit("error", f"Chunked synthesis failed: {str(e)}")
|
|
154
|
+
|
|
155
|
+
async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
|
|
156
|
+
"""WebSocket-based streaming synthesis"""
|
|
157
|
+
|
|
158
|
+
ws_session = None
|
|
159
|
+
ws_connection = None
|
|
160
|
+
|
|
161
|
+
try:
|
|
162
|
+
ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
|
|
163
|
+
params = {
|
|
164
|
+
"model_id": self.model,
|
|
165
|
+
"output_format": self.response_format,
|
|
166
|
+
"inactivity_timeout": self.inactivity_timeout,
|
|
167
|
+
}
|
|
168
|
+
param_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
169
|
+
full_ws_url = f"{ws_url}?{param_string}"
|
|
170
|
+
|
|
171
|
+
headers = {"xi-api-key": self.api_key}
|
|
172
|
+
|
|
173
|
+
ws_session = aiohttp.ClientSession()
|
|
174
|
+
ws_connection = await asyncio.wait_for(
|
|
175
|
+
ws_session.ws_connect(full_ws_url, headers=headers),
|
|
176
|
+
timeout=10.0
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
init_message = {
|
|
180
|
+
"text": " ",
|
|
181
|
+
"voice_settings": {
|
|
182
|
+
"stability": self.voice_settings.stability,
|
|
183
|
+
"similarity_boost": self.voice_settings.similarity_boost,
|
|
184
|
+
"style": self.voice_settings.style,
|
|
185
|
+
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
186
|
+
},
|
|
187
|
+
}
|
|
188
|
+
await ws_connection.send_str(json.dumps(init_message))
|
|
189
|
+
|
|
190
|
+
self._send_task = asyncio.create_task(
|
|
191
|
+
self._send_text_task(ws_connection, text))
|
|
192
|
+
self._recv_task = asyncio.create_task(
|
|
193
|
+
self._receive_audio_task(ws_connection))
|
|
194
|
+
|
|
195
|
+
await asyncio.gather(self._send_task, self._recv_task)
|
|
196
|
+
|
|
197
|
+
except Exception as e:
|
|
198
|
+
self.emit("error", f"Streaming synthesis failed: {str(e)}")
|
|
199
|
+
|
|
200
|
+
if isinstance(text, str):
|
|
201
|
+
await self._chunked_synthesis(text, voice_id)
|
|
202
|
+
else:
|
|
203
|
+
async for segment in segment_text(text):
|
|
204
|
+
if self._should_stop:
|
|
205
|
+
break
|
|
206
|
+
await self._chunked_synthesis(segment, voice_id)
|
|
207
|
+
|
|
208
|
+
finally:
|
|
209
|
+
for task in [self._send_task, self._recv_task]:
|
|
210
|
+
if task and not task.done():
|
|
211
|
+
task.cancel()
|
|
212
|
+
|
|
213
|
+
try:
|
|
214
|
+
await asyncio.wait_for(
|
|
215
|
+
asyncio.gather(
|
|
216
|
+
*(t for t in [self._send_task, self._recv_task] if t),
|
|
217
|
+
return_exceptions=True
|
|
218
|
+
),
|
|
219
|
+
timeout=0.3
|
|
220
|
+
)
|
|
221
|
+
except asyncio.TimeoutError:
|
|
222
|
+
pass
|
|
223
|
+
|
|
224
|
+
self._send_task = None
|
|
225
|
+
self._recv_task = None
|
|
226
|
+
|
|
227
|
+
if ws_connection and not ws_connection.closed:
|
|
228
|
+
await ws_connection.close()
|
|
229
|
+
if ws_session and not ws_session.closed:
|
|
230
|
+
await ws_session.close()
|
|
231
|
+
|
|
232
|
+
async def _send_text_task(self, ws_connection: aiohttp.ClientWebSocketResponse, text: Union[AsyncIterator[str], str]) -> None:
|
|
233
|
+
"""Task for sending text to WebSocket"""
|
|
234
|
+
try:
|
|
235
|
+
if isinstance(text, str):
|
|
236
|
+
if not self._should_stop:
|
|
237
|
+
text_message = {"text": f"{text} "}
|
|
238
|
+
await ws_connection.send_str(json.dumps(text_message))
|
|
239
|
+
else:
|
|
240
|
+
async for chunk in text:
|
|
241
|
+
if ws_connection.closed or self._should_stop:
|
|
242
|
+
break
|
|
243
|
+
|
|
244
|
+
chunk_message = {"text": f"{chunk} "}
|
|
245
|
+
await ws_connection.send_str(json.dumps(chunk_message))
|
|
246
|
+
|
|
247
|
+
if not ws_connection.closed and not self._should_stop:
|
|
248
|
+
eos_message = {"text": ""}
|
|
249
|
+
await ws_connection.send_str(json.dumps(eos_message))
|
|
250
|
+
|
|
251
|
+
except Exception as e:
|
|
252
|
+
if not self._should_stop:
|
|
253
|
+
self.emit("error", f"Send task error: {str(e)}")
|
|
254
|
+
raise
|
|
255
|
+
|
|
256
|
+
async def _receive_audio_task(self, ws_connection: aiohttp.ClientWebSocketResponse) -> None:
|
|
257
|
+
"""Task for receiving audio from WebSocket"""
|
|
258
|
+
try:
|
|
259
|
+
while not ws_connection.closed and not self._should_stop:
|
|
260
|
+
try:
|
|
261
|
+
msg = await ws_connection.receive()
|
|
262
|
+
|
|
263
|
+
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
264
|
+
data = json.loads(msg.data)
|
|
265
|
+
|
|
266
|
+
if data.get("audio"):
|
|
267
|
+
import base64
|
|
268
|
+
audio_chunk = base64.b64decode(data["audio"])
|
|
269
|
+
if not self._should_stop:
|
|
270
|
+
await self._stream_audio_chunks(audio_chunk)
|
|
271
|
+
|
|
272
|
+
elif data.get("isFinal"):
|
|
273
|
+
break
|
|
274
|
+
|
|
275
|
+
elif data.get("error"):
|
|
276
|
+
self.emit(
|
|
277
|
+
"error", f"ElevenLabs error: {data['error']}")
|
|
278
|
+
raise ValueError(
|
|
279
|
+
f"ElevenLabs error: {data['error']}")
|
|
280
|
+
|
|
281
|
+
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
282
|
+
raise ConnectionError(
|
|
283
|
+
f"WebSocket error: {ws_connection.exception()}")
|
|
284
|
+
|
|
285
|
+
elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
|
|
286
|
+
break
|
|
287
|
+
|
|
288
|
+
except asyncio.TimeoutError:
|
|
289
|
+
if not self._should_stop:
|
|
290
|
+
self.emit("error", "WebSocket receive timeout")
|
|
291
|
+
break
|
|
292
|
+
|
|
293
|
+
except Exception as e:
|
|
294
|
+
if not self._should_stop:
|
|
295
|
+
self.emit("error", f"Receive task error: {str(e)}")
|
|
296
|
+
raise
|
|
297
|
+
|
|
298
|
+
async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
|
|
299
|
+
if not audio_bytes or self._should_stop:
|
|
300
|
+
return
|
|
301
|
+
|
|
302
|
+
if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
|
|
303
|
+
self._first_chunk_sent = True
|
|
304
|
+
asyncio.create_task(self._first_audio_callback())
|
|
305
|
+
|
|
306
|
+
if self.audio_track and self.loop:
|
|
307
|
+
await self.audio_track.add_new_bytes(audio_bytes)
|
|
308
|
+
|
|
309
|
+
async def interrupt(self) -> None:
|
|
310
|
+
"""Simple but effective interruption"""
|
|
311
|
+
self._should_stop = True
|
|
312
|
+
|
|
313
|
+
if self.audio_track:
|
|
314
|
+
self.audio_track.interrupt()
|
|
315
|
+
|
|
316
|
+
for task in [self._send_task, self._recv_task]:
|
|
317
|
+
if task and not task.done():
|
|
318
|
+
task.cancel()
|
|
319
|
+
|
|
320
|
+
if self._ws_connection and not self._ws_connection.closed:
|
|
321
|
+
await self._ws_connection.close()
|
|
322
|
+
|
|
323
|
+
async def aclose(self) -> None:
|
|
324
|
+
"""Cleanup resources"""
|
|
325
|
+
self._should_stop = True
|
|
326
|
+
|
|
327
|
+
for task in [self._send_task, self._recv_task]:
|
|
328
|
+
if task and not task.done():
|
|
329
|
+
task.cancel()
|
|
330
|
+
|
|
331
|
+
for stream in list(self._streams):
|
|
332
|
+
try:
|
|
333
|
+
await stream.aclose()
|
|
334
|
+
except Exception:
|
|
335
|
+
pass
|
|
336
|
+
|
|
337
|
+
self._streams.clear()
|
|
338
|
+
|
|
339
|
+
if self._ws_connection and not self._ws_connection.closed:
|
|
340
|
+
await self._ws_connection.close()
|
|
341
|
+
if self._ws_session:
|
|
342
|
+
await self._ws_session.close()
|
|
343
|
+
if self._session:
|
|
344
|
+
await self._session.aclose()
|
|
345
|
+
await super().aclose()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.28"
|
|
@@ -1,246 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
from typing import Any, AsyncIterator, Literal, Optional, Union
|
|
4
|
-
import os
|
|
5
|
-
import httpx
|
|
6
|
-
import asyncio
|
|
7
|
-
import json
|
|
8
|
-
import aiohttp
|
|
9
|
-
from dataclasses import dataclass
|
|
10
|
-
|
|
11
|
-
from videosdk.agents import TTS
|
|
12
|
-
|
|
13
|
-
ELEVENLABS_SAMPLE_RATE = 24000
|
|
14
|
-
ELEVENLABS_CHANNELS = 1
|
|
15
|
-
|
|
16
|
-
DEFAULT_MODEL = "eleven_flash_v2_5"
|
|
17
|
-
DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
|
|
18
|
-
API_BASE_URL = "https://api.elevenlabs.io/v1"
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
@dataclass
|
|
22
|
-
class VoiceSettings:
|
|
23
|
-
stability: float = 0.71
|
|
24
|
-
similarity_boost: float = 0.5
|
|
25
|
-
style: float = 0.0
|
|
26
|
-
use_speaker_boost: bool = True
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class ElevenLabsTTS(TTS):
|
|
30
|
-
def __init__(
|
|
31
|
-
self,
|
|
32
|
-
*,
|
|
33
|
-
model: str = DEFAULT_MODEL,
|
|
34
|
-
voice: str = DEFAULT_VOICE_ID,
|
|
35
|
-
speed: float = 1.0,
|
|
36
|
-
api_key: str | None = None,
|
|
37
|
-
response_format: str = "pcm_24000",
|
|
38
|
-
voice_settings: VoiceSettings | None = None,
|
|
39
|
-
base_url: str = API_BASE_URL,
|
|
40
|
-
enable_streaming: bool = True,
|
|
41
|
-
) -> None:
|
|
42
|
-
super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)
|
|
43
|
-
|
|
44
|
-
self.model = model
|
|
45
|
-
self.voice = voice
|
|
46
|
-
self.speed = speed
|
|
47
|
-
self.audio_track = None
|
|
48
|
-
self.loop = None
|
|
49
|
-
self.response_format = response_format
|
|
50
|
-
self.base_url = base_url
|
|
51
|
-
self.enable_streaming = enable_streaming
|
|
52
|
-
self.voice_settings = voice_settings or VoiceSettings()
|
|
53
|
-
self._first_chunk_sent = False
|
|
54
|
-
self._ws_session = None
|
|
55
|
-
self._ws_connection = None
|
|
56
|
-
|
|
57
|
-
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
|
58
|
-
if not self.api_key:
|
|
59
|
-
raise ValueError("ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
|
|
60
|
-
|
|
61
|
-
self._session = httpx.AsyncClient(
|
|
62
|
-
timeout=httpx.Timeout(connect=15.0, read=30.0, write=5.0, pool=5.0),
|
|
63
|
-
follow_redirects=True,
|
|
64
|
-
)
|
|
65
|
-
|
|
66
|
-
def reset_first_audio_tracking(self) -> None:
|
|
67
|
-
"""Reset the first audio tracking state for next TTS task"""
|
|
68
|
-
self._first_chunk_sent = False
|
|
69
|
-
|
|
70
|
-
async def _ensure_ws_connection(self, voice_id: str) -> aiohttp.ClientWebSocketResponse:
|
|
71
|
-
"""Ensure WebSocket connection is established and return it"""
|
|
72
|
-
if self._ws_connection is None or self._ws_connection.closed:
|
|
73
|
-
|
|
74
|
-
ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
|
|
75
|
-
|
|
76
|
-
params = {
|
|
77
|
-
"model_id": self.model,
|
|
78
|
-
"output_format": self.response_format,
|
|
79
|
-
}
|
|
80
|
-
|
|
81
|
-
param_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
82
|
-
full_ws_url = f"{ws_url}?{param_string}"
|
|
83
|
-
|
|
84
|
-
headers = {"xi-api-key": self.api_key}
|
|
85
|
-
|
|
86
|
-
self._ws_session = aiohttp.ClientSession()
|
|
87
|
-
self._ws_connection = await self._ws_session.ws_connect(full_ws_url, headers=headers)
|
|
88
|
-
|
|
89
|
-
init_message = {
|
|
90
|
-
"text": " ",
|
|
91
|
-
"voice_settings": {
|
|
92
|
-
"stability": self.voice_settings.stability,
|
|
93
|
-
"similarity_boost": self.voice_settings.similarity_boost,
|
|
94
|
-
"style": self.voice_settings.style,
|
|
95
|
-
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
96
|
-
},
|
|
97
|
-
}
|
|
98
|
-
await self._ws_connection.send_str(json.dumps(init_message))
|
|
99
|
-
|
|
100
|
-
return self._ws_connection
|
|
101
|
-
|
|
102
|
-
async def synthesize(
|
|
103
|
-
self,
|
|
104
|
-
text: AsyncIterator[str] | str,
|
|
105
|
-
voice_id: Optional[str] = None,
|
|
106
|
-
**kwargs: Any,
|
|
107
|
-
) -> None:
|
|
108
|
-
try:
|
|
109
|
-
if isinstance(text, AsyncIterator):
|
|
110
|
-
full_text = ""
|
|
111
|
-
async for chunk in text:
|
|
112
|
-
full_text += chunk
|
|
113
|
-
else:
|
|
114
|
-
full_text = text
|
|
115
|
-
|
|
116
|
-
if not self.audio_track or not self.loop:
|
|
117
|
-
self.emit("error", "Audio track or event loop not set")
|
|
118
|
-
return
|
|
119
|
-
|
|
120
|
-
target_voice = voice_id or self.voice
|
|
121
|
-
|
|
122
|
-
if self.enable_streaming:
|
|
123
|
-
await self._stream_synthesis(full_text, target_voice)
|
|
124
|
-
else:
|
|
125
|
-
await self._chunked_synthesis(full_text, target_voice)
|
|
126
|
-
|
|
127
|
-
except Exception as e:
|
|
128
|
-
self.emit("error", f"TTS synthesis failed: {str(e)}")
|
|
129
|
-
|
|
130
|
-
async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
|
|
131
|
-
"""Non-streaming synthesis using the standard API"""
|
|
132
|
-
url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
|
|
133
|
-
|
|
134
|
-
params = {
|
|
135
|
-
"model_id": self.model,
|
|
136
|
-
"output_format": self.response_format,
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
headers = {
|
|
140
|
-
"xi-api-key": self.api_key,
|
|
141
|
-
"Content-Type": "application/json",
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
payload = {
|
|
145
|
-
"text": text,
|
|
146
|
-
"voice_settings": {
|
|
147
|
-
"stability": self.voice_settings.stability,
|
|
148
|
-
"similarity_boost": self.voice_settings.similarity_boost,
|
|
149
|
-
"style": self.voice_settings.style,
|
|
150
|
-
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
151
|
-
},
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
try:
|
|
155
|
-
async with self._session.stream(
|
|
156
|
-
"POST",
|
|
157
|
-
url,
|
|
158
|
-
headers=headers,
|
|
159
|
-
json=payload,
|
|
160
|
-
params=params
|
|
161
|
-
) as response:
|
|
162
|
-
response.raise_for_status()
|
|
163
|
-
|
|
164
|
-
audio_data = b""
|
|
165
|
-
async for chunk in response.aiter_bytes():
|
|
166
|
-
if chunk:
|
|
167
|
-
audio_data += chunk
|
|
168
|
-
|
|
169
|
-
if audio_data:
|
|
170
|
-
await self._stream_audio_chunks(audio_data)
|
|
171
|
-
|
|
172
|
-
except httpx.HTTPStatusError as e:
|
|
173
|
-
self.emit("error", f"HTTP error {e.response.status_code}: {e.response.text}")
|
|
174
|
-
except Exception as e:
|
|
175
|
-
self.emit("error", f"Chunked synthesis failed: {str(e)}")
|
|
176
|
-
|
|
177
|
-
async def _stream_synthesis(self, text: str, voice_id: str) -> None:
|
|
178
|
-
"""WebSocket-based streaming synthesis"""
|
|
179
|
-
|
|
180
|
-
try:
|
|
181
|
-
ws = await self._ensure_ws_connection(voice_id)
|
|
182
|
-
|
|
183
|
-
# Send text message
|
|
184
|
-
text_message = {"text": f"{text} "}
|
|
185
|
-
await ws.send_str(json.dumps(text_message))
|
|
186
|
-
|
|
187
|
-
# Send end-of-stream message
|
|
188
|
-
eos_message = {"text": ""}
|
|
189
|
-
await ws.send_str(json.dumps(eos_message))
|
|
190
|
-
|
|
191
|
-
audio_data = b""
|
|
192
|
-
async for msg in ws:
|
|
193
|
-
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
194
|
-
data = json.loads(msg.data)
|
|
195
|
-
if data.get("audio"):
|
|
196
|
-
import base64
|
|
197
|
-
audio_chunk = base64.b64decode(data["audio"])
|
|
198
|
-
audio_data += audio_chunk
|
|
199
|
-
elif data.get("isFinal"):
|
|
200
|
-
break
|
|
201
|
-
elif data.get("error"):
|
|
202
|
-
self.emit("error", f"WebSocket error: {data['error']}")
|
|
203
|
-
break
|
|
204
|
-
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
205
|
-
self.emit("error", f"WebSocket connection error: {ws.exception()}")
|
|
206
|
-
break
|
|
207
|
-
|
|
208
|
-
if audio_data:
|
|
209
|
-
await self._stream_audio_chunks(audio_data)
|
|
210
|
-
|
|
211
|
-
except Exception as e:
|
|
212
|
-
self.emit("error", f"Streaming synthesis failed: {str(e)}")
|
|
213
|
-
|
|
214
|
-
async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
|
|
215
|
-
"""Stream audio data in chunks for smooth playback"""
|
|
216
|
-
chunk_size = int(ELEVENLABS_SAMPLE_RATE * ELEVENLABS_CHANNELS * 2 * 20 / 1000)
|
|
217
|
-
|
|
218
|
-
for i in range(0, len(audio_bytes), chunk_size):
|
|
219
|
-
chunk = audio_bytes[i:i + chunk_size]
|
|
220
|
-
|
|
221
|
-
if len(chunk) < chunk_size and len(chunk) > 0:
|
|
222
|
-
padding_needed = chunk_size - len(chunk)
|
|
223
|
-
chunk += b'\x00' * padding_needed
|
|
224
|
-
|
|
225
|
-
if len(chunk) == chunk_size:
|
|
226
|
-
if not self._first_chunk_sent and self._first_audio_callback:
|
|
227
|
-
self._first_chunk_sent = True
|
|
228
|
-
await self._first_audio_callback()
|
|
229
|
-
|
|
230
|
-
self.loop.create_task(self.audio_track.add_new_bytes(chunk))
|
|
231
|
-
await asyncio.sleep(0.001)
|
|
232
|
-
|
|
233
|
-
async def aclose(self) -> None:
|
|
234
|
-
"""Cleanup resources"""
|
|
235
|
-
if self._ws_connection:
|
|
236
|
-
await self._ws_connection.close()
|
|
237
|
-
if self._ws_session:
|
|
238
|
-
await self._ws_session.close()
|
|
239
|
-
if self._session:
|
|
240
|
-
await self._session.aclose()
|
|
241
|
-
await super().aclose()
|
|
242
|
-
|
|
243
|
-
async def interrupt(self) -> None:
|
|
244
|
-
"""Interrupt the TTS process"""
|
|
245
|
-
if self.audio_track:
|
|
246
|
-
self.audio_track.interrupt()
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.0.26"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|