videosdk-plugins-elevenlabs 0.0.21__tar.gz → 0.0.23__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-elevenlabs might be problematic. See the package's registry page for more details.
- {videosdk_plugins_elevenlabs-0.0.21 → videosdk_plugins_elevenlabs-0.0.23}/PKG-INFO +2 -2
- {videosdk_plugins_elevenlabs-0.0.21 → videosdk_plugins_elevenlabs-0.0.23}/pyproject.toml +1 -1
- {videosdk_plugins_elevenlabs-0.0.21 → videosdk_plugins_elevenlabs-0.0.23}/videosdk/plugins/elevenlabs/tts.py +76 -49
- videosdk_plugins_elevenlabs-0.0.23/videosdk/plugins/elevenlabs/version.py +1 -0
- videosdk_plugins_elevenlabs-0.0.21/videosdk/plugins/elevenlabs/version.py +0 -1
- {videosdk_plugins_elevenlabs-0.0.21 → videosdk_plugins_elevenlabs-0.0.23}/.gitignore +0 -0
- {videosdk_plugins_elevenlabs-0.0.21 → videosdk_plugins_elevenlabs-0.0.23}/README.md +0 -0
- {videosdk_plugins_elevenlabs-0.0.21 → videosdk_plugins_elevenlabs-0.0.23}/videosdk/plugins/elevenlabs/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videosdk-plugins-elevenlabs
|
|
3
|
-
Version: 0.0.21
|
|
3
|
+
Version: 0.0.23
|
|
4
4
|
Summary: VideoSDK Agent Framework plugin for ElevenLabs
|
|
5
5
|
Author: videosdk
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
12
12
|
Classifier: Topic :: Multimedia :: Video
|
|
13
13
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
14
14
|
Requires-Python: >=3.11
|
|
15
|
-
Requires-Dist: videosdk-agents>=0.0.
|
|
15
|
+
Requires-Dist: videosdk-agents>=0.0.23
|
|
16
16
|
Description-Content-Type: text/markdown
|
|
17
17
|
|
|
18
18
|
# VideoSDK ElevenLabs Plugin
|
|
@@ -37,7 +37,7 @@ class ElevenLabsTTS(TTS):
|
|
|
37
37
|
response_format: str = "pcm_24000",
|
|
38
38
|
voice_settings: VoiceSettings | None = None,
|
|
39
39
|
base_url: str = API_BASE_URL,
|
|
40
|
-
enable_streaming: bool =
|
|
40
|
+
enable_streaming: bool = True,
|
|
41
41
|
) -> None:
|
|
42
42
|
super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)
|
|
43
43
|
|
|
@@ -50,6 +50,9 @@ class ElevenLabsTTS(TTS):
|
|
|
50
50
|
self.base_url = base_url
|
|
51
51
|
self.enable_streaming = enable_streaming
|
|
52
52
|
self.voice_settings = voice_settings or VoiceSettings()
|
|
53
|
+
self._first_chunk_sent = False
|
|
54
|
+
self._ws_session = None
|
|
55
|
+
self._ws_connection = None
|
|
53
56
|
|
|
54
57
|
self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
|
|
55
58
|
if not self.api_key:
|
|
@@ -60,6 +63,42 @@ class ElevenLabsTTS(TTS):
|
|
|
60
63
|
follow_redirects=True,
|
|
61
64
|
)
|
|
62
65
|
|
|
66
|
+
def reset_first_audio_tracking(self) -> None:
|
|
67
|
+
"""Reset the first audio tracking state for next TTS task"""
|
|
68
|
+
self._first_chunk_sent = False
|
|
69
|
+
|
|
70
|
+
async def _ensure_ws_connection(self, voice_id: str) -> aiohttp.ClientWebSocketResponse:
|
|
71
|
+
"""Ensure WebSocket connection is established and return it"""
|
|
72
|
+
if self._ws_connection is None or self._ws_connection.closed:
|
|
73
|
+
|
|
74
|
+
ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
|
|
75
|
+
|
|
76
|
+
params = {
|
|
77
|
+
"model_id": self.model,
|
|
78
|
+
"output_format": self.response_format,
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
param_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
82
|
+
full_ws_url = f"{ws_url}?{param_string}"
|
|
83
|
+
|
|
84
|
+
headers = {"xi-api-key": self.api_key}
|
|
85
|
+
|
|
86
|
+
self._ws_session = aiohttp.ClientSession()
|
|
87
|
+
self._ws_connection = await self._ws_session.ws_connect(full_ws_url, headers=headers)
|
|
88
|
+
|
|
89
|
+
init_message = {
|
|
90
|
+
"text": " ",
|
|
91
|
+
"voice_settings": {
|
|
92
|
+
"stability": self.voice_settings.stability,
|
|
93
|
+
"similarity_boost": self.voice_settings.similarity_boost,
|
|
94
|
+
"style": self.voice_settings.style,
|
|
95
|
+
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
96
|
+
},
|
|
97
|
+
}
|
|
98
|
+
await self._ws_connection.send_str(json.dumps(init_message))
|
|
99
|
+
|
|
100
|
+
return self._ws_connection
|
|
101
|
+
|
|
63
102
|
async def synthesize(
|
|
64
103
|
self,
|
|
65
104
|
text: AsyncIterator[str] | str,
|
|
@@ -137,58 +176,38 @@ class ElevenLabsTTS(TTS):
|
|
|
137
176
|
|
|
138
177
|
async def _stream_synthesis(self, text: str, voice_id: str) -> None:
|
|
139
178
|
"""WebSocket-based streaming synthesis"""
|
|
140
|
-
ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
|
|
141
|
-
|
|
142
|
-
params = {
|
|
143
|
-
"model_id": self.model,
|
|
144
|
-
"output_format": self.response_format,
|
|
145
|
-
}
|
|
146
|
-
|
|
147
|
-
param_string = "&".join([f"{k}={v}" for k, v in params.items()])
|
|
148
|
-
full_ws_url = f"{ws_url}?{param_string}"
|
|
149
|
-
|
|
150
|
-
headers = {"xi-api-key": self.api_key}
|
|
151
179
|
|
|
152
180
|
try:
|
|
153
|
-
|
|
154
|
-
async with session.ws_connect(full_ws_url, headers=headers) as ws:
|
|
155
|
-
init_message = {
|
|
156
|
-
"text": " ",
|
|
157
|
-
"voice_settings": {
|
|
158
|
-
"stability": self.voice_settings.stability,
|
|
159
|
-
"similarity_boost": self.voice_settings.similarity_boost,
|
|
160
|
-
"style": self.voice_settings.style,
|
|
161
|
-
"use_speaker_boost": self.voice_settings.use_speaker_boost,
|
|
162
|
-
},
|
|
163
|
-
}
|
|
164
|
-
await ws.send_str(json.dumps(init_message))
|
|
165
|
-
|
|
166
|
-
text_message = {"text": f"{text} "}
|
|
167
|
-
await ws.send_str(json.dumps(text_message))
|
|
181
|
+
ws = await self._ensure_ws_connection(voice_id)
|
|
168
182
|
|
|
169
|
-
|
|
170
|
-
|
|
183
|
+
# Send text message
|
|
184
|
+
text_message = {"text": f"{text} "}
|
|
185
|
+
await ws.send_str(json.dumps(text_message))
|
|
186
|
+
|
|
187
|
+
# Send end-of-stream message
|
|
188
|
+
eos_message = {"text": ""}
|
|
189
|
+
await ws.send_str(json.dumps(eos_message))
|
|
190
|
+
|
|
191
|
+
audio_data = b""
|
|
192
|
+
async for msg in ws:
|
|
193
|
+
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
194
|
+
data = json.loads(msg.data)
|
|
195
|
+
if data.get("audio"):
|
|
196
|
+
import base64
|
|
197
|
+
audio_chunk = base64.b64decode(data["audio"])
|
|
198
|
+
audio_data += audio_chunk
|
|
199
|
+
elif data.get("isFinal"):
|
|
200
|
+
break
|
|
201
|
+
elif data.get("error"):
|
|
202
|
+
self.emit("error", f"WebSocket error: {data['error']}")
|
|
203
|
+
break
|
|
204
|
+
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
205
|
+
self.emit("error", f"WebSocket connection error: {ws.exception()}")
|
|
206
|
+
break
|
|
207
|
+
|
|
208
|
+
if audio_data:
|
|
209
|
+
await self._stream_audio_chunks(audio_data)
|
|
171
210
|
|
|
172
|
-
audio_data = b""
|
|
173
|
-
async for msg in ws:
|
|
174
|
-
if msg.type == aiohttp.WSMsgType.TEXT:
|
|
175
|
-
data = json.loads(msg.data)
|
|
176
|
-
if data.get("audio"):
|
|
177
|
-
import base64
|
|
178
|
-
audio_chunk = base64.b64decode(data["audio"])
|
|
179
|
-
audio_data += audio_chunk
|
|
180
|
-
elif data.get("isFinal"):
|
|
181
|
-
break
|
|
182
|
-
elif data.get("error"):
|
|
183
|
-
self.emit("error", f"WebSocket error: {data['error']}")
|
|
184
|
-
break
|
|
185
|
-
elif msg.type == aiohttp.WSMsgType.ERROR:
|
|
186
|
-
self.emit("error", f"WebSocket connection error: {ws.exception()}")
|
|
187
|
-
break
|
|
188
|
-
|
|
189
|
-
if audio_data:
|
|
190
|
-
await self._stream_audio_chunks(audio_data)
|
|
191
|
-
|
|
192
211
|
except Exception as e:
|
|
193
212
|
self.emit("error", f"Streaming synthesis failed: {str(e)}")
|
|
194
213
|
|
|
@@ -204,11 +223,19 @@ class ElevenLabsTTS(TTS):
|
|
|
204
223
|
chunk += b'\x00' * padding_needed
|
|
205
224
|
|
|
206
225
|
if len(chunk) == chunk_size:
|
|
226
|
+
if not self._first_chunk_sent and self._first_audio_callback:
|
|
227
|
+
self._first_chunk_sent = True
|
|
228
|
+
await self._first_audio_callback()
|
|
229
|
+
|
|
207
230
|
self.loop.create_task(self.audio_track.add_new_bytes(chunk))
|
|
208
231
|
await asyncio.sleep(0.001)
|
|
209
232
|
|
|
210
233
|
async def aclose(self) -> None:
|
|
211
234
|
"""Cleanup resources"""
|
|
235
|
+
if self._ws_connection:
|
|
236
|
+
await self._ws_connection.close()
|
|
237
|
+
if self._ws_session:
|
|
238
|
+
await self._ws_session.close()
|
|
212
239
|
if self._session:
|
|
213
240
|
await self._session.aclose()
|
|
214
241
|
await super().aclose()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.0.23"
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.0.21"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|