videosdk-plugins-elevenlabs 0.0.22__tar.gz → 0.0.24__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of videosdk-plugins-elevenlabs has been flagged as potentially problematic.

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-elevenlabs
-Version: 0.0.22
+Version: 0.0.24
 Summary: VideoSDK Agent Framework plugin for ElevenLabs
 Author: videosdk
 License-Expression: Apache-2.0
@@ -12,7 +12,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
-Requires-Dist: videosdk-agents>=0.0.22
+Requires-Dist: videosdk-agents>=0.0.24
 Description-Content-Type: text/markdown
 
 # VideoSDK ElevenLabs Plugin
@@ -21,7 +21,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 ]
 dependencies = [
-    "videosdk-agents>=0.0.22"
+    "videosdk-agents>=0.0.24"
 ]
 
 [tool.hatch.version]
@@ -37,7 +37,7 @@ class ElevenLabsTTS(TTS):
         response_format: str = "pcm_24000",
         voice_settings: VoiceSettings | None = None,
         base_url: str = API_BASE_URL,
-        enable_streaming: bool = False,
+        enable_streaming: bool = True,
     ) -> None:
         super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)
 
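Note on this hunk: 0.0.24 flips `enable_streaming` on by default, so the WebSocket path is used unless callers opt out. A minimal sketch of keeping the previous HTTP-only behaviour; the import path is assumed from the package name, and `ELEVENLABS_API_KEY` is expected in the environment:

```python
# Assumed import path for the videosdk-plugins-elevenlabs package; adjust if it differs.
from videosdk.plugins.elevenlabs import ElevenLabsTTS

# 0.0.24 defaults enable_streaming to True; pass False to keep the pre-0.0.24
# HTTP synthesis path instead of the persistent WebSocket connection.
tts = ElevenLabsTTS(enable_streaming=False)
```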
@@ -50,6 +50,9 @@ class ElevenLabsTTS(TTS):
         self.base_url = base_url
         self.enable_streaming = enable_streaming
         self.voice_settings = voice_settings or VoiceSettings()
+        self._first_chunk_sent = False
+        self._ws_session = None
+        self._ws_connection = None
 
         self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
         if not self.api_key:
@@ -60,6 +63,42 @@ class ElevenLabsTTS(TTS):
             follow_redirects=True,
         )
 
+    def reset_first_audio_tracking(self) -> None:
+        """Reset the first audio tracking state for next TTS task"""
+        self._first_chunk_sent = False
+
+    async def _ensure_ws_connection(self, voice_id: str) -> aiohttp.ClientWebSocketResponse:
+        """Ensure WebSocket connection is established and return it"""
+        if self._ws_connection is None or self._ws_connection.closed:
+
+            ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
+
+            params = {
+                "model_id": self.model,
+                "output_format": self.response_format,
+            }
+
+            param_string = "&".join([f"{k}={v}" for k, v in params.items()])
+            full_ws_url = f"{ws_url}?{param_string}"
+
+            headers = {"xi-api-key": self.api_key}
+
+            self._ws_session = aiohttp.ClientSession()
+            self._ws_connection = await self._ws_session.ws_connect(full_ws_url, headers=headers)
+
+            init_message = {
+                "text": " ",
+                "voice_settings": {
+                    "stability": self.voice_settings.stability,
+                    "similarity_boost": self.voice_settings.similarity_boost,
+                    "style": self.voice_settings.style,
+                    "use_speaker_boost": self.voice_settings.use_speaker_boost,
+                },
+            }
+            await self._ws_connection.send_str(json.dumps(init_message))
+
+        return self._ws_connection
+
     async def synthesize(
         self,
         text: AsyncIterator[str] | str,
@@ -137,58 +176,38 @@ class ElevenLabsTTS(TTS):
 
     async def _stream_synthesis(self, text: str, voice_id: str) -> None:
         """WebSocket-based streaming synthesis"""
-        ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
-
-        params = {
-            "model_id": self.model,
-            "output_format": self.response_format,
-        }
-
-        param_string = "&".join([f"{k}={v}" for k, v in params.items()])
-        full_ws_url = f"{ws_url}?{param_string}"
-
-        headers = {"xi-api-key": self.api_key}
 
         try:
-            async with aiohttp.ClientSession() as session:
-                async with session.ws_connect(full_ws_url, headers=headers) as ws:
-                    init_message = {
-                        "text": " ",
-                        "voice_settings": {
-                            "stability": self.voice_settings.stability,
-                            "similarity_boost": self.voice_settings.similarity_boost,
-                            "style": self.voice_settings.style,
-                            "use_speaker_boost": self.voice_settings.use_speaker_boost,
-                        },
-                    }
-                    await ws.send_str(json.dumps(init_message))
-
-                    text_message = {"text": f"{text} "}
-                    await ws.send_str(json.dumps(text_message))
+            ws = await self._ensure_ws_connection(voice_id)
 
-                    eos_message = {"text": ""}
-                    await ws.send_str(json.dumps(eos_message))
+            # Send text message
+            text_message = {"text": f"{text} "}
+            await ws.send_str(json.dumps(text_message))
+
+            # Send end-of-stream message
+            eos_message = {"text": ""}
+            await ws.send_str(json.dumps(eos_message))
+
+            audio_data = b""
+            async for msg in ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = json.loads(msg.data)
+                    if data.get("audio"):
+                        import base64
+                        audio_chunk = base64.b64decode(data["audio"])
+                        audio_data += audio_chunk
+                    elif data.get("isFinal"):
+                        break
+                    elif data.get("error"):
+                        self.emit("error", f"WebSocket error: {data['error']}")
+                        break
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    self.emit("error", f"WebSocket connection error: {ws.exception()}")
+                    break
+
+            if audio_data:
+                await self._stream_audio_chunks(audio_data)
 
-                    audio_data = b""
-                    async for msg in ws:
-                        if msg.type == aiohttp.WSMsgType.TEXT:
-                            data = json.loads(msg.data)
-                            if data.get("audio"):
-                                import base64
-                                audio_chunk = base64.b64decode(data["audio"])
-                                audio_data += audio_chunk
-                            elif data.get("isFinal"):
-                                break
-                            elif data.get("error"):
-                                self.emit("error", f"WebSocket error: {data['error']}")
-                                break
-                        elif msg.type == aiohttp.WSMsgType.ERROR:
-                            self.emit("error", f"WebSocket connection error: {ws.exception()}")
-                            break
-
-                    if audio_data:
-                        await self._stream_audio_chunks(audio_data)
-
         except Exception as e:
             self.emit("error", f"Streaming synthesis failed: {str(e)}")
 
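For context on the buffered path above: each `audio` field in the WebSocket messages is base64-encoded raw PCM in the requested `output_format` (`pcm_24000`, which ElevenLabs serves as 16-bit little-endian mono at 24 kHz). A minimal sketch of decoding one such message and estimating its duration; the payload below is fabricated for illustration, not a real API response:

```python
import base64
import json

# Fabricated message shaped like the frames handled in _stream_synthesis.
message = json.dumps({"audio": base64.b64encode(b"\x00\x00" * 2400).decode(), "isFinal": False})

data = json.loads(message)
pcm = base64.b64decode(data["audio"])   # raw PCM bytes (assumed 16-bit mono @ 24 kHz for pcm_24000)
samples = len(pcm) // 2                 # 2 bytes per 16-bit sample
print(f"{samples} samples ≈ {samples / 24000:.3f} s of audio")  # 2400 samples ≈ 0.100 s
```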
@@ -204,11 +223,19 @@ class ElevenLabsTTS(TTS):
                 chunk += b'\x00' * padding_needed
 
             if len(chunk) == chunk_size:
+                if not self._first_chunk_sent and self._first_audio_callback:
+                    self._first_chunk_sent = True
+                    await self._first_audio_callback()
+
                 self.loop.create_task(self.audio_track.add_new_bytes(chunk))
                 await asyncio.sleep(0.001)
 
     async def aclose(self) -> None:
         """Cleanup resources"""
+        if self._ws_connection:
+            await self._ws_connection.close()
+        if self._ws_session:
+            await self._ws_session.close()
         if self._session:
             await self._session.aclose()
         await super().aclose()
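The updated `aclose` tears resources down in order: the WebSocket connection is closed before the `aiohttp.ClientSession` that owns it, and only then the existing HTTP client and the base class. A minimal usage sketch, assuming the same import path as above and an `ELEVENLABS_API_KEY` in the environment:

```python
import asyncio

from videosdk.plugins.elevenlabs import ElevenLabsTTS  # assumed import path

async def main() -> None:
    tts = ElevenLabsTTS()  # streaming on by default as of 0.0.24
    try:
        ...  # hand the instance to the agent pipeline here
    finally:
        await tts.aclose()  # closes the WS connection, its session, then the HTTP client

asyncio.run(main())
```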
@@ -0,0 +1 @@
+__version__ = "0.0.24"
@@ -1 +0,0 @@
-__version__ = "0.0.22"