videosdk-plugins-elevenlabs 0.0.3__py3-none-any.whl → 0.0.49__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
1
  from .tts import ElevenLabsTTS, VoiceSettings
2
+ from .stt import ElevenLabsSTT
2
3
 
3
- __all__ = ["ElevenLabsTTS", "VoiceSettings"]
4
+ __all__ = ["ElevenLabsTTS", "VoiceSettings", "ElevenLabsSTT"]
@@ -0,0 +1,356 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import base64
5
+ import json
6
+ import os
7
+ import logging
8
+ import time
9
+ from typing import Any, Optional, List
10
+ from urllib.parse import urlencode
11
+ import aiohttp
12
+ import numpy as np
13
+ from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ STT_ERROR_MSGS = {"input_error", "auth_error", "quota_exceeded", "transcriber_error", "error"}
18
+ SUPPORTED_SAMPLE_RATES = {8000, 16000, 22050, 24000, 44100, 48000}
19
+
20
+ class ElevenLabsSTT(BaseSTT):
21
+ """
22
+ ElevenLabs Realtime Speech-to-Text (STT) client.
23
+ """
24
+
25
def __init__(
    self,
    *,
    api_key: str | None = None,
    model_id: str = "scribe_v2_realtime",
    language_code: str = "en",
    sample_rate: int = 48000,
    commit_strategy: str = "vad",
    vad_silence_threshold_secs: float = 0.8,
    vad_threshold: float = 0.4,
    min_speech_duration_ms: int = 50,
    min_silence_duration_ms: int = 50,
    base_url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
) -> None:
    """
    Store configuration and initial state for the realtime STT client.

    The constructor does not open any connection; it only validates and
    records settings.

    Args:
        api_key: ElevenLabs API key. Falls back to the ELEVENLABS_API_KEY
            environment variable when omitted.
        model_id: STT model identifier.
        language_code: Language code for transcription.
        sample_rate: Input audio sample rate in Hz; must be one of
            SUPPORTED_SAMPLE_RATES.
        commit_strategy: Transcript commit strategy ("vad" by default).
        vad_silence_threshold_secs: Silence duration that marks end-of-speech.
        vad_threshold: Voice-activity detection threshold.
        min_speech_duration_ms: Minimum length of a speech segment, in ms.
        min_silence_duration_ms: Minimum silence treated as end-of-speech, in ms.
        base_url: WebSocket endpoint of the ElevenLabs STT service.

    Raises:
        ValueError: If no API key is available or the sample rate is unsupported.
    """
    super().__init__()

    resolved_key = api_key or os.getenv("ELEVENLABS_API_KEY")
    self.api_key = resolved_key
    if not resolved_key:
        raise ValueError("ElevenLabs API key must be provided via api_key or ELEVENLABS_API_KEY env var")

    # Connection / model settings.
    self.model_id = model_id
    self.language_code = language_code
    self.commit_strategy = commit_strategy
    self.base_url = base_url
    self.sample_rate = sample_rate

    if sample_rate not in SUPPORTED_SAMPLE_RATES:
        raise ValueError(f"Unsupported sample_rate: {self.sample_rate}. Supported rates: {SUPPORTED_SAMPLE_RATES}")

    # Voice-activity-detection tuning.
    self.vad_silence_threshold_secs = vad_silence_threshold_secs
    self.vad_threshold = vad_threshold
    self.min_speech_duration_ms = min_speech_duration_ms
    self.min_silence_duration_ms = min_silence_duration_ms

    # Book-keeping used to drop a partial that merely repeats the last final.
    self._last_final_text = ""
    self._last_final_time = 0.0
    self._duplicate_suppression_window = 0.75

    # Outgoing audio is accumulated and sent in fixed-size chunks:
    # 0.1 s of audio at 2 bytes per sample.
    self._stream_buffer = bytearray()
    self._target_chunk_size = int(0.1 * self.sample_rate * 2)

    self.heartbeat = 15.0
    self._session: Optional[aiohttp.ClientSession] = None
    self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
    self._ws_task: Optional[asyncio.Task] = None
87
+
88
async def process_audio(
    self,
    audio_frames: bytes,
    **kwargs: Any
) -> None:
    """
    Buffer incoming audio and forward it to ElevenLabs in fixed-size chunks.

    The WebSocket is (re)connected on demand and the response listener is
    (re)started when it is missing or has finished. Audio is down-mixed via
    ``_convert_to_mono`` and accumulated in ``_stream_buffer``; every full
    ``_target_chunk_size`` bytes is sent with ``_send_audio``.

    Args:
        audio_frames: Raw audio bytes to transcribe.
        **kwargs: Accepted for interface compatibility; unused here.

    Errors raised while converting or sending are logged, emitted as an
    "error" event, and the socket is closed. Connection errors from
    ``_connect_ws`` propagate to the caller.
    """
    if not self._ws or self._ws.closed:
        await self._connect_ws()
        if not self._ws_task or self._ws_task.done():
            self._ws_task = asyncio.create_task(self._listen_for_responses())
    elif self._ws_task and self._ws_task.done():
        logger.warning("WebSocket listener stopped unexpectedly, restarting")
        self._ws_task = asyncio.create_task(self._listen_for_responses())

    try:
        mono_audio = self._convert_to_mono(audio_frames)
        if not mono_audio:
            return

        self._stream_buffer.extend(mono_audio)

        chunk_size = self._target_chunk_size
        while len(self._stream_buffer) >= chunk_size:
            # Detach the chunk from the buffer BEFORE awaiting the send:
            # another process_audio call may run while we are suspended, and
            # it must not see (and resend) the same head-of-buffer bytes.
            # Deleting in place also avoids re-copying the whole remainder.
            chunk = bytes(self._stream_buffer[:chunk_size])
            del self._stream_buffer[:chunk_size]
            await self._send_audio(chunk)

    except Exception as e:
        logger.exception("Error in process_audio: %s", e)
        self.emit("error", str(e))
        if self._ws:
            await self._ws.close()
            self._ws = None
125
+
126
async def _connect_ws(self) -> None:
    """
    Open the realtime STT WebSocket and store it on ``self._ws``.

    An ``aiohttp.ClientSession`` is created on first use and reused after
    that. All transcription settings are passed as URL query parameters and
    the API key is sent in the ``xi-api-key`` header.

    Raises:
        Exception: Whatever ``ws_connect`` raises; it is logged and re-raised.
    """
    session = self._session
    if not session:
        session = self._session = aiohttp.ClientSession()

    query = urlencode(
        {
            "model_id": str(self.model_id),
            "language_code": str(self.language_code),
            "audio_format": f"pcm_{self.sample_rate}",
            "commit_strategy": str(self.commit_strategy),
            "vad_silence_threshold_secs": self.vad_silence_threshold_secs,
            "vad_threshold": self.vad_threshold,
            "min_speech_duration_ms": self.min_speech_duration_ms,
            "min_silence_duration_ms": self.min_silence_duration_ms,
        }
    )
    ws_url = f"{self.base_url}?{query}"
    auth_headers = {"xi-api-key": self.api_key}

    try:
        self._ws = await session.ws_connect(ws_url, headers=auth_headers, heartbeat=self.heartbeat)
    except Exception as e:
        logger.exception("Error connecting to ElevenLabs WebSocket: %s", e)
        raise
    else:
        logger.info("Connected to ElevenLabs Realtime STT WebSocket.")
150
+
151
+ async def _send_audio(self, audio_bytes: bytes) -> None:
152
+ if not self._ws:
153
+ return
154
+
155
+ payload = {
156
+ "message_type": "input_audio_chunk",
157
+ "audio_base_64": base64.b64encode(audio_bytes).decode(),
158
+ "sample_rate": self.sample_rate,
159
+ }
160
+
161
+ try:
162
+ await self._ws.send_str(json.dumps(payload))
163
+ except Exception as e:
164
+ logger.exception("Error sending audio chunk: %s", e)
165
+ self.emit("error", str(e))
166
+ await self.aclose()
167
+
168
+ def _convert_to_mono(self, audio_bytes: bytes) -> bytes:
169
+ """
170
+ Convert input audio bytes to mono.
171
+ """
172
+ if not audio_bytes:
173
+ return b""
174
+ try:
175
+ raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
176
+ if raw_audio.size == 0:
177
+ return b""
178
+
179
+ if raw_audio.size % 2 == 0:
180
+ try:
181
+ stereo = raw_audio.reshape(-1, 2).astype(np.float32)
182
+ mono = stereo.mean(axis=1)
183
+ return mono.astype(np.int16).tobytes()
184
+ except ValueError:
185
+ pass
186
+
187
+ return audio_bytes
188
+ except Exception as e:
189
+ logger.error("Error converting to mono: %s", e)
190
+ return b""
191
+
192
async def _listen_for_responses(self) -> None:
    """
    Read messages from the STT WebSocket until it closes or fails.

    Each JSON text message is passed to ``_handle_ws_event``; every
    response it returns is delivered to ``self._transcript_callback`` when
    one is set. Callback errors are logged and do not stop the listener.
    WebSocket errors are logged and emitted as "error" events.

    On exit the socket this listener was reading is closed. ``self._ws``
    and ``self._ws_task`` are cleared only if they still refer to this
    listener's socket/task, so a reconnect that happened in the meantime
    is not undone.
    """
    # Pin the socket this listener owns; self._ws may be replaced by a
    # reconnect while we are suspended.
    ws = self._ws
    if not ws:
        return

    try:
        async for msg in ws:
            if msg.type == aiohttp.WSMsgType.TEXT:
                try:
                    data = json.loads(msg.data)
                except Exception:
                    logger.debug("Received non-json ws text message")
                    continue
                if not isinstance(data, dict):
                    # _handle_ws_event expects a mapping; skip anything else
                    # instead of letting it terminate the listener.
                    logger.debug("Received non-json ws text message")
                    continue

                responses = await self._handle_ws_event(data)
                for r in responses or ():
                    if self._transcript_callback:
                        try:
                            await self._transcript_callback(r)
                        except Exception:
                            logger.exception("Error in transcript callback")
            elif msg.type == aiohttp.WSMsgType.ERROR:
                logger.error("WebSocket error: %s", ws.exception())
                self.emit("error", f"WebSocket error: {ws.exception()}")
                break
            elif msg.type == aiohttp.WSMsgType.CLOSED:
                logger.info("WebSocket closed by server.")
                break
    except asyncio.CancelledError:
        logger.debug("WebSocket listener cancelled")
    except Exception as e:
        logger.exception("Error in WebSocket listener: %s", e)
        self.emit("error", str(e))
    finally:
        try:
            await ws.close()
        except Exception:
            pass
        if self._ws is ws:
            self._ws = None
        if self._ws_task is asyncio.current_task():
            self._ws_task = None
240
+
241
async def _handle_ws_event(self, data: dict) -> List[STTResponse]:
    """
    Translate one decoded WebSocket event into zero or more STT responses.

    Handling by ``message_type``:
      * any value in ``STT_ERROR_MSGS``: logged and emitted as "error".
      * ``session_started``: emits the global "speech_session_started" event.
      * ``committed_transcript``: emits the global "speech_stopped" event,
        records the text/time of this final, and returns a FINAL response
        unless the stripped text is empty.
      * ``partial_transcript``: returns an INTERIM response (and emits the
        global "speech_started" event when the text is non-blank), unless
        the partial merely repeats the last final within the duplicate
        suppression window.
      * anything else: ignored.

    Args:
        data: JSON-decoded WebSocket message.

    Returns:
        List of STTResponse objects for this event (possibly empty).
    """
    responses: List[STTResponse] = []
    message_type = data.get("message_type")
    logger.debug("Received WS event: %s", message_type)

    if message_type in STT_ERROR_MSGS:
        logger.error("ElevenLabs STT error: %s", data)
        self.emit("error", data)
        return responses

    if message_type == "session_started":
        global_event_emitter.emit("speech_session_started")
        return responses

    if message_type == "committed_transcript":
        logger.info("==== Received final transcript event: %s", data)
        # `or` guards against the key being present with a JSON null, which
        # would otherwise raise on .strip() / float().
        text = data.get("text") or ""
        clean_text = text.strip()
        confidence = float(data.get("confidence") or 0.0)
        now = time.time()

        if clean_text:
            responses.append(
                STTResponse(
                    event_type=SpeechEventType.FINAL,
                    data=SpeechData(
                        text=clean_text,
                        confidence=confidence,
                    ),
                    metadata={"model": self.model_id, "raw_event": data},
                )
            )

        # An empty commit still marks the end of speech and resets the
        # duplicate-suppression state (clean_text is "" in that case).
        global_event_emitter.emit("speech_stopped")
        self._last_final_text = clean_text
        self._last_final_time = now
        return responses

    if message_type == "partial_transcript":
        text = data.get("text") or ""
        clean_text = text.strip()

        if (
            self._last_final_text
            and clean_text
            and clean_text == self._last_final_text
            and (time.time() - self._last_final_time) < self._duplicate_suppression_window
        ):
            logger.debug("Dropping duplicate partial matching recent final transcript")
            return responses

        responses.append(
            STTResponse(
                event_type=SpeechEventType.INTERIM,
                data=SpeechData(
                    text=text,
                    confidence=float(data.get("confidence") or 0.0),
                ),
                metadata={"model": self.model_id, "raw_event": data},
            )
        )

        if clean_text:
            global_event_emitter.emit("speech_started")

        return responses

    logger.debug("Ignoring unrecognized message_type: %s", message_type)
    return responses
325
+
326
async def aclose(self) -> None:
    """
    Close the WebSocket connection and clean up session resources.

    Cancels the listener task and waits for it, closes the WebSocket and
    the HTTP session, discards any audio still waiting in the send buffer,
    and finally calls the parent class cleanup. Errors while closing the
    socket or session are logged at debug level and otherwise ignored.
    """
    if self._ws_task:
        self._ws_task.cancel()
        try:
            await self._ws_task
        except asyncio.CancelledError:
            pass
        self._ws_task = None

    if self._ws:
        try:
            await self._ws.close()
        except Exception:
            logger.debug("Ignoring error while closing WebSocket", exc_info=True)
        self._ws = None

    if self._session:
        try:
            await self._session.close()
        except Exception:
            logger.debug("Ignoring error while closing HTTP session", exc_info=True)
        finally:
            self._session = None

    # A partial chunk left over from the closed stream must not be
    # prepended to audio sent on a later connection.
    self._stream_buffer.clear()

    await super().aclose()
@@ -1,14 +1,16 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any, AsyncIterator, Literal, Optional, Union
3
+ from typing import Any, AsyncIterator, Optional, Union
4
4
  import os
5
5
  import httpx
6
6
  import asyncio
7
7
  import json
8
8
  import aiohttp
9
+ import weakref
9
10
  from dataclasses import dataclass
10
-
11
- from videosdk.agents import TTS
11
+ from videosdk.agents import TTS, segment_text
12
+ import base64
13
+ import uuid
12
14
 
13
15
  ELEVENLABS_SAMPLE_RATE = 24000
14
16
  ELEVENLABS_CHANNELS = 1
@@ -16,6 +18,7 @@ ELEVENLABS_CHANNELS = 1
16
18
  DEFAULT_MODEL = "eleven_flash_v2_5"
17
19
  DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
18
20
  API_BASE_URL = "https://api.elevenlabs.io/v1"
21
+ WS_INACTIVITY_TIMEOUT = 300
19
22
 
20
23
 
21
24
  @dataclass
@@ -30,16 +33,32 @@ class ElevenLabsTTS(TTS):
30
33
  def __init__(
31
34
  self,
32
35
  *,
36
+ api_key: str | None = None,
33
37
  model: str = DEFAULT_MODEL,
34
38
  voice: str = DEFAULT_VOICE_ID,
35
39
  speed: float = 1.0,
36
- api_key: str | None = None,
37
40
  response_format: str = "pcm_24000",
38
41
  voice_settings: VoiceSettings | None = None,
39
42
  base_url: str = API_BASE_URL,
40
- enable_streaming: bool = False,
43
+ enable_streaming: bool = True,
44
+ inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
41
45
  ) -> None:
42
- super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)
46
+ """Initialize the ElevenLabs TTS plugin.
47
+
48
+ Args:
49
+ api_key (Optional[str], optional): ElevenLabs API key. Uses ELEVENLABS_API_KEY environment variable if not provided. Defaults to None.
50
+ model (str): The model to use for the TTS plugin. Defaults to "eleven_flash_v2_5".
51
+ voice (str): The voice to use for the TTS plugin. Defaults to "EXAVITQu4vr4xnSDxMaL".
52
+ speed (float): The speed to use for the TTS plugin. Defaults to 1.0.
53
+ response_format (str): The response format to use for the TTS plugin. Defaults to "pcm_24000".
54
+ voice_settings (Optional[VoiceSettings], optional): The voice settings to use for the TTS plugin. Defaults to None.
55
+ base_url (str): The base URL to use for the TTS plugin. Defaults to "https://api.elevenlabs.io/v1".
56
+ enable_streaming (bool): Whether to enable streaming for the TTS plugin. Defaults to True.
57
+ inactivity_timeout (int): The inactivity timeout to use for the TTS plugin. Defaults to 300.
58
+ """
59
+ super().__init__(
60
+ sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS
61
+ )
43
62
 
44
63
  self.model = model
45
64
  self.voice = voice
@@ -50,16 +69,35 @@ class ElevenLabsTTS(TTS):
50
69
  self.base_url = base_url
51
70
  self.enable_streaming = enable_streaming
52
71
  self.voice_settings = voice_settings or VoiceSettings()
53
-
72
+ self.inactivity_timeout = inactivity_timeout
73
+ self._first_chunk_sent = False
74
+ self._ws_session = None
75
+ self._ws_connection = None
54
76
  self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
55
77
  if not self.api_key:
56
- raise ValueError("ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
78
+ raise ValueError(
79
+ "ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")
57
80
 
58
81
  self._session = httpx.AsyncClient(
59
- timeout=httpx.Timeout(connect=15.0, read=30.0, write=5.0, pool=5.0),
82
+ timeout=httpx.Timeout(connect=15.0, read=30.0,
83
+ write=5.0, pool=5.0),
60
84
  follow_redirects=True,
61
85
  )
62
86
 
87
+ self._streams = weakref.WeakSet()
88
+ self._send_task: asyncio.Task | None = None
89
+ self._recv_task: asyncio.Task | None = None
90
+ self._should_stop = False
91
+
92
+ self._connection_lock = asyncio.Lock()
93
+ self._ws_voice_id: str | None = None
94
+ self._active_contexts: set[str] = set()
95
+ self._context_futures: dict[str, asyncio.Future[None]] = {}
96
+
97
def reset_first_audio_tracking(self) -> None:
    """Clear the first-chunk flag so it is unset again for the next TTS task."""
    setattr(self, "_first_chunk_sent", False)
100
+
63
101
  async def synthesize(
64
102
  self,
65
103
  text: AsyncIterator[str] | str,
@@ -67,23 +105,23 @@ class ElevenLabsTTS(TTS):
67
105
  **kwargs: Any,
68
106
  ) -> None:
69
107
  try:
70
- if isinstance(text, AsyncIterator):
71
- full_text = ""
72
- async for chunk in text:
73
- full_text += chunk
74
- else:
75
- full_text = text
76
-
77
108
  if not self.audio_track or not self.loop:
78
109
  self.emit("error", "Audio track or event loop not set")
79
110
  return
80
111
 
81
112
  target_voice = voice_id or self.voice
113
+ self._should_stop = False
82
114
 
83
115
  if self.enable_streaming:
84
- await self._stream_synthesis(full_text, target_voice)
116
+ await self._stream_synthesis(text, target_voice)
85
117
  else:
86
- await self._chunked_synthesis(full_text, target_voice)
118
+ if isinstance(text, AsyncIterator):
119
+ async for segment in segment_text(text):
120
+ if self._should_stop:
121
+ break
122
+ await self._chunked_synthesis(segment, target_voice)
123
+ else:
124
+ await self._chunked_synthesis(text, target_voice)
87
125
 
88
126
  except Exception as e:
89
127
  self.emit("error", f"TTS synthesis failed: {str(e)}")
@@ -91,17 +129,17 @@ class ElevenLabsTTS(TTS):
91
129
  async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
92
130
  """Non-streaming synthesis using the standard API"""
93
131
  url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
94
-
132
+
95
133
  params = {
96
134
  "model_id": self.model,
97
135
  "output_format": self.response_format,
98
136
  }
99
-
137
+
100
138
  headers = {
101
139
  "xi-api-key": self.api_key,
102
140
  "Content-Type": "application/json",
103
141
  }
104
-
142
+
105
143
  payload = {
106
144
  "text": text,
107
145
  "voice_settings": {
@@ -114,83 +152,251 @@ class ElevenLabsTTS(TTS):
114
152
 
115
153
  try:
116
154
  async with self._session.stream(
117
- "POST",
118
- url,
119
- headers=headers,
155
+ "POST",
156
+ url,
157
+ headers=headers,
120
158
  json=payload,
121
159
  params=params
122
160
  ) as response:
123
161
  response.raise_for_status()
124
-
162
+
125
163
  async for chunk in response.aiter_bytes():
164
+ if self._should_stop:
165
+ break
126
166
  if chunk:
127
- self.loop.create_task(self.audio_track.add_new_bytes(chunk))
128
-
167
+ await self._stream_audio_chunks(chunk)
168
+
129
169
  except httpx.HTTPStatusError as e:
130
- self.emit("error", f"HTTP error {e.response.status_code}: {e.response.text}")
170
+ self.emit(
171
+ "error", f"HTTP error {e.response.status_code}: {e.response.text}")
131
172
  except Exception as e:
132
173
  self.emit("error", f"Chunked synthesis failed: {str(e)}")
133
174
 
134
- async def _stream_synthesis(self, text: str, voice_id: str) -> None:
135
- """WebSocket-based streaming synthesis"""
136
- ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
137
-
138
- params = {
139
- "model_id": self.model,
140
- "output_format": self.response_format,
141
- }
142
-
143
- param_string = "&".join([f"{k}={v}" for k, v in params.items()])
144
- full_ws_url = f"{ws_url}?{param_string}"
145
-
146
- headers = {"xi-api-key": self.api_key}
147
-
175
async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
    """
    WebSocket-based streaming synthesis using the multi-context connection.

    A fresh context id is registered together with a future that the
    receive loop resolves when that context finishes. Text is sent from a
    background task: a plain string is segmented with ``segment_text`` and
    each segment is flushed; an async iterator is forwarded chunk by chunk
    and flushed once at the end. The context is then closed.

    If anything fails, an "error" event is emitted and synthesis falls back
    to ``_chunked_synthesis`` (per segment for iterator input).

    Args:
        text: Full text, or an async iterator yielding text chunks.
        voice_id: ElevenLabs voice to synthesize with.
    """
    context_id = None
    sender = None
    try:
        await self._ensure_connection(voice_id)

        context_id = uuid.uuid4().hex[:12]
        # We are inside a coroutine, so the running loop is the right owner.
        done_future: asyncio.Future[None] = asyncio.get_running_loop().create_future()
        self.register_context(context_id, done_future)

        async def _single_chunk_gen(s: str) -> AsyncIterator[str]:
            yield s

        async def _send_chunks() -> None:
            try:
                first_message_sent = False
                if isinstance(text, str):
                    async for segment in segment_text(_single_chunk_gen(text)):
                        if self._should_stop:
                            break
                        # Voice settings accompany only the first message.
                        await self.send_text(
                            context_id,
                            f"{segment} ",
                            voice_settings=None if first_message_sent else self._voice_settings_dict(),
                            flush=True,
                        )
                        first_message_sent = True
                else:
                    async for chunk in text:
                        if self._should_stop:
                            break
                        await self.send_text(
                            context_id,
                            f"{chunk} ",
                            voice_settings=None if first_message_sent else self._voice_settings_dict(),
                        )
                        first_message_sent = True

                if not self._should_stop:
                    await self.flush_context(context_id)
                await self.close_context(context_id)
            except Exception as e:
                # Surface send failures to the waiter below.
                if not done_future.done():
                    done_future.set_exception(e)

        sender = asyncio.create_task(_send_chunks())

        await done_future
        await sender

    except Exception as e:
        # Stop the sender before falling back: left running it would keep
        # pulling from `text` concurrently with the fallback loop below.
        if sender is not None and not sender.done():
            sender.cancel()
            try:
                await sender
            except (asyncio.CancelledError, Exception):
                pass
        # Drop the registration so the failed context does not linger.
        if context_id is not None:
            self._context_futures.pop(context_id, None)
            self._active_contexts.discard(context_id)

        self.emit("error", f"Streaming synthesis failed: {str(e)}")

        if isinstance(text, str):
            await self._chunked_synthesis(text, voice_id)
        else:
            async for segment in segment_text(text):
                if self._should_stop:
                    break
                await self._chunked_synthesis(segment, voice_id)
228
+
229
+ def _voice_settings_dict(self) -> dict[str, Any]:
230
+ return {
231
+ "stability": self.voice_settings.stability,
232
+ "similarity_boost": self.voice_settings.similarity_boost,
233
+ "style": self.voice_settings.style,
234
+ "use_speaker_boost": self.voice_settings.use_speaker_boost,
235
+ }
236
+
237
+ async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
238
+ if not audio_bytes or self._should_stop:
239
+ return
240
+
241
+ if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
242
+ self._first_chunk_sent = True
243
+ asyncio.create_task(self._first_audio_callback())
244
+
245
+ if self.audio_track and self.loop:
246
+ await self.audio_track.add_new_bytes(audio_bytes)
247
+
248
async def interrupt(self) -> None:
    """Stop the current synthesis: set the stop flag, interrupt the audio track, close all contexts."""
    self._should_stop = True

    track = self.audio_track
    if track:
        track.interrupt()

    await self.close_all_contexts()
256
+
187
257
async def aclose(self) -> None:
    """
    Cleanup resources.

    Sets the stop flag, cancels the send/receive tasks, releases anything
    still waiting on a synthesis context, closes tracked streams, asks the
    server to close the socket (best effort) and closes the WebSocket, its
    session and the HTTP client, then calls the parent cleanup.
    """
    self._should_stop = True

    for task in [self._send_task, self._recv_task]:
        if task and not task.done():
            task.cancel()

    # The receive loop is being cancelled, so nothing else will ever
    # resolve these futures. Complete them so a synthesis call awaiting one
    # returns instead of hanging; the stop flag is already set.
    for fut in self._context_futures.values():
        if not fut.done():
            fut.set_result(None)
    self._context_futures.clear()
    self._active_contexts.clear()

    for stream in list(self._streams):
        try:
            await stream.aclose()
        except Exception:
            pass

    self._streams.clear()

    if self._ws_connection and not self._ws_connection.closed:
        try:
            await self._ws_connection.send_str(json.dumps({"close_socket": True}))
        except Exception:
            pass
        await self._ws_connection.close()
    if self._ws_session and not self._ws_session.closed:
        await self._ws_session.close()
    self._ws_connection = None
    self._ws_session = None
    if self._session:
        await self._session.aclose()
    await super().aclose()
192
286
 
193
- async def interrupt(self) -> None:
194
- """Interrupt the TTS process"""
195
- if self.audio_track:
196
- self.audio_track.interrupt()
287
async def _ensure_connection(self, voice_id: str) -> None:
    """
    Make sure an open multi-context WebSocket exists for ``voice_id``.

    Under the connection lock: if an open socket for the same voice already
    exists it is reused. Otherwise any open socket and session are closed,
    contexts belonging to the old socket are invalidated, a new session and
    socket are opened (10 s connect timeout), and the receive loop is
    restarted.

    Args:
        voice_id: Voice the connection must be bound to.

    Raises:
        asyncio.TimeoutError: If the WebSocket does not connect in time.
        Exception: Any error raised by the WebSocket connect call.
    """
    # Local import: urlencode is not among the imports visible for this module.
    from urllib.parse import urlencode

    async with self._connection_lock:
        if self._ws_connection and not self._ws_connection.closed and self._ws_voice_id == voice_id:
            return

        if self._ws_connection and not self._ws_connection.closed:
            try:
                await self._ws_connection.send_str(json.dumps({"close_socket": True}))
            except Exception:
                pass
            await self._ws_connection.close()
        if self._ws_session and not self._ws_session.closed:
            await self._ws_session.close()

        # Every context registered so far was opened on the socket being
        # replaced. Fail its waiter (nothing will resolve it any more) and
        # forget its id so it is not "closed" again on the new socket.
        for fut in self._context_futures.values():
            if not fut.done():
                fut.set_exception(RuntimeError("WebSocket connection was replaced"))
        self._context_futures.clear()
        self._active_contexts.clear()

        self._ws_session = aiohttp.ClientSession()
        self._ws_voice_id = voice_id

        ws_url = f"{self.base_url}/text-to-speech/{voice_id}/multi-stream-input".replace("https://", "wss://").replace("http://", "ws://")
        params = {
            "model_id": self.model,
            "output_format": self.response_format,
            "inactivity_timeout": self.inactivity_timeout,
        }
        # urlencode escapes values that are not URL-safe; plain "k=v" joining did not.
        full_ws_url = f"{ws_url}?{urlencode(params)}"
        headers = {"xi-api-key": self.api_key}
        self._ws_connection = await asyncio.wait_for(self._ws_session.ws_connect(full_ws_url, headers=headers), timeout=10.0)

        if self._recv_task and not self._recv_task.done():
            self._recv_task.cancel()
        self._recv_task = asyncio.create_task(self._recv_loop())
318
+
319
def register_context(self, context_id: str, done_future: asyncio.Future[None]) -> None:
    """Record ``done_future`` under ``context_id``, replacing any existing entry for that id."""
    futures_by_context = self._context_futures
    futures_by_context[context_id] = done_future
321
+
322
async def send_text(
    self,
    context_id: str,
    text: str,
    *,
    voice_settings: Optional[dict[str, Any]] = None,
    flush: bool = False,
) -> None:
    """
    Send a piece of text for ``context_id`` over the WebSocket.

    The first time a context id is seen, an initial message with a single
    space as text (plus ``voice_settings`` when given) is sent first and
    the id is added to the active set. The text message itself carries
    ``flush: true`` when ``flush`` is set.

    Raises:
        RuntimeError: If there is no open WebSocket connection.
    """
    ws = self._ws_connection
    if not ws or ws.closed:
        raise RuntimeError("WebSocket connection is closed")

    if context_id not in self._active_contexts:
        opening: dict[str, Any] = {"context_id": context_id, "text": " "}
        if voice_settings:
            opening["voice_settings"] = voice_settings
        await ws.send_str(json.dumps(opening))
        self._active_contexts.add(context_id)

    message: dict[str, Any] = {"context_id": context_id, "text": text}
    if flush:
        message["flush"] = True
    await self._ws_connection.send_str(json.dumps(message))
347
+
348
async def flush_context(self, context_id: str) -> None:
    """Send a flush message for ``context_id``; silently do nothing without an open WebSocket."""
    ws = self._ws_connection
    if ws and not ws.closed:
        flush_message = {"context_id": context_id, "flush": True}
        await ws.send_str(json.dumps(flush_message))
352
+
353
async def close_context(self, context_id: str) -> None:
    """Send a close message for ``context_id``; silently do nothing without an open WebSocket."""
    ws = self._ws_connection
    if ws and not ws.closed:
        close_message = {"context_id": context_id, "close_context": True}
        await ws.send_str(json.dumps(close_message))
357
+
358
async def close_all_contexts(self) -> None:
    """
    Ask the server to close every context currently marked active.

    Best effort: the first error stops the sweep and is swallowed. The
    active set is iterated over a snapshot and is not modified here.
    """
    try:
        snapshot = list(self._active_contexts)
        for ctx in snapshot:
            await self.close_context(ctx)
    except Exception:
        return
364
+
365
async def _recv_loop(self) -> None:
    """
    Receive messages from the multi-context WebSocket until it closes.

    For each JSON text message:
      * ``error``: the matching context's future is failed and forgotten.
      * ``audio``: base64 audio is decoded and written to the audio track;
        the first chunk also schedules the first-audio callback if one is set.
      * ``is_final`` / ``isFinal``: the matching context's future is
        completed and the context is dropped from the active set.

    When the loop stops because the socket closed or an error occurred,
    every still-pending context future is failed so no caller waits
    forever. Cancellation propagates without touching the futures: the task
    is cancelled when a newer connection takes over, and that connection
    may already have registered futures of its own.
    """
    # Pin the socket this loop serves; self._ws_connection can be replaced.
    ws = self._ws_connection
    stop_types = (
        aiohttp.WSMsgType.CLOSED,
        aiohttp.WSMsgType.CLOSE,
        aiohttp.WSMsgType.CLOSING,
        aiohttp.WSMsgType.ERROR,
    )
    cause = None
    try:
        while ws is not None and not ws.closed:
            msg = await ws.receive()
            if msg.type == aiohttp.WSMsgType.TEXT:
                data = json.loads(msg.data)

                if data.get("error"):
                    ctx_id = data.get("contextId")
                    # Pop rather than get: a failed context is finished.
                    fut = self._context_futures.pop(ctx_id, None)
                    self._active_contexts.discard(ctx_id)
                    if fut and not fut.done():
                        fut.set_exception(RuntimeError(data["error"]))
                    continue

                if data.get("audio"):
                    audio_chunk = base64.b64decode(data["audio"]) if isinstance(data["audio"], str) else None
                    if audio_chunk:
                        if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
                            self._first_chunk_sent = True
                            asyncio.create_task(self._first_audio_callback())
                        if self.audio_track:
                            await self.audio_track.add_new_bytes(audio_chunk)

                if data.get("is_final") or data.get("isFinal"):
                    ctx_id = data.get("contextId")
                    if ctx_id:
                        fut = self._context_futures.pop(ctx_id, None)
                        self._active_contexts.discard(ctx_id)
                        if fut and not fut.done():
                            fut.set_result(None)

            elif msg.type in stop_types:
                break
    except Exception as exc:
        cause = exc
        reason = "WebSocket receive loop error"
    else:
        reason = "WebSocket connection closed"
        if self._ws_connection is not ws:
            # Superseded or torn down elsewhere; the remaining futures are
            # not this loop's to fail.
            return

    for fut in list(self._context_futures.values()):
        if not fut.done():
            failure = RuntimeError(reason)
            failure.__cause__ = cause
            fut.set_exception(failure)
    self._context_futures.clear()
    self._active_contexts.clear()
@@ -1 +1 @@
1
- __version__ = "0.0.3"
1
+ __version__ = "0.0.49"
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videosdk-plugins-elevenlabs
3
- Version: 0.0.3
3
+ Version: 0.0.49
4
4
  Summary: VideoSDK Agent Framework plugin for ElevenLabs
5
5
  Author: videosdk
6
+ License-Expression: Apache-2.0
6
7
  Keywords: ai,audio,elevenlabs,video,videosdk
7
8
  Classifier: Development Status :: 4 - Beta
8
9
  Classifier: Intended Audience :: Developers
@@ -11,12 +12,12 @@ Classifier: Topic :: Multimedia :: Sound/Audio
11
12
  Classifier: Topic :: Multimedia :: Video
12
13
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
14
  Requires-Python: >=3.11
14
- Requires-Dist: videosdk-agents>=0.0.15
15
+ Requires-Dist: videosdk-agents>=0.0.49
15
16
  Description-Content-Type: text/markdown
16
17
 
17
- VideoSDK ElevenLabs Plugin
18
+ # VideoSDK ElevenLabs Plugin
18
19
 
19
- Agent Framework plugin for tts services from ElevenLabs.
20
+ Agent Framework plugin for TTS and STT services from ElevenLabs.
20
21
 
21
22
  ## Installation
22
23
 
@@ -0,0 +1,7 @@
1
+ videosdk/plugins/elevenlabs/__init__.py,sha256=g33CP7YD-GB32-U5RAkRAtoNNaRG7oVy5iqk-LKz0Aw,139
2
+ videosdk/plugins/elevenlabs/stt.py,sha256=3Vbs_9yYROhNAbBzPEUqzdhrpdO6A6zq7TRvby617rM,12881
3
+ videosdk/plugins/elevenlabs/tts.py,sha256=LWn5AG3lssQ1zxWfJ1GLDFZi1cCGO2FKmxy20gcm3dQ,16033
4
+ videosdk/plugins/elevenlabs/version.py,sha256=LuIJFrM65iX-YC6KaWH9iJWJKBv1GHcHHucNCmnVUqo,23
5
+ videosdk_plugins_elevenlabs-0.0.49.dist-info/METADATA,sha256=E30JnazHE_j1EFaTSVVv2gm5HuOI7_HofN3QAMSqEH8,779
6
+ videosdk_plugins_elevenlabs-0.0.49.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
7
+ videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.28.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,6 +0,0 @@
1
- videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
2
- videosdk/plugins/elevenlabs/tts.py,sha256=l51CgdxHPgoR-Q2Q4FmSzD-Hi_Hz0MDSxbDIc8jwPck,7092
3
- videosdk/plugins/elevenlabs/version.py,sha256=k5tJXhBQJ4l9fKHJ76K5w98zBHoYvNk9r-UNH6eQ2-k,21
4
- videosdk_plugins_elevenlabs-0.0.3.dist-info/METADATA,sha256=j-KpXkh45CmJkAjh7L_Yro5-mOwVdw57panNJC9YKHg,745
5
- videosdk_plugins_elevenlabs-0.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
6
- videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD,,