videosdk-plugins-elevenlabs 0.0.3__py3-none-any.whl → 0.0.49__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videosdk/plugins/elevenlabs/__init__.py +2 -1
- videosdk/plugins/elevenlabs/stt.py +356 -0
- videosdk/plugins/elevenlabs/tts.py +285 -79
- videosdk/plugins/elevenlabs/version.py +1 -1
- {videosdk_plugins_elevenlabs-0.0.3.dist-info → videosdk_plugins_elevenlabs-0.0.49.dist-info}/METADATA +5 -4
- videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD +7 -0
- {videosdk_plugins_elevenlabs-0.0.3.dist-info → videosdk_plugins_elevenlabs-0.0.49.dist-info}/WHEEL +1 -1
- videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD +0 -6
videosdk/plugins/elevenlabs/stt.py
@@ -0,0 +1,356 @@
+from __future__ import annotations
+
+import asyncio
+import base64
+import json
+import os
+import logging
+import time
+from typing import Any, Optional, List
+from urllib.parse import urlencode
+import aiohttp
+import numpy as np
+from videosdk.agents import STT as BaseSTT, STTResponse, SpeechEventType, SpeechData, global_event_emitter
+
+logger = logging.getLogger(__name__)
+
+STT_ERROR_MSGS = {"input_error", "auth_error", "quota_exceeded", "transcriber_error", "error"}
+SUPPORTED_SAMPLE_RATES = {8000, 16000, 22050, 24000, 44100, 48000}
+
+class ElevenLabsSTT(BaseSTT):
+    """
+    ElevenLabs Realtime Speech-to-Text (STT) client.
+    """
+
+    def __init__(
+        self,
+        *,
+        api_key: str | None = None,
+        model_id: str = "scribe_v2_realtime",
+        language_code: str = "en",
+        sample_rate: int = 48000,
+        commit_strategy: str = "vad",
+        vad_silence_threshold_secs: float = 0.8,
+        vad_threshold: float = 0.4,
+        min_speech_duration_ms: int = 50,
+        min_silence_duration_ms: int = 50,
+        base_url: str = "wss://api.elevenlabs.io/v1/speech-to-text/realtime",
+    ) -> None:
+        """
+        Initialize the ElevenLabs STT client.
+
+        Args:
+            api_key: ElevenLabs API key for authentication. Defaults to env variable ELEVENLABS_API_KEY.
+            model_id: STT model identifier.
+            language_code: Language code for transcription.
+            sample_rate: Sample rate of input audio in Hz.
+            commit_strategy: Strategy for committing transcripts (defaults to 'vad').
+            vad_silence_threshold_secs: Duration of silence to detect end-of-speech.
+            vad_threshold: Threshold for detecting voice activity.
+            min_speech_duration_ms: Minimum duration in milliseconds for a speech segment.
+            min_silence_duration_ms: Minimum duration in milliseconds of silence to consider end-of-speech.
+            base_url: WebSocket endpoint for ElevenLabs STT.
+        Raises:
+            ValueError: If required parameters are missing or invalid.
+        """
+        super().__init__()
+
+        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
+        if not self.api_key:
+            raise ValueError("ElevenLabs API key must be provided via api_key or ELEVENLABS_API_KEY env var")
+
+        self.model_id = model_id
+        self.language_code = language_code
+        self.commit_strategy = commit_strategy
+        self.base_url = base_url
+        self.sample_rate = sample_rate
+
+        if self.sample_rate not in SUPPORTED_SAMPLE_RATES:
+            raise ValueError(f"Unsupported sample_rate: {self.sample_rate}. Supported rates: {SUPPORTED_SAMPLE_RATES}")
+
+        self.vad_silence_threshold_secs = vad_silence_threshold_secs
+        self.vad_threshold = vad_threshold
+        self.min_speech_duration_ms = min_speech_duration_ms
+        self.min_silence_duration_ms = min_silence_duration_ms
+
+        self._last_final_text = ""
+        self._last_final_time = 0.0
+        self._duplicate_suppression_window = 0.75
+
+        self._stream_buffer = bytearray()
+        self._target_chunk_size = int(0.1 * self.sample_rate * 2)
+
+        self.heartbeat = 15.0
+        self._session: Optional[aiohttp.ClientSession] = None
+        self._ws: Optional[aiohttp.ClientWebSocketResponse] = None
+        self._ws_task: Optional[asyncio.Task] = None
+
+    async def process_audio(
+        self,
+        audio_frames: bytes,
+        **kwargs: Any
+    ) -> None:
+        """
+        Process and send audio frames.
+        Converts to mono (required by ElevenLabs) and buffers 100ms chunks to reduce overhead.
+        """
+
+        if not self._ws or self._ws.closed:
+            await self._connect_ws()
+            if not self._ws_task or self._ws_task.done():
+                self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        elif self._ws_task and self._ws_task.done():
+            logger.warning("WebSocket listener stopped unexpectedly, restarting")
+            self._ws_task = asyncio.create_task(self._listen_for_responses())
+
+        try:
+            mono_audio = self._convert_to_mono(audio_frames)
+            if not mono_audio:
+                return
+
+            self._stream_buffer.extend(mono_audio)
+
+            while len(self._stream_buffer) >= self._target_chunk_size:
+                chunk = self._stream_buffer[:self._target_chunk_size]
+                await self._send_audio(chunk)
+                self._stream_buffer = self._stream_buffer[self._target_chunk_size:]
+
+        except Exception as e:
+            logger.exception("Error in process_audio: %s", e)
+            self.emit("error", str(e))
+            if self._ws:
+                await self._ws.close()
+                self._ws = None
+
+    async def _connect_ws(self) -> None:
+        if not self._session:
+            self._session = aiohttp.ClientSession()
+
+        query_params = {
+            "model_id": str(self.model_id),
+            "language_code": str(self.language_code),
+            "audio_format": f"pcm_{self.sample_rate}",
+            "commit_strategy": str(self.commit_strategy),
+            "vad_silence_threshold_secs": self.vad_silence_threshold_secs,
+            "vad_threshold": self.vad_threshold,
+            "min_speech_duration_ms": self.min_speech_duration_ms,
+            "min_silence_duration_ms": self.min_silence_duration_ms,
+        }
+
+        ws_url = f"{self.base_url}?{urlencode(query_params)}"
+        headers = {"xi-api-key": self.api_key}
+
+        try:
+            self._ws = await self._session.ws_connect(ws_url, headers=headers, heartbeat=self.heartbeat)
+            logger.info("Connected to ElevenLabs Realtime STT WebSocket.")
+        except Exception as e:
+            logger.exception("Error connecting to ElevenLabs WebSocket: %s", e)
+            raise
+
+    async def _send_audio(self, audio_bytes: bytes) -> None:
+        if not self._ws:
+            return
+
+        payload = {
+            "message_type": "input_audio_chunk",
+            "audio_base_64": base64.b64encode(audio_bytes).decode(),
+            "sample_rate": self.sample_rate,
+        }
+
+        try:
+            await self._ws.send_str(json.dumps(payload))
+        except Exception as e:
+            logger.exception("Error sending audio chunk: %s", e)
+            self.emit("error", str(e))
+            await self.aclose()
+
+    def _convert_to_mono(self, audio_bytes: bytes) -> bytes:
+        """
+        Convert input audio bytes to mono.
+        """
+        if not audio_bytes:
+            return b""
+        try:
+            raw_audio = np.frombuffer(audio_bytes, dtype=np.int16)
+            if raw_audio.size == 0:
+                return b""
+
+            if raw_audio.size % 2 == 0:
+                try:
+                    stereo = raw_audio.reshape(-1, 2).astype(np.float32)
+                    mono = stereo.mean(axis=1)
+                    return mono.astype(np.int16).tobytes()
+                except ValueError:
+                    pass
+
+            return audio_bytes
+        except Exception as e:
+            logger.error("Error converting to mono: %s", e)
+            return b""
+
+    async def _listen_for_responses(self) -> None:
+        """
+        Listen for incoming WebSocket messages from ElevenLabs STT.
+        """
+        if not self._ws:
+            return
+
+        try:
+            async for msg in self._ws:
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = None
+                    try:
+                        data = msg.json()
+                    except Exception:
+                        try:
+                            data = json.loads(msg.data)
+                        except Exception:
+                            logger.debug("Received non-json ws text message")
+                            continue
+
+                    responses = await self._handle_ws_event(data)
+                    if responses:
+                        for r in responses:
+                            if self._transcript_callback:
+                                try:
+                                    await self._transcript_callback(r)
+                                except Exception:
+                                    logger.exception("Error in transcript callback")
+                elif msg.type == aiohttp.WSMsgType.ERROR:
+                    logger.error("WebSocket error: %s", self._ws.exception())
+                    self.emit("error", f"WebSocket error: {self._ws.exception()}")
+                    break
+                elif msg.type == aiohttp.WSMsgType.CLOSED:
+                    logger.info("WebSocket closed by server.")
+                    break
+        except asyncio.CancelledError:
+            logger.debug("WebSocket listener cancelled")
+        except Exception as e:
+            logger.exception("Error in WebSocket listener: %s", e)
+            self.emit("error", str(e))
+        finally:
+            if self._ws:
+                try:
+                    await self._ws.close()
+                except Exception:
+                    pass
+            self._ws = None
+            self._ws_task = None
+
+    async def _handle_ws_event(self, data: dict) -> List[STTResponse]:
+        """
+        Process a single WebSocket event from ElevenLabs STT.
+
+        Args:
+            data: JSON-decoded WebSocket message.
+
+        Returns:
+            List of STTResponse objects for this event.
+        """
+        responses: List[STTResponse] = []
+        message_type = data.get("message_type")
+        logger.debug("Received WS event: %s", message_type)
+
+        if message_type in STT_ERROR_MSGS:
+            logger.error("ElevenLabs STT error: %s", data)
+            self.emit("error", data)
+            return responses
+
+        if message_type == "session_started":
+            global_event_emitter.emit("speech_session_started")
+            return responses
+
+        if message_type == "committed_transcript":
+            logger.info("==== Received final transcript event: %s", data)
+            text = data.get("text", "")
+            clean_text = text.strip()
+            confidence = float(data.get("confidence", 0.0))
+            now = time.time()
+
+            if clean_text == "":
+                global_event_emitter.emit("speech_stopped")
+                self._last_final_text = ""
+                self._last_final_time = now
+                return responses
+
+            resp = STTResponse(
+                event_type=SpeechEventType.FINAL,
+                data=SpeechData(
+                    text=clean_text,
+                    confidence=confidence,
+                ),
+                metadata={"model": self.model_id, "raw_event": data},
+            )
+            responses.append(resp)
+
+            global_event_emitter.emit("speech_stopped")
+            self._last_final_text = clean_text
+            self._last_final_time = now
+            return responses
+
+
+        if message_type == "partial_transcript":
+            text = data.get("text", "")
+            clean_text = text.strip()
+
+            if (
+                self._last_final_text
+                and clean_text
+                and clean_text == self._last_final_text
+                and (time.time() - self._last_final_time) < self._duplicate_suppression_window
+            ):
+                logger.debug("Dropping duplicate partial matching recent final transcript")
+                return responses
+
+            resp = STTResponse(
+                event_type=SpeechEventType.INTERIM,
+                data=SpeechData(
+                    text=text,
+                    confidence=float(data.get("confidence", 0.0)),
+                ),
+                metadata={"model": self.model_id, "raw_event": data},
+            )
+            responses.append(resp)
+
+            if clean_text:
+                global_event_emitter.emit("speech_started")
+
+            return responses
+
+
+
+        logger.debug("Ignoring unrecognized message_type: %s", message_type)
+        return responses
+
+    async def aclose(self) -> None:
+        """
+        Close the WebSocket connection and cleanup session resources.
+
+        Cancels the listener task, closes WebSocket and HTTP session,
+        and calls the parent class cleanup.
+        """
+        if self._ws_task:
+            self._ws_task.cancel()
+            try:
+                await self._ws_task
+            except asyncio.CancelledError:
+                pass
+            self._ws_task = None
+
+        if self._ws:
+            try:
+                await self._ws.close()
+            except Exception:
+                pass
+            self._ws = None
+
+        if self._session:
+            try:
+                await self._session.close()
+            except Exception:
+                pass
+            finally:
+                self._session = None
+
+        await super().aclose()
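The new STT module is self-contained, so a short driver helps orient review. The sketch below is hypothetical: it assumes the two-line change to __init__.py re-exports ElevenLabsSTT and that ELEVENLABS_API_KEY is set; fake_pcm_frames and on_transcript are illustrative placeholders, and the callback is wired through the private _transcript_callback attribute that _listen_for_responses reads (the public registration hook belongs to the videosdk-agents base class, not this diff).

import asyncio

from videosdk.plugins.elevenlabs import ElevenLabsSTT  # assumed re-export


async def fake_pcm_frames():
    # Placeholder source: ten 100 ms frames of silent 48 kHz stereo 16-bit PCM;
    # _convert_to_mono downmixes interleaved stereo before sending.
    for _ in range(10):
        yield b"\x00" * (4800 * 2 * 2)
        await asyncio.sleep(0.1)


async def main() -> None:
    stt = ElevenLabsSTT(sample_rate=48000)  # api_key falls back to ELEVENLABS_API_KEY

    async def on_transcript(resp) -> None:
        # INTERIM responses come from partial_transcript events,
        # FINAL responses from committed_transcript events.
        print(resp.event_type, resp.data.text)

    stt._transcript_callback = on_transcript

    async for frame in fake_pcm_frames():
        await stt.process_audio(frame)  # buffered into 100 ms chunks internally

    await stt.aclose()


asyncio.run(main())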
videosdk/plugins/elevenlabs/tts.py
@@ -1,14 +1,16 @@
 from __future__ import annotations

-from typing import Any, AsyncIterator,
+from typing import Any, AsyncIterator, Optional, Union
 import os
 import httpx
 import asyncio
 import json
 import aiohttp
+import weakref
 from dataclasses import dataclass
-
-
+from videosdk.agents import TTS, segment_text
+import base64
+import uuid

 ELEVENLABS_SAMPLE_RATE = 24000
 ELEVENLABS_CHANNELS = 1
@@ -16,6 +18,7 @@ ELEVENLABS_CHANNELS = 1
 DEFAULT_MODEL = "eleven_flash_v2_5"
 DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
 API_BASE_URL = "https://api.elevenlabs.io/v1"
+WS_INACTIVITY_TIMEOUT = 300


 @dataclass
@@ -30,16 +33,32 @@ class ElevenLabsTTS(TTS):
     def __init__(
         self,
         *,
+        api_key: str | None = None,
         model: str = DEFAULT_MODEL,
         voice: str = DEFAULT_VOICE_ID,
         speed: float = 1.0,
-        api_key: str | None = None,
         response_format: str = "pcm_24000",
         voice_settings: VoiceSettings | None = None,
         base_url: str = API_BASE_URL,
-        enable_streaming: bool =
+        enable_streaming: bool = True,
+        inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
     ) -> None:
-
+        """Initialize the ElevenLabs TTS plugin.
+
+        Args:
+            api_key (Optional[str], optional): ElevenLabs API key. Uses ELEVENLABS_API_KEY environment variable if not provided. Defaults to None.
+            model (str): The model to use for the TTS plugin. Defaults to "eleven_flash_v2_5".
+            voice (str): The voice to use for the TTS plugin. Defaults to "EXAVITQu4vr4xnSDxMaL".
+            speed (float): The speed to use for the TTS plugin. Defaults to 1.0.
+            response_format (str): The response format to use for the TTS plugin. Defaults to "pcm_24000".
+            voice_settings (Optional[VoiceSettings], optional): The voice settings to use for the TTS plugin. Defaults to None.
+            base_url (str): The base URL to use for the TTS plugin. Defaults to "https://api.elevenlabs.io/v1".
+            enable_streaming (bool): Whether to enable streaming for the TTS plugin. Defaults to True.
+            inactivity_timeout (int): The inactivity timeout to use for the TTS plugin. Defaults to 300.
+        """
+        super().__init__(
+            sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS
+        )

         self.model = model
         self.voice = voice
@@ -50,16 +69,35 @@ class ElevenLabsTTS(TTS):
         self.base_url = base_url
         self.enable_streaming = enable_streaming
         self.voice_settings = voice_settings or VoiceSettings()
-
+        self.inactivity_timeout = inactivity_timeout
+        self._first_chunk_sent = False
+        self._ws_session = None
+        self._ws_connection = None
         self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
         if not self.api_key:
-            raise ValueError(
+            raise ValueError(
+                "ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")

         self._session = httpx.AsyncClient(
-            timeout=httpx.Timeout(connect=15.0, read=30.0,
+            timeout=httpx.Timeout(connect=15.0, read=30.0,
+                                  write=5.0, pool=5.0),
             follow_redirects=True,
         )

+        self._streams = weakref.WeakSet()
+        self._send_task: asyncio.Task | None = None
+        self._recv_task: asyncio.Task | None = None
+        self._should_stop = False
+
+        self._connection_lock = asyncio.Lock()
+        self._ws_voice_id: str | None = None
+        self._active_contexts: set[str] = set()
+        self._context_futures: dict[str, asyncio.Future[None]] = {}
+
+    def reset_first_audio_tracking(self) -> None:
+        """Reset the first audio tracking state for next TTS task"""
+        self._first_chunk_sent = False
+
     async def synthesize(
         self,
         text: AsyncIterator[str] | str,
@@ -67,23 +105,23 @@ class ElevenLabsTTS(TTS):
         **kwargs: Any,
     ) -> None:
         try:
-            if isinstance(text, AsyncIterator):
-                full_text = ""
-                async for chunk in text:
-                    full_text += chunk
-            else:
-                full_text = text
-
             if not self.audio_track or not self.loop:
                 self.emit("error", "Audio track or event loop not set")
                 return

             target_voice = voice_id or self.voice
+            self._should_stop = False

             if self.enable_streaming:
-                await self._stream_synthesis(
+                await self._stream_synthesis(text, target_voice)
             else:
-
+                if isinstance(text, AsyncIterator):
+                    async for segment in segment_text(text):
+                        if self._should_stop:
+                            break
+                        await self._chunked_synthesis(segment, target_voice)
+                else:
+                    await self._chunked_synthesis(text, target_voice)

         except Exception as e:
             self.emit("error", f"TTS synthesis failed: {str(e)}")
@@ -91,17 +129,17 @@ class ElevenLabsTTS(TTS):
     async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
         """Non-streaming synthesis using the standard API"""
         url = f"{self.base_url}/text-to-speech/{voice_id}/stream"
-
+
         params = {
             "model_id": self.model,
             "output_format": self.response_format,
         }
-
+
         headers = {
             "xi-api-key": self.api_key,
             "Content-Type": "application/json",
         }
-
+
         payload = {
             "text": text,
             "voice_settings": {
@@ -114,83 +152,251 @@ class ElevenLabsTTS(TTS):

         try:
             async with self._session.stream(
-                "POST",
-                url,
-                headers=headers,
+                "POST",
+                url,
+                headers=headers,
                 json=payload,
                 params=params
             ) as response:
                 response.raise_for_status()
-
+
                 async for chunk in response.aiter_bytes():
+                    if self._should_stop:
+                        break
                     if chunk:
-                        self.
-
+                        await self._stream_audio_chunks(chunk)
+
         except httpx.HTTPStatusError as e:
-            self.emit(
+            self.emit(
+                "error", f"HTTP error {e.response.status_code}: {e.response.text}")
         except Exception as e:
             self.emit("error", f"Chunked synthesis failed: {str(e)}")

-    async def _stream_synthesis(self, text: str, voice_id: str) -> None:
-        """WebSocket-based streaming synthesis"""
-        ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
-
-        params = {
-            "model_id": self.model,
-            "output_format": self.response_format,
-        }
-
-        param_string = "&".join([f"{k}={v}" for k, v in params.items()])
-        full_ws_url = f"{ws_url}?{param_string}"
-
-        headers = {"xi-api-key": self.api_key}
-
+    async def _stream_synthesis(self, text: Union[AsyncIterator[str], str], voice_id: str) -> None:
+        """WebSocket-based streaming synthesis using multi-context connection"""
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            eos_message = {"text": ""}
-            await ws.send_str(json.dumps(eos_message))
-
-            async for msg in ws:
-                if msg.type == aiohttp.WSMsgType.TEXT:
-                    data = json.loads(msg.data)
-                    if data.get("audio"):
-                        import base64
-                        audio_chunk = base64.b64decode(data["audio"])
-                        self.loop.create_task(self.audio_track.add_new_bytes(audio_chunk))
-                    elif data.get("isFinal"):
+            await self._ensure_connection(voice_id)
+
+            context_id = uuid.uuid4().hex[:12]
+            done_future: asyncio.Future[None] = asyncio.get_event_loop().create_future()
+            self.register_context(context_id, done_future)
+
+            async def _single_chunk_gen(s: str) -> AsyncIterator[str]:
+                yield s
+
+            async def _send_chunks() -> None:
+                try:
+                    first_message_sent = False
+                    if isinstance(text, str):
+                        async for segment in segment_text(_single_chunk_gen(text)):
+                            if self._should_stop:
                                 break
-
-
+                            await self.send_text(context_id, f"{segment} ",
+                                                 voice_settings=None if first_message_sent else self._voice_settings_dict(),
+                                                 flush=True)
+                            first_message_sent = True
+                    else:
+                        async for chunk in text:
+                            if self._should_stop:
                                 break
-
-
-
-
+                            await self.send_text(context_id, f"{chunk} ",
+                                                 voice_settings=None if first_message_sent else self._voice_settings_dict())
+                            first_message_sent = True
+
+                    if not self._should_stop:
+                        await self.flush_context(context_id)
+                        await self.close_context(context_id)
+                except Exception as e:
+                    if not done_future.done():
+                        done_future.set_exception(e)
+
+            sender = asyncio.create_task(_send_chunks())
+
+            await done_future
+            await sender
+
         except Exception as e:
             self.emit("error", f"Streaming synthesis failed: {str(e)}")

+            if isinstance(text, str):
+                await self._chunked_synthesis(text, voice_id)
+            else:
+                async for segment in segment_text(text):
+                    if self._should_stop:
+                        break
+                    await self._chunked_synthesis(segment, voice_id)
+
+    def _voice_settings_dict(self) -> dict[str, Any]:
+        return {
+            "stability": self.voice_settings.stability,
+            "similarity_boost": self.voice_settings.similarity_boost,
+            "style": self.voice_settings.style,
+            "use_speaker_boost": self.voice_settings.use_speaker_boost,
+        }
+
+    async def _stream_audio_chunks(self, audio_bytes: bytes) -> None:
+        if not audio_bytes or self._should_stop:
+            return
+
+        if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
+            self._first_chunk_sent = True
+            asyncio.create_task(self._first_audio_callback())
+
+        if self.audio_track and self.loop:
+            await self.audio_track.add_new_bytes(audio_bytes)
+
+    async def interrupt(self) -> None:
+        """Simple but effective interruption"""
+        self._should_stop = True
+
+        if self.audio_track:
+            self.audio_track.interrupt()
+
+        await self.close_all_contexts()
+
     async def aclose(self) -> None:
         """Cleanup resources"""
+        self._should_stop = True
+
+        for task in [self._send_task, self._recv_task]:
+            if task and not task.done():
+                task.cancel()
+
+        for stream in list(self._streams):
+            try:
+                await stream.aclose()
+            except Exception:
+                pass
+
+        self._streams.clear()
+
+        if self._ws_connection and not self._ws_connection.closed:
+            try:
+                await self._ws_connection.send_str(json.dumps({"close_socket": True}))
+            except Exception:
+                pass
+            await self._ws_connection.close()
+        if self._ws_session and not self._ws_session.closed:
+            await self._ws_session.close()
+        self._ws_connection = None
+        self._ws_session = None
         if self._session:
             await self._session.aclose()
         await super().aclose()

-    async def
-
-
-
+    async def _ensure_connection(self, voice_id: str) -> None:
+        async with self._connection_lock:
+            if self._ws_connection and not self._ws_connection.closed and self._ws_voice_id == voice_id:
+                return
+
+            if self._ws_connection and not self._ws_connection.closed:
+                try:
+                    await self._ws_connection.send_str(json.dumps({"close_socket": True}))
+                except Exception:
+                    pass
+                await self._ws_connection.close()
+            if self._ws_session and not self._ws_session.closed:
+                await self._ws_session.close()
+
+            self._ws_session = aiohttp.ClientSession()
+            self._ws_voice_id = voice_id
+
+            ws_url = f"{self.base_url}/text-to-speech/{voice_id}/multi-stream-input".replace("https://", "wss://").replace("http://", "ws://")
+            params = {
+                "model_id": self.model,
+                "output_format": self.response_format,
+                "inactivity_timeout": self.inactivity_timeout,
+            }
+            param_string = "&".join([f"{k}={v}" for k, v in params.items()])
+            full_ws_url = f"{ws_url}?{param_string}"
+            headers = {"xi-api-key": self.api_key}
+            self._ws_connection = await asyncio.wait_for(self._ws_session.ws_connect(full_ws_url, headers=headers), timeout=10.0)
+
+            if self._recv_task and not self._recv_task.done():
+                self._recv_task.cancel()
+            self._recv_task = asyncio.create_task(self._recv_loop())
+
+    def register_context(self, context_id: str, done_future: asyncio.Future[None]) -> None:
+        self._context_futures[context_id] = done_future
+
+    async def send_text(
+        self,
+        context_id: str,
+        text: str,
+        *,
+        voice_settings: Optional[dict[str, Any]] = None,
+        flush: bool = False,
+    ) -> None:
+        if not self._ws_connection or self._ws_connection.closed:
+            raise RuntimeError("WebSocket connection is closed")
+
+        if context_id not in self._active_contexts:
+            init_msg = {
+                "context_id": context_id,
+                "text": " ",
+            }
+            if voice_settings:
+                init_msg["voice_settings"] = voice_settings
+            await self._ws_connection.send_str(json.dumps(init_msg))
+            self._active_contexts.add(context_id)
+
+        pkt: dict[str, Any] = {"context_id": context_id, "text": text}
+        if flush:
+            pkt["flush"] = True
+        await self._ws_connection.send_str(json.dumps(pkt))
+
+    async def flush_context(self, context_id: str) -> None:
+        if not self._ws_connection or self._ws_connection.closed:
+            return
+        await self._ws_connection.send_str(json.dumps({"context_id": context_id, "flush": True}))
+
+    async def close_context(self, context_id: str) -> None:
+        if not self._ws_connection or self._ws_connection.closed:
+            return
+        await self._ws_connection.send_str(json.dumps({"context_id": context_id, "close_context": True}))
+
+    async def close_all_contexts(self) -> None:
+        try:
+            for context_id in list(self._active_contexts):
+                await self.close_context(context_id)
+        except Exception:
+            pass
+
+    async def _recv_loop(self) -> None:
+        try:
+            while self._ws_connection and not self._ws_connection.closed:
+                msg = await self._ws_connection.receive()
+                if msg.type == aiohttp.WSMsgType.TEXT:
+                    data = json.loads(msg.data)
+
+                    if data.get("error"):
+                        ctx_id = data.get("contextId")
+                        fut = self._context_futures.get(ctx_id)
+                        if fut and not fut.done():
+                            fut.set_exception(RuntimeError(data["error"]))
+                        continue
+
+                    if data.get("audio"):
+                        audio_chunk = base64.b64decode(data["audio"]) if isinstance(data["audio"], str) else None
+                        if audio_chunk:
+                            if not self._first_chunk_sent and hasattr(self, '_first_audio_callback') and self._first_audio_callback:
+                                self._first_chunk_sent = True
+                                asyncio.create_task(self._first_audio_callback())
+                            if self.audio_track:
+                                await self.audio_track.add_new_bytes(audio_chunk)
+
+                    if data.get("is_final") or data.get("isFinal"):
+                        ctx_id = data.get("contextId")
+                        if ctx_id:
+                            fut = self._context_futures.pop(ctx_id, None)
+                            self._active_contexts.discard(ctx_id)
+                            if fut and not fut.done():
+                                fut.set_result(None)
+
+                elif msg.type in (aiohttp.WSMsgType.CLOSED, aiohttp.WSMsgType.CLOSE, aiohttp.WSMsgType.CLOSING):
+                    break
+        except Exception:
+            for fut in self._context_futures.values():
+                if not fut.done():
+                    fut.set_exception(RuntimeError("WebSocket receive loop error"))
+            self._context_futures.clear()
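The streaming rewrite replaces the old per-request stream-input socket with ElevenLabs' multi-context protocol: one WebSocket per voice on the multi-stream-input endpoint, with each utterance isolated under its own context_id so interrupt() can close contexts without tearing down the connection. Reconstructed from send_text, flush_context, and close_context above, the JSON frames for a single utterance look roughly like the sketch below; the context ID, text, and voice-settings values are illustrative, not defaults taken from the diff.

import json

ctx = "a1b2c3d4e5f6"  # uuid.uuid4().hex[:12], as in _stream_synthesis

frames = [
    # The first send_text() for an unknown context opens it with a
    # single-space "text" message, optionally attaching voice settings.
    {"context_id": ctx, "text": " ",
     "voice_settings": {"stability": 0.5, "similarity_boost": 0.75,
                        "style": 0.0, "use_speaker_boost": True}},
    # Each text segment follows; flush=True makes synthesis start immediately.
    {"context_id": ctx, "text": "Hello there. ", "flush": True},
    # flush_context() pushes any buffered text through.
    {"context_id": ctx, "flush": True},
    # close_context() ends the utterance; _recv_loop resolves the context's
    # future when the matching is_final/isFinal event arrives.
    {"context_id": ctx, "close_context": True},
]

for frame in frames:
    print(json.dumps(frame))  # payloads as _ws_connection.send_str() would carry them

aclose() additionally sends {"close_socket": true} before closing, which is also how _ensure_connection retires a connection when the requested voice changes.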
videosdk/plugins/elevenlabs/version.py
@@ -1 +1 @@
-__version__ = "0.0.3"
+__version__ = "0.0.49"
{videosdk_plugins_elevenlabs-0.0.3.dist-info → videosdk_plugins_elevenlabs-0.0.49.dist-info}/METADATA
@@ -1,8 +1,9 @@
 Metadata-Version: 2.4
 Name: videosdk-plugins-elevenlabs
-Version: 0.0.3
+Version: 0.0.49
 Summary: VideoSDK Agent Framework plugin for ElevenLabs
 Author: videosdk
+License-Expression: Apache-2.0
 Keywords: ai,audio,elevenlabs,video,videosdk
 Classifier: Development Status :: 4 - Beta
 Classifier: Intended Audience :: Developers
@@ -11,12 +12,12 @@ Classifier: Topic :: Multimedia :: Sound/Audio
 Classifier: Topic :: Multimedia :: Video
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Requires-Python: >=3.11
-Requires-Dist: videosdk-agents>=0.0.
+Requires-Dist: videosdk-agents>=0.0.49
 Description-Content-Type: text/markdown

-VideoSDK ElevenLabs Plugin
+# VideoSDK ElevenLabs Plugin

-Agent Framework plugin for
+Agent Framework plugin for TTS services from ElevenLabs.

 ## Installation

videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD
@@ -0,0 +1,7 @@
+videosdk/plugins/elevenlabs/__init__.py,sha256=g33CP7YD-GB32-U5RAkRAtoNNaRG7oVy5iqk-LKz0Aw,139
+videosdk/plugins/elevenlabs/stt.py,sha256=3Vbs_9yYROhNAbBzPEUqzdhrpdO6A6zq7TRvby617rM,12881
+videosdk/plugins/elevenlabs/tts.py,sha256=LWn5AG3lssQ1zxWfJ1GLDFZi1cCGO2FKmxy20gcm3dQ,16033
+videosdk/plugins/elevenlabs/version.py,sha256=LuIJFrM65iX-YC6KaWH9iJWJKBv1GHcHHucNCmnVUqo,23
+videosdk_plugins_elevenlabs-0.0.49.dist-info/METADATA,sha256=E30JnazHE_j1EFaTSVVv2gm5HuOI7_HofN3QAMSqEH8,779
+videosdk_plugins_elevenlabs-0.0.49.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+videosdk_plugins_elevenlabs-0.0.49.dist-info/RECORD,,

videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD
@@ -1,6 +0,0 @@
-videosdk/plugins/elevenlabs/__init__.py,sha256=bb7M4MSOIIb0KxrsRvG1JczJNGjQ3n-LBqKJp671HfU,91
-videosdk/plugins/elevenlabs/tts.py,sha256=l51CgdxHPgoR-Q2Q4FmSzD-Hi_Hz0MDSxbDIc8jwPck,7092
-videosdk/plugins/elevenlabs/version.py,sha256=k5tJXhBQJ4l9fKHJ76K5w98zBHoYvNk9r-UNH6eQ2-k,21
-videosdk_plugins_elevenlabs-0.0.3.dist-info/METADATA,sha256=j-KpXkh45CmJkAjh7L_Yro5-mOwVdw57panNJC9YKHg,745
-videosdk_plugins_elevenlabs-0.0.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-videosdk_plugins_elevenlabs-0.0.3.dist-info/RECORD,,