videosdk-plugins-elevenlabs 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.

@@ -0,0 +1,10 @@
1
+ myenv/
2
+ venv/
3
+ env/
4
+ __pycache__
5
+
6
+ .env
7
+ .env.local
8
+ test_env/
9
+ dist/
10
+ .DS_Store
@@ -0,0 +1,25 @@
1
+ Metadata-Version: 2.4
2
+ Name: videosdk-plugins-elevenlabs
3
+ Version: 0.0.1
4
+ Summary: VideoSDK Agent Framework plugin for ElevenLabs
5
+ Author: videosdk
6
+ Keywords: ai,audio,elevenlabs,video,videosdk
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: Topic :: Communications :: Conferencing
10
+ Classifier: Topic :: Multimedia :: Sound/Audio
11
+ Classifier: Topic :: Multimedia :: Video
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Requires-Python: >=3.11
14
+ Requires-Dist: videosdk-agents>=0.0.11
15
+ Description-Content-Type: text/markdown
16
+
17
+ VideoSDK ElevenLabs Plugin
18
+
19
+ Agent Framework plugin for TTS services from ElevenLabs.
20
+
21
+ ## Installation
22
+
23
+ ```bash
24
+ pip install videosdk-plugins-elevenlabs
25
+ ```
@@ -0,0 +1,9 @@
1
+ VideoSDK ElevenLabs Plugin
2
+
3
+ Agent Framework plugin for TTS services from ElevenLabs.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install videosdk-plugins-elevenlabs
9
+ ```
@@ -0,0 +1,33 @@
1
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "videosdk-plugins-elevenlabs"
dynamic = ["version"]
description = "VideoSDK Agent Framework plugin for ElevenLabs"
readme = "README.md"
requires-python = ">=3.11"
authors = [{ name = "videosdk" }]
keywords = ["video", "audio", "ai", "videosdk", "elevenlabs"]
# NOTE: the classifier list previously contained
# "Intended Audience :: Developers" twice; the duplicate was removed.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Communications :: Conferencing",
    "Topic :: Multimedia :: Sound/Audio",
    "Topic :: Multimedia :: Video",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    "videosdk-agents>=0.0.11",
]

[tool.hatch.version]
path = "videosdk/plugins/elevenlabs/version.py"

[tool.hatch.build.targets.wheel]
packages = ["videosdk"]

[tool.hatch.build.targets.sdist]
include = ["/videosdk"]
@@ -0,0 +1,3 @@
1
+ from .tts import ElevenLabsTTS, VoiceSettings
2
+
3
+ __all__ = ["ElevenLabsTTS", "VoiceSettings"]
@@ -0,0 +1,205 @@
1
from __future__ import annotations

import asyncio
import base64
import json
import os
from dataclasses import dataclass
from typing import Any, AsyncIterator, Literal, Optional, Union
from urllib.parse import urlencode

import aiohttp
import httpx

from videosdk.agents import TTS
12
+
13
# Audio format delivered to the agent pipeline: 24 kHz, mono PCM.
ELEVENLABS_SAMPLE_RATE = 24000
ELEVENLABS_CHANNELS = 1

# Low-latency flash model; default voice id from the ElevenLabs voice library.
DEFAULT_MODEL = "eleven_flash_v2_5"
DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"

# REST API root; can be overridden per-instance via the base_url parameter.
API_BASE_URL = "https://api.elevenlabs.io/v1"
19
+
20
+
21
@dataclass
class VoiceSettings:
    """Voice tuning parameters forwarded verbatim in the API's
    ``voice_settings`` payload (both REST and WebSocket paths)."""

    # Consistency of delivery; higher is steadier, less expressive.
    stability: float = 0.71
    # How closely output should match the reference voice.
    similarity_boost: float = 0.5
    # Style exaggeration; 0.0 leaves it off.
    style: float = 0.0
    # Whether to apply the speaker-boost post-processing flag.
    use_speaker_boost: bool = True
27
+
28
+
29
class ElevenLabsTTS(TTS):
    """Text-to-speech for the VideoSDK Agent Framework backed by ElevenLabs.

    Two delivery modes are supported:

    * chunked HTTP streaming (default) via ``POST /text-to-speech/{voice}/stream``
    * WebSocket streaming via ``/text-to-speech/{voice}/stream-input`` when
      ``enable_streaming=True``

    Decoded audio bytes are handed to ``self.audio_track.add_new_bytes`` as
    tasks on ``self.loop``; both attributes are expected to be attached by
    the agent pipeline before :meth:`synthesize` is called.
    """

    def __init__(
        self,
        *,
        model: str = DEFAULT_MODEL,
        voice: str = DEFAULT_VOICE_ID,
        speed: float = 1.0,
        api_key: str | None = None,
        response_format: str = "pcm_24000",
        voice_settings: VoiceSettings | None = None,
        base_url: str = API_BASE_URL,
        enable_streaming: bool = False,
    ) -> None:
        """Create the TTS client.

        Args:
            model: ElevenLabs model id.
            voice: Default voice id, used when ``synthesize`` gets none.
            speed: Stored on the instance; not currently sent to the API.
            api_key: ElevenLabs key; falls back to the ``ELEVENLABS_API_KEY``
                environment variable.
            response_format: Output format passed as ``output_format``.
            voice_settings: Optional :class:`VoiceSettings`; defaults otherwise.
            base_url: REST base URL (override for proxies / testing).
            enable_streaming: Use the WebSocket streaming endpoint instead of
                chunked HTTP.

        Raises:
            ValueError: If no API key is provided or found in the environment.
        """
        super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)

        self.model = model
        self.voice = voice
        self.speed = speed
        # Attached later by the agent pipeline; synthesize() refuses to run
        # until both are set.
        self.audio_track = None
        self.loop = None
        self.response_format = response_format
        self.base_url = base_url
        self.enable_streaming = enable_streaming
        self.voice_settings = voice_settings or VoiceSettings()

        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")

        # Shared HTTP client for the chunked (non-WebSocket) path.
        self._session = httpx.AsyncClient(
            timeout=httpx.Timeout(connect=15.0, read=30.0, write=5.0, pool=5.0),
            follow_redirects=True,
        )

    def _voice_settings_payload(self) -> dict[str, Any]:
        """Serialize the configured VoiceSettings for an API payload.

        Shared by the chunked and WebSocket paths so the two cannot drift.
        """
        return {
            "stability": self.voice_settings.stability,
            "similarity_boost": self.voice_settings.similarity_boost,
            "style": self.voice_settings.style,
            "use_speaker_boost": self.voice_settings.use_speaker_boost,
        }

    async def synthesize(
        self,
        text: AsyncIterator[str] | str,
        voice_id: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Convert *text* to speech and push the audio onto the audio track.

        Accepts a plain string or an async iterator of string chunks, which
        is drained fully before synthesis starts. Failures are reported via
        the ``error`` event rather than raised to the caller.
        """
        try:
            # Test for str explicitly instead of isinstance(text, AsyncIterator):
            # that also accepts any async iterable and avoids relying on the
            # typing alias for runtime checks. Chunks are joined once rather
            # than accumulated with quadratic +=.
            if isinstance(text, str):
                full_text = text
            else:
                full_text = "".join([chunk async for chunk in text])

            if not self.audio_track or not self.loop:
                self.emit("error", "Audio track or event loop not set")
                return

            # Use the provided voice_id or fall back to the instance default.
            target_voice = voice_id or self.voice

            if self.enable_streaming:
                await self._stream_synthesis(full_text, target_voice)
            else:
                await self._chunked_synthesis(full_text, target_voice)

        except Exception as e:
            self.emit("error", f"TTS synthesis failed: {str(e)}")

    async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
        """Non-streaming synthesis using the standard HTTP streaming endpoint."""
        url = f"{self.base_url}/text-to-speech/{voice_id}/stream"

        # model/output format travel as query parameters on this endpoint.
        params = {
            "model_id": self.model,
            "output_format": self.response_format,
        }

        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }

        payload = {
            "text": text,
            "voice_settings": self._voice_settings_payload(),
        }

        try:
            async with self._session.stream(
                "POST",
                url,
                headers=headers,
                json=payload,
                params=params,
            ) as response:
                response.raise_for_status()

                # Forward each audio chunk to the track as it arrives.
                async for chunk in response.aiter_bytes():
                    if chunk:
                        self.loop.create_task(self.audio_track.add_new_bytes(chunk))

        except httpx.HTTPStatusError as e:
            self.emit("error", f"HTTP error {e.response.status_code}: {e.response.text}")
        except Exception as e:
            self.emit("error", f"Chunked synthesis failed: {str(e)}")

    async def _stream_synthesis(self, text: str, voice_id: str) -> None:
        """WebSocket-based streaming synthesis."""
        # NOTE(review): this host is hard-coded and ignores self.base_url —
        # confirm whether the streaming endpoint should honor base_url too.
        ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
        # urlencode() so model ids / formats with reserved characters cannot
        # corrupt the query string.
        full_ws_url = f"{ws_url}?" + urlencode(
            {"model_id": self.model, "output_format": self.response_format}
        )

        headers = {"xi-api-key": self.api_key}

        try:
            async with aiohttp.ClientSession() as session:
                async with session.ws_connect(full_ws_url, headers=headers) as ws:
                    # Initial message: a single space primes the stream and
                    # carries the voice settings.
                    await ws.send_str(json.dumps({
                        "text": " ",
                        "voice_settings": self._voice_settings_payload(),
                    }))

                    # Text payload (trailing space flushes generation), then
                    # the empty-string end-of-stream marker.
                    await ws.send_str(json.dumps({"text": f"{text} "}))
                    await ws.send_str(json.dumps({"text": ""}))

                    # Receive base64-encoded audio until the server signals
                    # completion or an error.
                    async for msg in ws:
                        if msg.type == aiohttp.WSMsgType.TEXT:
                            data = json.loads(msg.data)
                            if data.get("audio"):
                                # base64 import hoisted to module level
                                # (was re-imported inside this loop).
                                audio_chunk = base64.b64decode(data["audio"])
                                self.loop.create_task(self.audio_track.add_new_bytes(audio_chunk))
                            elif data.get("isFinal"):
                                break
                            elif data.get("error"):
                                self.emit("error", f"WebSocket error: {data['error']}")
                                break
                        elif msg.type == aiohttp.WSMsgType.ERROR:
                            self.emit("error", f"WebSocket connection error: {ws.exception()}")
                            break

        except Exception as e:
            self.emit("error", f"Streaming synthesis failed: {str(e)}")

    async def aclose(self) -> None:
        """Close the HTTP client and release base-class resources."""
        if self._session:
            await self._session.aclose()
        await super().aclose()

    async def interrupt(self) -> None:
        """Interrupt in-progress playback on the audio track, if attached."""
        if self.audio_track:
            self.audio_track.interrupt()
@@ -0,0 +1 @@
1
+ __version__ = "0.0.1"