videosdk-plugins-elevenlabs 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of videosdk-plugins-elevenlabs might be problematic. Click here for more details.
- videosdk_plugins_elevenlabs-0.0.1/.gitignore +10 -0
- videosdk_plugins_elevenlabs-0.0.1/PKG-INFO +25 -0
- videosdk_plugins_elevenlabs-0.0.1/README.md +9 -0
- videosdk_plugins_elevenlabs-0.0.1/pyproject.toml +33 -0
- videosdk_plugins_elevenlabs-0.0.1/videosdk/plugins/elevenlabs/__init__.py +3 -0
- videosdk_plugins_elevenlabs-0.0.1/videosdk/plugins/elevenlabs/tts.py +205 -0
- videosdk_plugins_elevenlabs-0.0.1/videosdk/plugins/elevenlabs/version.py +1 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: videosdk-plugins-elevenlabs
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: VideoSDK Agent Framework plugin for ElevenLabs
|
|
5
|
+
Author: videosdk
|
|
6
|
+
Keywords: ai,audio,elevenlabs,video,videosdk
|
|
7
|
+
Classifier: Development Status :: 4 - Beta
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: Topic :: Communications :: Conferencing
|
|
10
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
|
11
|
+
Classifier: Topic :: Multimedia :: Video
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Requires-Dist: videosdk-agents>=0.0.11
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
VideoSDK ElevenLabs Plugin
|
|
18
|
+
|
|
19
|
+
Agent Framework plugin for TTS services from ElevenLabs.
|
|
20
|
+
|
|
21
|
+
## Installation
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install videosdk-plugins-elevenlabs
|
|
25
|
+
```
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "videosdk-plugins-elevenlabs"
dynamic = ["version"]
description = "VideoSDK Agent Framework plugin for ElevenLabs"
readme = "README.md"
requires-python = ">=3.11"
authors = [{ name = "videosdk" }]
keywords = ["video", "audio", "ai", "videosdk", "elevenlabs"]
# NOTE: the duplicate "Intended Audience :: Developers" classifier was removed;
# the list below matches the PKG-INFO classifier set exactly.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Topic :: Communications :: Conferencing",
    "Topic :: Multimedia :: Sound/Audio",
    "Topic :: Multimedia :: Video",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
    "videosdk-agents>=0.0.11",
]

# Version is read from the package's version.py (see __version__ there).
[tool.hatch.version]
path = "videosdk/plugins/elevenlabs/version.py"

[tool.hatch.build.targets.wheel]
packages = ["videosdk"]

[tool.hatch.build.targets.sdist]
include = ["/videosdk"]
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, AsyncIterator, Literal, Optional, Union
|
|
4
|
+
import os
|
|
5
|
+
import httpx
|
|
6
|
+
import asyncio
|
|
7
|
+
import json
|
|
8
|
+
import aiohttp
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
from videosdk.agents import TTS
|
|
12
|
+
|
|
13
|
+
# PCM characteristics of the audio returned by the API; these match the
# default "pcm_24000" response format requested by ElevenLabsTTS.
ELEVENLABS_SAMPLE_RATE = 24000
ELEVENLABS_CHANNELS = 1

# Default model / voice used when the caller does not override them.
DEFAULT_MODEL = "eleven_flash_v2_5"
DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
API_BASE_URL = "https://api.elevenlabs.io/v1"


@dataclass
class VoiceSettings:
    """Voice-generation tuning parameters sent with each synthesis request.

    Serialized verbatim into the ``voice_settings`` JSON object of both the
    HTTP and WebSocket payloads.
    """

    # NOTE(review): float fields are presumably expected in the 0.0-1.0 range
    # per the ElevenLabs API — confirm against the API reference.
    stability: float = 0.71
    similarity_boost: float = 0.5
    style: float = 0.0
    use_speaker_boost: bool = True
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ElevenLabsTTS(TTS):
    """VideoSDK TTS plugin backed by the ElevenLabs text-to-speech API.

    Two delivery modes are supported:

    * chunked HTTP streaming (``POST /text-to-speech/{voice}/stream``) when
      ``enable_streaming`` is False (the default), and
    * WebSocket streaming (``/text-to-speech/{voice}/stream-input``) when
      ``enable_streaming`` is True.

    Received audio bytes are handed to ``self.audio_track`` via tasks
    scheduled on ``self.loop``; both attributes are expected to be injected
    by the agent framework before ``synthesize()`` is called.
    """

    def __init__(
        self,
        *,
        model: str = DEFAULT_MODEL,
        voice: str = DEFAULT_VOICE_ID,
        speed: float = 1.0,
        api_key: str | None = None,
        response_format: str = "pcm_24000",
        voice_settings: VoiceSettings | None = None,
        base_url: str = API_BASE_URL,
        enable_streaming: bool = False,
    ) -> None:
        """Create the TTS client.

        Args:
            model: ElevenLabs model id.
            voice: Default voice id used when ``synthesize`` gets no override.
            speed: Playback speed multiplier (stored only; not sent to the API
                anywhere in this class).
            api_key: API key; falls back to the ELEVENLABS_API_KEY env var.
            response_format: Output format requested from the API.
            voice_settings: Optional tuning; defaults to ``VoiceSettings()``.
            base_url: REST API base URL.
            enable_streaming: Use the WebSocket streaming endpoint when True.

        Raises:
            ValueError: If no API key is available.
        """
        super().__init__(sample_rate=ELEVENLABS_SAMPLE_RATE, num_channels=ELEVENLABS_CHANNELS)

        self.model = model
        self.voice = voice
        self.speed = speed
        # Injected later by the framework; synthesize() refuses to run without them.
        self.audio_track = None
        self.loop = None
        self.response_format = response_format
        self.base_url = base_url
        self.enable_streaming = enable_streaming
        self.voice_settings = voice_settings or VoiceSettings()

        self.api_key = api_key or os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ElevenLabs API key must be provided either through api_key parameter or ELEVENLABS_API_KEY environment variable")

        # One shared HTTP client for all chunked-synthesis requests.
        self._session = httpx.AsyncClient(
            timeout=httpx.Timeout(connect=15.0, read=30.0, write=5.0, pool=5.0),
            follow_redirects=True,
        )

    def _voice_settings_payload(self) -> dict[str, Any]:
        """Serialize ``self.voice_settings`` into the JSON shape the API expects.

        Shared by both synthesis paths so the two payloads cannot drift apart.
        """
        return {
            "stability": self.voice_settings.stability,
            "similarity_boost": self.voice_settings.similarity_boost,
            "style": self.voice_settings.style,
            "use_speaker_boost": self.voice_settings.use_speaker_boost,
        }

    async def synthesize(
        self,
        text: AsyncIterator[str] | str,
        voice_id: Optional[str] = None,
        **kwargs: Any,
    ) -> None:
        """Convert ``text`` to speech and push the audio to ``self.audio_track``.

        Errors are reported through ``self.emit("error", ...)`` rather than
        raised, matching the framework's event-driven error model.
        """
        try:
            # Accept either a plain string or an async stream of chunks.
            # join() avoids quadratic repeated concatenation and sidesteps the
            # fragile isinstance() check against the typing.AsyncIterator alias.
            if isinstance(text, str):
                full_text = text
            else:
                full_text = "".join([chunk async for chunk in text])

            if not self.audio_track or not self.loop:
                self.emit("error", "Audio track or event loop not set")
                return

            # Use the provided voice_id or fall back to the configured default.
            target_voice = voice_id or self.voice

            if self.enable_streaming:
                await self._stream_synthesis(full_text, target_voice)
            else:
                await self._chunked_synthesis(full_text, target_voice)

        except Exception as e:
            self.emit("error", f"TTS synthesis failed: {str(e)}")

    async def _chunked_synthesis(self, text: str, voice_id: str) -> None:
        """Synthesize ``text`` via the HTTP streaming endpoint."""
        url = f"{self.base_url}/text-to-speech/{voice_id}/stream"

        # model_id / output_format go in the query string for this endpoint.
        params = {
            "model_id": self.model,
            "output_format": self.response_format,
        }
        headers = {
            "xi-api-key": self.api_key,
            "Content-Type": "application/json",
        }
        payload = {
            "text": text,
            "voice_settings": self._voice_settings_payload(),
        }

        try:
            async with self._session.stream(
                "POST",
                url,
                headers=headers,
                json=payload,
                params=params,
            ) as response:
                response.raise_for_status()

                # Forward audio chunks to the track as they arrive.
                async for chunk in response.aiter_bytes():
                    if chunk:
                        self.loop.create_task(self.audio_track.add_new_bytes(chunk))

        except httpx.HTTPStatusError as e:
            # A streamed response body must be read before it can be rendered
            # as text; touching e.response.text directly would raise
            # httpx.ResponseNotRead and mask the real error.
            body = (await e.response.aread()).decode("utf-8", errors="replace")
            self.emit("error", f"HTTP error {e.response.status_code}: {body}")
        except Exception as e:
            self.emit("error", f"Chunked synthesis failed: {str(e)}")

    async def _stream_synthesis(self, text: str, voice_id: str) -> None:
        """Synthesize ``text`` via the WebSocket stream-input endpoint."""
        import base64  # stdlib; local import keeps the module import list unchanged

        ws_url = f"wss://api.elevenlabs.io/v1/text-to-speech/{voice_id}/stream-input"
        params = {
            "model_id": self.model,
            "output_format": self.response_format,
        }
        # Values here are simple identifiers, so plain joining is safe.
        param_string = "&".join(f"{k}={v}" for k, v in params.items())
        full_ws_url = f"{ws_url}?{param_string}"
        headers = {"xi-api-key": self.api_key}

        try:
            async with aiohttp.ClientSession() as session:
                async with session.ws_connect(full_ws_url, headers=headers) as ws:
                    # Initial message: a single space primes the stream and
                    # carries the voice settings.
                    init_message = {
                        "text": " ",
                        "voice_settings": self._voice_settings_payload(),
                    }
                    await ws.send_str(json.dumps(init_message))

                    # Send the text itself (NOTE(review): the trailing space
                    # appears intended to flush generation — confirm against
                    # the ElevenLabs stream-input docs).
                    await ws.send_str(json.dumps({"text": f"{text} "}))

                    # Empty text signals end-of-stream to the server.
                    await ws.send_str(json.dumps({"text": ""}))

                    # Receive base64 audio frames until the server says final.
                    async for msg in ws:
                        if msg.type == aiohttp.WSMsgType.TEXT:
                            data = json.loads(msg.data)
                            if data.get("audio"):
                                audio_chunk = base64.b64decode(data["audio"])
                                self.loop.create_task(self.audio_track.add_new_bytes(audio_chunk))
                            elif data.get("isFinal"):
                                break
                            elif data.get("error"):
                                self.emit("error", f"WebSocket error: {data['error']}")
                                break
                        elif msg.type == aiohttp.WSMsgType.ERROR:
                            self.emit("error", f"WebSocket connection error: {ws.exception()}")
                            break

        except Exception as e:
            self.emit("error", f"Streaming synthesis failed: {str(e)}")

    async def aclose(self) -> None:
        """Close the shared HTTP client, then release base-class resources."""
        if self._session:
            await self._session.aclose()
        await super().aclose()

    async def interrupt(self) -> None:
        """Stop playback of any audio already queued on the track."""
        if self.audio_track:
            self.audio_track.interrupt()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Single source of truth for the package version; hatchling reads this file
# via [tool.hatch.version] in pyproject.toml.
__version__ = "0.0.1"
|