livekit-plugins-elevenlabs 0.3.dev0__tar.gz → 0.4.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/PKG-INFO +3 -3
- livekit_plugins_elevenlabs-0.4.dev0/livekit/plugins/elevenlabs/log.py +3 -0
- livekit_plugins_elevenlabs-0.4.dev0/livekit/plugins/elevenlabs/tts.py +392 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/version.py +1 -1
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/PKG-INFO +3 -3
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/SOURCES.txt +1 -0
- livekit_plugins_elevenlabs-0.4.dev0/livekit_plugins_elevenlabs.egg-info/requires.txt +3 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/setup.py +2 -2
- livekit-plugins-elevenlabs-0.3.dev0/livekit/plugins/elevenlabs/tts.py +0 -344
- livekit-plugins-elevenlabs-0.3.dev0/livekit_plugins_elevenlabs.egg-info/requires.txt +0 -3
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/README.md +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/__init__.py +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/models.py +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/py.typed +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/dependency_links.txt +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/top_level.txt +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/pyproject.toml +0 -0
- {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.3.dev0
|
3
|
+
Version: 0.4.dev0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -19,8 +19,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: livekit~=0.
|
23
|
-
Requires-Dist: livekit-agents~=0.
|
22
|
+
Requires-Dist: livekit~=0.11
|
23
|
+
Requires-Dist: livekit-agents~=0.6.dev0
|
24
24
|
Requires-Dist: aiohttp>=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -0,0 +1,392 @@
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
import asyncio
|
16
|
+
import base64
|
17
|
+
import contextlib
|
18
|
+
import dataclasses
|
19
|
+
import json
|
20
|
+
import os
|
21
|
+
from dataclasses import dataclass
|
22
|
+
from typing import AsyncIterable, List
|
23
|
+
|
24
|
+
import aiohttp
|
25
|
+
from livekit import rtc
|
26
|
+
from livekit.agents import aio, tts
|
27
|
+
|
28
|
+
from .log import logger
|
29
|
+
from .models import TTSModels
|
30
|
+
|
31
|
+
|
32
|
+
@dataclass
|
33
|
+
class VoiceSettings:
|
34
|
+
stability: float # [0.0 - 1.0]
|
35
|
+
similarity_boost: float # [0.0 - 1.0]
|
36
|
+
style: float | None = None # [0.0 - 1.0]
|
37
|
+
use_speaker_boost: bool | None = False
|
38
|
+
|
39
|
+
|
40
|
+
@dataclass
|
41
|
+
class Voice:
|
42
|
+
id: str
|
43
|
+
name: str
|
44
|
+
category: str
|
45
|
+
settings: VoiceSettings | None = None
|
46
|
+
|
47
|
+
|
48
|
+
DEFAULT_VOICE = Voice(
|
49
|
+
id="EXAVITQu4vr4xnSDxMaL",
|
50
|
+
name="Bella",
|
51
|
+
category="premade",
|
52
|
+
settings=VoiceSettings(
|
53
|
+
stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
|
54
|
+
),
|
55
|
+
)
|
56
|
+
|
57
|
+
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
|
58
|
+
AUTHORIZATION_HEADER = "xi-api-key"
|
59
|
+
|
60
|
+
|
61
|
+
@dataclass
|
62
|
+
class TTSOptions:
|
63
|
+
api_key: str
|
64
|
+
voice: Voice
|
65
|
+
model_id: TTSModels
|
66
|
+
base_url: str
|
67
|
+
sample_rate: int
|
68
|
+
latency: int
|
69
|
+
|
70
|
+
|
71
|
+
class TTS(tts.TTS):
|
72
|
+
def __init__(
|
73
|
+
self,
|
74
|
+
*,
|
75
|
+
voice: Voice = DEFAULT_VOICE,
|
76
|
+
model_id: TTSModels = "eleven_turbo_v2",
|
77
|
+
api_key: str | None = None,
|
78
|
+
base_url: str | None = None,
|
79
|
+
sample_rate: int = 24000,
|
80
|
+
latency: int = 3,
|
81
|
+
) -> None:
|
82
|
+
super().__init__(
|
83
|
+
streaming_supported=True, sample_rate=sample_rate, num_channels=1
|
84
|
+
)
|
85
|
+
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
86
|
+
if not api_key:
|
87
|
+
raise ValueError("ELEVEN_API_KEY must be set")
|
88
|
+
|
89
|
+
self._session = aiohttp.ClientSession()
|
90
|
+
self._opts = TTSOptions(
|
91
|
+
voice=voice,
|
92
|
+
model_id=model_id,
|
93
|
+
api_key=api_key,
|
94
|
+
base_url=base_url or API_BASE_URL_V1,
|
95
|
+
sample_rate=sample_rate,
|
96
|
+
latency=latency,
|
97
|
+
)
|
98
|
+
|
99
|
+
async def list_voices(self) -> List[Voice]:
|
100
|
+
async with self._session.get(
|
101
|
+
f"{self._opts.base_url}/voices",
|
102
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
103
|
+
) as resp:
|
104
|
+
data = await resp.json()
|
105
|
+
return dict_to_voices_list(data)
|
106
|
+
|
107
|
+
def synthesize(
|
108
|
+
self,
|
109
|
+
text: str,
|
110
|
+
) -> AsyncIterable[tts.SynthesizedAudio]:
|
111
|
+
voice = self._opts.voice
|
112
|
+
url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
|
113
|
+
|
114
|
+
async def generator():
|
115
|
+
try:
|
116
|
+
async with self._session.post(
|
117
|
+
url,
|
118
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
119
|
+
json=dict(
|
120
|
+
text=text,
|
121
|
+
model_id=self._opts.model_id,
|
122
|
+
voice_settings=dataclasses.asdict(voice.settings)
|
123
|
+
if voice.settings
|
124
|
+
else None,
|
125
|
+
),
|
126
|
+
) as resp:
|
127
|
+
data = await resp.read()
|
128
|
+
yield tts.SynthesizedAudio(
|
129
|
+
text=text,
|
130
|
+
data=rtc.AudioFrame(
|
131
|
+
data=data,
|
132
|
+
sample_rate=self._opts.sample_rate,
|
133
|
+
num_channels=1,
|
134
|
+
samples_per_channel=len(data) // 2, # 16-bit
|
135
|
+
),
|
136
|
+
)
|
137
|
+
except Exception as e:
|
138
|
+
logger.error(f"failed to synthesize: {e}")
|
139
|
+
|
140
|
+
return generator()
|
141
|
+
|
142
|
+
def stream(
|
143
|
+
self,
|
144
|
+
) -> "SynthesizeStream":
|
145
|
+
return SynthesizeStream(self._session, self._opts)
|
146
|
+
|
147
|
+
|
148
|
+
class SynthesizeStream(tts.SynthesizeStream):
|
149
|
+
_STREAM_EOS = ""
|
150
|
+
|
151
|
+
def __init__(
|
152
|
+
self,
|
153
|
+
session: aiohttp.ClientSession,
|
154
|
+
opts: TTSOptions,
|
155
|
+
max_retry: int = 32,
|
156
|
+
):
|
157
|
+
self._opts = opts
|
158
|
+
self._session = session
|
159
|
+
|
160
|
+
self._queue = asyncio.Queue[str | None]()
|
161
|
+
self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
|
162
|
+
self._closed = False
|
163
|
+
self._text = ""
|
164
|
+
|
165
|
+
self._main_task = asyncio.create_task(self._run(max_retry))
|
166
|
+
|
167
|
+
def _stream_url(self) -> str:
|
168
|
+
base_url = self._opts.base_url
|
169
|
+
voice_id = self._opts.voice.id
|
170
|
+
model_id = self._opts.model_id
|
171
|
+
sample_rate = self._opts.sample_rate
|
172
|
+
latency = self._opts.latency
|
173
|
+
return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
|
174
|
+
|
175
|
+
def push_text(self, token: str | None) -> None:
|
176
|
+
if self._closed:
|
177
|
+
raise ValueError("cannot push to a closed stream")
|
178
|
+
|
179
|
+
if token is None:
|
180
|
+
self._flush_if_needed()
|
181
|
+
return
|
182
|
+
|
183
|
+
if len(token) == 0:
|
184
|
+
# 11labs marks the EOS with an empty string, avoid users from pushing empty strings
|
185
|
+
return
|
186
|
+
|
187
|
+
# TODO: Naive word boundary detection may not be good enough for all languages
|
188
|
+
# fmt: off
|
189
|
+
splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
|
190
|
+
# fmt: on
|
191
|
+
|
192
|
+
self._text += token
|
193
|
+
|
194
|
+
while True:
|
195
|
+
last_split = -1
|
196
|
+
for i, c in enumerate(self._text):
|
197
|
+
if c in splitters:
|
198
|
+
last_split = i
|
199
|
+
break
|
200
|
+
|
201
|
+
if last_split == -1:
|
202
|
+
break
|
203
|
+
|
204
|
+
seg = self._text[: last_split + 1]
|
205
|
+
seg = seg.strip() + " " # 11labs expects a space at the end
|
206
|
+
self._queue.put_nowait(seg)
|
207
|
+
self._text = self._text[last_split + 1 :]
|
208
|
+
|
209
|
+
async def aclose(self, *, wait: bool = True) -> None:
|
210
|
+
self._flush_if_needed()
|
211
|
+
self._queue.put_nowait(None)
|
212
|
+
self._closed = True
|
213
|
+
|
214
|
+
if not wait:
|
215
|
+
self._main_task.cancel()
|
216
|
+
|
217
|
+
with contextlib.suppress(asyncio.CancelledError):
|
218
|
+
await self._main_task
|
219
|
+
|
220
|
+
def _flush_if_needed(self) -> None:
|
221
|
+
seg = self._text.strip()
|
222
|
+
if len(seg) > 0:
|
223
|
+
self._queue.put_nowait(seg + " ")
|
224
|
+
|
225
|
+
self._text = ""
|
226
|
+
self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
|
227
|
+
|
228
|
+
async def _run(self, max_retry: int) -> None:
|
229
|
+
retry_count = 0
|
230
|
+
ws: aiohttp.ClientWebSocketResponse | None = None
|
231
|
+
ws_task: asyncio.Task | None = None
|
232
|
+
data_tx: aio.ChanSender[str] | None = None
|
233
|
+
|
234
|
+
try:
|
235
|
+
while True:
|
236
|
+
ws_connected = ws is not None and not ws.closed
|
237
|
+
try:
|
238
|
+
data = await self._queue.get()
|
239
|
+
|
240
|
+
if data is None:
|
241
|
+
if ws_task is not None:
|
242
|
+
await ws_task
|
243
|
+
break
|
244
|
+
|
245
|
+
if not ws_connected:
|
246
|
+
if data == SynthesizeStream._STREAM_EOS:
|
247
|
+
continue
|
248
|
+
|
249
|
+
with contextlib.suppress(asyncio.CancelledError):
|
250
|
+
if ws_task is not None:
|
251
|
+
await ws_task
|
252
|
+
|
253
|
+
ws = await self._session.ws_connect(
|
254
|
+
self._stream_url(),
|
255
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
256
|
+
)
|
257
|
+
data_tx, data_rx = aio.channel()
|
258
|
+
ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
|
259
|
+
|
260
|
+
assert data_tx is not None
|
261
|
+
assert ws_task is not None
|
262
|
+
assert ws is not None
|
263
|
+
|
264
|
+
data_tx.send_nowait(data)
|
265
|
+
|
266
|
+
except Exception:
|
267
|
+
if retry_count >= max_retry:
|
268
|
+
logger.exception(
|
269
|
+
f"failed to connect to 11labs after {max_retry} retries"
|
270
|
+
)
|
271
|
+
break
|
272
|
+
|
273
|
+
retry_delay = min(retry_count * 5, 5) # max 5s
|
274
|
+
retry_count += 1
|
275
|
+
|
276
|
+
logger.warning(
|
277
|
+
f"failed to connect to 11labs, retrying in {retry_delay}s"
|
278
|
+
)
|
279
|
+
await asyncio.sleep(retry_delay)
|
280
|
+
|
281
|
+
except Exception:
|
282
|
+
logger.exception("11labs task failed")
|
283
|
+
finally:
|
284
|
+
with contextlib.suppress(asyncio.CancelledError):
|
285
|
+
if ws_task is not None:
|
286
|
+
ws_task.cancel()
|
287
|
+
await ws_task
|
288
|
+
|
289
|
+
self._event_queue.put_nowait(None)
|
290
|
+
|
291
|
+
async def _run_ws(
|
292
|
+
self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
|
293
|
+
) -> None:
|
294
|
+
closing_ws = False
|
295
|
+
|
296
|
+
self._event_queue.put_nowait(
|
297
|
+
tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
|
298
|
+
)
|
299
|
+
|
300
|
+
async def send_task():
|
301
|
+
nonlocal closing_ws
|
302
|
+
|
303
|
+
# 11labs stream must be initialized with a space
|
304
|
+
voice = self._opts.voice
|
305
|
+
voice_settings = (
|
306
|
+
dataclasses.asdict(voice.settings) if voice.settings else None
|
307
|
+
)
|
308
|
+
init_pkt = dict(
|
309
|
+
text=" ",
|
310
|
+
voice_settings=voice_settings,
|
311
|
+
)
|
312
|
+
await ws.send_str(json.dumps(init_pkt))
|
313
|
+
|
314
|
+
while True:
|
315
|
+
data = await data_rx.recv()
|
316
|
+
data_pkt = dict(
|
317
|
+
text=data,
|
318
|
+
try_trigger_generation=False,
|
319
|
+
)
|
320
|
+
if data == SynthesizeStream._STREAM_EOS:
|
321
|
+
closing_ws = True
|
322
|
+
|
323
|
+
await ws.send_str(json.dumps(data_pkt))
|
324
|
+
|
325
|
+
if closing_ws:
|
326
|
+
return
|
327
|
+
|
328
|
+
async def recv_task():
|
329
|
+
nonlocal closing_ws
|
330
|
+
while True:
|
331
|
+
msg = await ws.receive()
|
332
|
+
if msg.type in (
|
333
|
+
aiohttp.WSMsgType.CLOSED,
|
334
|
+
aiohttp.WSMsgType.CLOSE,
|
335
|
+
aiohttp.WSMsgType.CLOSING,
|
336
|
+
):
|
337
|
+
if closing_ws: # close is expected
|
338
|
+
return
|
339
|
+
|
340
|
+
raise Exception("11labs connection closed unexpectedly")
|
341
|
+
|
342
|
+
if msg.type != aiohttp.WSMsgType.TEXT:
|
343
|
+
logger.warning("unexpected 11labs message type %s", msg.type)
|
344
|
+
continue
|
345
|
+
|
346
|
+
data: dict = json.loads(msg.data)
|
347
|
+
if data.get("audio"):
|
348
|
+
b64data = base64.b64decode(data["audio"])
|
349
|
+
frame = rtc.AudioFrame(
|
350
|
+
data=b64data,
|
351
|
+
sample_rate=self._opts.sample_rate,
|
352
|
+
num_channels=1,
|
353
|
+
samples_per_channel=len(data) // 2,
|
354
|
+
)
|
355
|
+
self._event_queue.put_nowait(
|
356
|
+
tts.SynthesisEvent(
|
357
|
+
type=tts.SynthesisEventType.AUDIO,
|
358
|
+
audio=tts.SynthesizedAudio(text="", data=frame),
|
359
|
+
)
|
360
|
+
)
|
361
|
+
elif data.get("isFinal"):
|
362
|
+
return
|
363
|
+
|
364
|
+
try:
|
365
|
+
await asyncio.gather(send_task(), recv_task())
|
366
|
+
except Exception:
|
367
|
+
logger.exception("11labs connection failed")
|
368
|
+
finally:
|
369
|
+
self._event_queue.put_nowait(
|
370
|
+
tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
|
371
|
+
)
|
372
|
+
|
373
|
+
async def __anext__(self) -> tts.SynthesisEvent:
|
374
|
+
evt = await self._event_queue.get()
|
375
|
+
if evt is None:
|
376
|
+
raise StopAsyncIteration
|
377
|
+
|
378
|
+
return evt
|
379
|
+
|
380
|
+
|
381
|
+
def dict_to_voices_list(data: dict) -> List[Voice]:
|
382
|
+
voices = []
|
383
|
+
for voice in data["voices"]:
|
384
|
+
voices.append(
|
385
|
+
Voice(
|
386
|
+
id=voice["voice_id"],
|
387
|
+
name=voice["name"],
|
388
|
+
category=voice["category"],
|
389
|
+
settings=None,
|
390
|
+
)
|
391
|
+
)
|
392
|
+
return voices
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.3.dev0
|
3
|
+
Version: 0.4.dev0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -19,8 +19,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: livekit~=0.
|
23
|
-
Requires-Dist: livekit-agents~=0.
|
22
|
+
Requires-Dist: livekit~=0.11
|
23
|
+
Requires-Dist: livekit-agents~=0.6.dev0
|
24
24
|
Requires-Dist: aiohttp>=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -50,8 +50,8 @@ setuptools.setup(
|
|
50
50
|
packages=setuptools.find_namespace_packages(include=["livekit.*"]),
|
51
51
|
python_requires=">=3.9.0",
|
52
52
|
install_requires=[
|
53
|
-
"livekit ~= 0.
|
54
|
-
"livekit-agents~=0.
|
53
|
+
"livekit ~= 0.11",
|
54
|
+
"livekit-agents~=0.6.dev0",
|
55
55
|
"aiohttp >= 3.8.5",
|
56
56
|
],
|
57
57
|
package_data={
|
@@ -1,344 +0,0 @@
|
|
1
|
-
# Copyright 2023 LiveKit, Inc.
|
2
|
-
#
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
# you may not use this file except in compliance with the License.
|
5
|
-
# You may obtain a copy of the License at
|
6
|
-
#
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
#
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
# See the License for the specific language governing permissions and
|
13
|
-
# limitations under the License.
|
14
|
-
|
15
|
-
import asyncio
|
16
|
-
import base64
|
17
|
-
import contextlib
|
18
|
-
import dataclasses
|
19
|
-
import json
|
20
|
-
import logging
|
21
|
-
import os
|
22
|
-
from dataclasses import dataclass
|
23
|
-
from typing import Any, AsyncIterable, Dict, List, Optional
|
24
|
-
|
25
|
-
import aiohttp
|
26
|
-
from livekit import rtc
|
27
|
-
from livekit.agents import tts
|
28
|
-
|
29
|
-
from .models import TTSModels
|
30
|
-
|
31
|
-
|
32
|
-
@dataclass
|
33
|
-
class Voice:
|
34
|
-
id: str
|
35
|
-
name: str
|
36
|
-
category: str
|
37
|
-
settings: Optional["VoiceSettings"] = None
|
38
|
-
|
39
|
-
|
40
|
-
@dataclass
|
41
|
-
class VoiceSettings:
|
42
|
-
stability: float # [0.0 - 1.0]
|
43
|
-
similarity_boost: float # [0.0 - 1.0]
|
44
|
-
style: Optional[float] = None # [0.0 - 1.0]
|
45
|
-
use_speaker_boost: Optional[bool] = False
|
46
|
-
|
47
|
-
|
48
|
-
DEFAULT_VOICE = Voice(
|
49
|
-
id="EXAVITQu4vr4xnSDxMaL",
|
50
|
-
name="Bella",
|
51
|
-
category="premade",
|
52
|
-
settings=VoiceSettings(
|
53
|
-
stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
|
54
|
-
),
|
55
|
-
)
|
56
|
-
|
57
|
-
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
|
58
|
-
AUTHORIZATION_HEADER = "xi-api-key"
|
59
|
-
STREAM_EOS = ""
|
60
|
-
|
61
|
-
|
62
|
-
@dataclass
|
63
|
-
class TTSOptions:
|
64
|
-
api_key: str
|
65
|
-
voice: Voice
|
66
|
-
model_id: TTSModels
|
67
|
-
base_url: str
|
68
|
-
sample_rate: int
|
69
|
-
latency: int
|
70
|
-
|
71
|
-
|
72
|
-
class TTS(tts.TTS):
|
73
|
-
def __init__(
|
74
|
-
self,
|
75
|
-
*,
|
76
|
-
voice: Voice = DEFAULT_VOICE,
|
77
|
-
model_id: TTSModels = "eleven_multilingual_v2",
|
78
|
-
api_key: Optional[str] = None,
|
79
|
-
base_url: Optional[str] = None,
|
80
|
-
sample_rate: int = 24000,
|
81
|
-
latency: int = 2,
|
82
|
-
) -> None:
|
83
|
-
super().__init__(streaming_supported=True)
|
84
|
-
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
85
|
-
if not api_key:
|
86
|
-
raise ValueError("ELEVEN_API_KEY must be set")
|
87
|
-
|
88
|
-
self._session = aiohttp.ClientSession()
|
89
|
-
self._config = TTSOptions(
|
90
|
-
voice=voice,
|
91
|
-
model_id=model_id,
|
92
|
-
api_key=api_key,
|
93
|
-
base_url=base_url or API_BASE_URL_V1,
|
94
|
-
sample_rate=sample_rate,
|
95
|
-
latency=latency,
|
96
|
-
)
|
97
|
-
|
98
|
-
async def list_voices(self) -> List[Voice]:
|
99
|
-
async with self._session.get(
|
100
|
-
f"{self._config.base_url}/voices",
|
101
|
-
headers={AUTHORIZATION_HEADER: self._config.api_key},
|
102
|
-
) as resp:
|
103
|
-
data = await resp.json()
|
104
|
-
return dict_to_voices_list(data)
|
105
|
-
|
106
|
-
def synthesize(
|
107
|
-
self,
|
108
|
-
text: str,
|
109
|
-
) -> AsyncIterable[tts.SynthesizedAudio]:
|
110
|
-
voice = self._config.voice
|
111
|
-
|
112
|
-
async def generator():
|
113
|
-
async with self._session.post(
|
114
|
-
f"{self._config.base_url}/text-to-speech/{voice.id}?output_format=pcm_44100",
|
115
|
-
headers={AUTHORIZATION_HEADER: self._config.api_key},
|
116
|
-
json=dict(
|
117
|
-
text=text,
|
118
|
-
model_id=self._config.model_id,
|
119
|
-
voice_settings=dataclasses.asdict(voice.settings)
|
120
|
-
if voice.settings
|
121
|
-
else None,
|
122
|
-
),
|
123
|
-
) as resp:
|
124
|
-
data = await resp.read()
|
125
|
-
yield tts.SynthesizedAudio(
|
126
|
-
text=text,
|
127
|
-
data=rtc.AudioFrame(
|
128
|
-
data=data,
|
129
|
-
sample_rate=44100,
|
130
|
-
num_channels=1,
|
131
|
-
samples_per_channel=len(data) // 2, # 16-bit
|
132
|
-
),
|
133
|
-
)
|
134
|
-
|
135
|
-
return generator()
|
136
|
-
|
137
|
-
def stream(
|
138
|
-
self,
|
139
|
-
) -> "SynthesizeStream":
|
140
|
-
return SynthesizeStream(self._session, self._config)
|
141
|
-
|
142
|
-
|
143
|
-
class SynthesizeStream(tts.SynthesizeStream):
|
144
|
-
def __init__(
|
145
|
-
self,
|
146
|
-
session: aiohttp.ClientSession,
|
147
|
-
config: TTSOptions,
|
148
|
-
):
|
149
|
-
self._config = config
|
150
|
-
self._session = session
|
151
|
-
|
152
|
-
self._queue = asyncio.Queue[str]()
|
153
|
-
self._event_queue = asyncio.Queue[tts.SynthesisEvent]()
|
154
|
-
self._closed = False
|
155
|
-
|
156
|
-
self._main_task = asyncio.create_task(self._run(max_retry=32))
|
157
|
-
|
158
|
-
def log_exception(task: asyncio.Task) -> None:
|
159
|
-
if not task.cancelled() and task.exception():
|
160
|
-
logging.error(f"elevenlabs synthesis task failed: {task.exception()}")
|
161
|
-
|
162
|
-
self._main_task.add_done_callback(log_exception)
|
163
|
-
self._text = ""
|
164
|
-
|
165
|
-
def _stream_url(self) -> str:
|
166
|
-
base_url = self._config.base_url
|
167
|
-
voice_id = self._config.voice.id
|
168
|
-
model_id = self._config.model_id
|
169
|
-
return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{self._config.sample_rate}&optimize_streaming_latency={self._config.latency}"
|
170
|
-
|
171
|
-
def push_text(self, token: str | None) -> None:
|
172
|
-
if self._closed:
|
173
|
-
raise ValueError("cannot push to a closed stream")
|
174
|
-
|
175
|
-
if not token or len(token) == 0:
|
176
|
-
return
|
177
|
-
|
178
|
-
# TODO: Native word boundary detection may not be good enough for all languages
|
179
|
-
# fmt: off
|
180
|
-
splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
|
181
|
-
# fmt: on
|
182
|
-
|
183
|
-
self._text += token
|
184
|
-
if token[-1] in splitters:
|
185
|
-
self._queue.put_nowait(self._text)
|
186
|
-
self._text = ""
|
187
|
-
|
188
|
-
async def _run(self, max_retry: int) -> None:
|
189
|
-
retry_count = 0
|
190
|
-
listen_task: Optional[asyncio.Task] = None
|
191
|
-
ws: Optional[aiohttp.ClientWebSocketResponse] = None
|
192
|
-
retry_text_queue: asyncio.Queue[str] = asyncio.Queue()
|
193
|
-
while True:
|
194
|
-
try:
|
195
|
-
ws = await self._try_connect()
|
196
|
-
retry_count = 0 # reset retry count
|
197
|
-
|
198
|
-
listen_task = asyncio.create_task(self._listen_task(ws))
|
199
|
-
|
200
|
-
# forward queued text to 11labs
|
201
|
-
started = False
|
202
|
-
while not ws.closed:
|
203
|
-
text = None
|
204
|
-
if not retry_text_queue.empty():
|
205
|
-
text = await retry_text_queue.get()
|
206
|
-
retry_text_queue.task_done()
|
207
|
-
else:
|
208
|
-
text = await self._queue.get()
|
209
|
-
|
210
|
-
if not started:
|
211
|
-
self._event_queue.put_nowait(
|
212
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
|
213
|
-
)
|
214
|
-
started = True
|
215
|
-
text_packet = dict(
|
216
|
-
text=text,
|
217
|
-
try_trigger_generation=True,
|
218
|
-
)
|
219
|
-
|
220
|
-
# This case can happen in normal operation because 11labs will not
|
221
|
-
# keep connections open indefinitely if we are not sending data.
|
222
|
-
try:
|
223
|
-
await ws.send_str(json.dumps(text_packet))
|
224
|
-
except Exception:
|
225
|
-
await retry_text_queue.put(text)
|
226
|
-
break
|
227
|
-
|
228
|
-
# We call self._queue.task_done() even if we are retrying the text because
|
229
|
-
# all text has gone through self._queue. An exception may have short-circuited
|
230
|
-
# out of the loop so task_done() will not have already been called on text that
|
231
|
-
# is being retried.
|
232
|
-
self._queue.task_done()
|
233
|
-
if text == STREAM_EOS:
|
234
|
-
await listen_task
|
235
|
-
# We know 11labs is closing the stream after each request/flush
|
236
|
-
self._event_queue.put_nowait(
|
237
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
|
238
|
-
)
|
239
|
-
break
|
240
|
-
|
241
|
-
except asyncio.CancelledError:
|
242
|
-
if ws:
|
243
|
-
await ws.close()
|
244
|
-
if listen_task:
|
245
|
-
await asyncio.shield(listen_task)
|
246
|
-
break
|
247
|
-
except Exception as e:
|
248
|
-
if retry_count > max_retry and max_retry > 0:
|
249
|
-
logging.error(f"failed to connect to ElevenLabs: {e}")
|
250
|
-
break
|
251
|
-
|
252
|
-
retry_delay = min(retry_count * 5, 5) # max 5s
|
253
|
-
retry_count += 1
|
254
|
-
logging.warning(
|
255
|
-
f"failed to connect to ElevenLabs: {e} - retrying in {retry_delay}s"
|
256
|
-
)
|
257
|
-
await asyncio.sleep(retry_delay)
|
258
|
-
|
259
|
-
self._closed = True
|
260
|
-
|
261
|
-
async def _try_connect(self) -> aiohttp.ClientWebSocketResponse:
|
262
|
-
ws = await self._session.ws_connect(
|
263
|
-
self._stream_url(),
|
264
|
-
headers={AUTHORIZATION_HEADER: self._config.api_key},
|
265
|
-
)
|
266
|
-
|
267
|
-
voice = self._config.voice
|
268
|
-
voice_settings = dataclasses.asdict(voice.settings) if voice.settings else None
|
269
|
-
|
270
|
-
init_packet = dict(
|
271
|
-
text=" ",
|
272
|
-
voice_settings=voice_settings,
|
273
|
-
)
|
274
|
-
await ws.send_str(json.dumps(init_packet))
|
275
|
-
return ws
|
276
|
-
|
277
|
-
async def _listen_task(self, ws: aiohttp.ClientWebSocketResponse) -> None:
|
278
|
-
while True:
|
279
|
-
msg = await ws.receive()
|
280
|
-
|
281
|
-
if msg.type in (
|
282
|
-
aiohttp.WSMsgType.CLOSED,
|
283
|
-
aiohttp.WSMsgType.CLOSE,
|
284
|
-
aiohttp.WSMsgType.CLOSING,
|
285
|
-
):
|
286
|
-
break
|
287
|
-
|
288
|
-
if msg.type != aiohttp.WSMsgType.TEXT:
|
289
|
-
continue
|
290
|
-
|
291
|
-
jsonMessage: Dict[str, Any] = json.loads(str(msg.data))
|
292
|
-
if jsonMessage.get("audio"):
|
293
|
-
data = base64.b64decode(jsonMessage["audio"])
|
294
|
-
audio_frame = rtc.AudioFrame(
|
295
|
-
data=data,
|
296
|
-
sample_rate=self._config.sample_rate,
|
297
|
-
num_channels=1,
|
298
|
-
samples_per_channel=len(data) // 2,
|
299
|
-
)
|
300
|
-
self._event_queue.put_nowait(
|
301
|
-
tts.SynthesisEvent(
|
302
|
-
type=tts.SynthesisEventType.AUDIO,
|
303
|
-
audio=tts.SynthesizedAudio(text="", data=audio_frame),
|
304
|
-
)
|
305
|
-
)
|
306
|
-
elif jsonMessage.get("isFinal"):
|
307
|
-
break
|
308
|
-
else:
|
309
|
-
logging.error(f"Unhandled message from ElevenLabs: {msg}")
|
310
|
-
|
311
|
-
async def flush(self) -> None:
|
312
|
-
self._queue.put_nowait(self._text + " ")
|
313
|
-
self._text = ""
|
314
|
-
self._queue.put_nowait(STREAM_EOS)
|
315
|
-
await self._queue.join()
|
316
|
-
|
317
|
-
async def aclose(self, wait=False) -> None:
|
318
|
-
if wait:
|
319
|
-
logging.warning(
|
320
|
-
"wait=True is not yet supported for ElevenLabs TTS. Closing immediately."
|
321
|
-
)
|
322
|
-
self._main_task.cancel()
|
323
|
-
with contextlib.suppress(asyncio.CancelledError):
|
324
|
-
await self._main_task
|
325
|
-
|
326
|
-
async def __anext__(self) -> tts.SynthesisEvent:
|
327
|
-
if self._closed and self._event_queue.empty():
|
328
|
-
raise StopAsyncIteration
|
329
|
-
|
330
|
-
return await self._event_queue.get()
|
331
|
-
|
332
|
-
|
333
|
-
def dict_to_voices_list(data: dict) -> List[Voice]:
|
334
|
-
voices = []
|
335
|
-
for voice in data["voices"]:
|
336
|
-
voices.append(
|
337
|
-
Voice(
|
338
|
-
id=voice["voice_id"],
|
339
|
-
name=voice["name"],
|
340
|
-
category=voice["category"],
|
341
|
-
settings=None,
|
342
|
-
)
|
343
|
-
)
|
344
|
-
return voices
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|