livekit-plugins-elevenlabs 0.4.dev2__tar.gz → 0.5.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/PKG-INFO +2 -2
- livekit_plugins_elevenlabs-0.5.dev0/livekit/plugins/elevenlabs/tts.py +487 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/PKG-INFO +2 -2
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/requires.txt +1 -1
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/setup.py +1 -1
- livekit_plugins_elevenlabs-0.4.dev2/livekit/plugins/elevenlabs/tts.py +0 -392
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/README.md +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/__init__.py +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/log.py +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/models.py +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/py.typed +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/SOURCES.txt +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/dependency_links.txt +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/top_level.txt +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/pyproject.toml +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.4.dev2
|
3
|
+
Version: 0.5.dev0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
22
|
Requires-Dist: livekit~=0.11
|
23
|
-
Requires-Dist: livekit-agents~=0.
|
23
|
+
Requires-Dist: livekit-agents~=0.7.dev0
|
24
24
|
Requires-Dist: aiohttp>=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -0,0 +1,487 @@
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
|
15
|
+
from __future__ import annotations
|
16
|
+
|
17
|
+
import asyncio
|
18
|
+
import base64
|
19
|
+
import contextlib
|
20
|
+
import dataclasses
|
21
|
+
import json
|
22
|
+
import os
|
23
|
+
from dataclasses import dataclass
|
24
|
+
from typing import List, Optional
|
25
|
+
|
26
|
+
import aiohttp
|
27
|
+
from livekit import rtc
|
28
|
+
from livekit.agents import aio, tokenize, tts, utils
|
29
|
+
|
30
|
+
from .log import logger
|
31
|
+
from .models import TTSModels
|
32
|
+
|
33
|
+
|
34
|
+
@dataclass
class VoiceSettings:
    """Per-voice synthesis settings sent to ElevenLabs as `voice_settings`.

    The float fields are annotated in this file as ranging over [0.0 - 1.0].
    """

    # [0.0 - 1.0]
    stability: float
    # [0.0 - 1.0]
    similarity_boost: float
    # [0.0 - 1.0]; None when not specified
    style: float | None = None
    # disabled unless explicitly enabled
    use_speaker_boost: bool | None = False
|
40
|
+
|
41
|
+
|
42
|
+
@dataclass
class Voice:
    """Description of one ElevenLabs voice."""

    # voice identifier; interpolated into the text-to-speech URLs
    id: str
    # voice name as reported by the voices listing
    name: str
    # voice category as reported by the voices listing (e.g. "premade")
    category: str
    # synthesis settings to send with requests; None sends no settings
    settings: VoiceSettings | None = None
|
48
|
+
|
49
|
+
|
50
|
+
# Voice used by TTS() when the caller does not supply one.
DEFAULT_VOICE = Voice(
    id="EXAVITQu4vr4xnSDxMaL",
    name="Bella",
    category="premade",
    settings=VoiceSettings(
        stability=0.71,
        similarity_boost=0.5,
        style=0.0,
        use_speaker_boost=True,
    ),
)

# Base URL used for the voices listing and the text-to-speech endpoints.
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
# Name of the request header that carries the API key.
AUTHORIZATION_HEADER = "xi-api-key"
|
61
|
+
|
62
|
+
|
63
|
+
@dataclass
|
64
|
+
class _TTSOptions:
|
65
|
+
api_key: str
|
66
|
+
voice: Voice
|
67
|
+
model_id: TTSModels
|
68
|
+
base_url: str
|
69
|
+
sample_rate: int
|
70
|
+
streaming_latency: int
|
71
|
+
word_tokenizer: tokenize.WordTokenizer
|
72
|
+
chunk_length_schedule: list[int]
|
73
|
+
|
74
|
+
|
75
|
+
class TTS(tts.TTS):
    """Text-to-speech client for the ElevenLabs API.

    Holds the resolved options and an aiohttp session (obtained lazily when
    none was supplied) and hands both to ChunkedStream (single HTTP request)
    and SynthesizeStream (incremental synthesis over websockets).
    """

    # default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
    # (range is 50-500)
    # Kept as an immutable tuple so the default can never be mutated in place.
    _DEFAULT_CHUNK_LENGTH_SCHEDULE = (80, 120, 200, 260)

    def __init__(
        self,
        *,
        voice: Voice = DEFAULT_VOICE,
        model_id: TTSModels = "eleven_turbo_v2",
        api_key: str | None = None,
        base_url: str | None = None,
        sample_rate: int = 24000,
        streaming_latency: int = 3,
        # NOTE: this default instance is created once, when the class is
        # defined, and is shared by every TTS that does not pass its own.
        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
            ignore_punctuation=False  # punctuation can help for intonation
        ),
        chunk_length_schedule: list[int] | None = None,
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Create a TTS instance.

        Args:
            voice: voice to synthesize with.
            model_id: ElevenLabs model identifier.
            api_key: API key; falls back to the ELEVEN_API_KEY environment
                variable when omitted.
            base_url: API root; defaults to API_BASE_URL_V1.
            sample_rate: PCM sample rate requested from the API.
            streaming_latency: value sent as `optimize_streaming_latency`.
            word_tokenizer: tokenizer used to split streamed text into words.
            chunk_length_schedule: schedule sent in the websocket init packet;
                defaults to [80, 120, 200, 260] when omitted.
            http_session: session to reuse; when omitted one is obtained from
                `utils.http_session()` on first use.

        Raises:
            ValueError: if no API key is passed and ELEVEN_API_KEY is unset.
        """
        super().__init__(
            streaming_supported=True, sample_rate=sample_rate, num_channels=1
        )
        api_key = api_key or os.environ.get("ELEVEN_API_KEY")
        if not api_key:
            raise ValueError("ELEVEN_API_KEY must be set")

        if chunk_length_schedule is None:
            chunk_length_schedule = self._DEFAULT_CHUNK_LENGTH_SCHEDULE

        self._opts = _TTSOptions(
            voice=voice,
            model_id=model_id,
            api_key=api_key,
            base_url=base_url or API_BASE_URL_V1,
            sample_rate=sample_rate,
            streaming_latency=streaming_latency,
            word_tokenizer=word_tokenizer,
            # copy so later mutation of the caller's list (or of another
            # instance's options) cannot change this instance's schedule
            chunk_length_schedule=list(chunk_length_schedule),
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the session in use, obtaining one on first call if needed."""
        if not self._session:
            self._session = utils.http_session()

        return self._session

    async def list_voices(self) -> List[Voice]:
        """Fetch `<base_url>/voices` and convert the response to Voice objects."""
        async with self._ensure_session().get(
            f"{self._opts.base_url}/voices",
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
        ) as resp:
            return _dict_to_voices_list(await resp.json())

    def synthesize(
        self,
        text: str,
    ) -> "ChunkedStream":
        """Return a ChunkedStream that synthesizes `text` in one HTTP request."""
        return ChunkedStream(text, self._opts, self._ensure_session())

    def stream(
        self,
    ) -> "SynthesizeStream":
        """Return a SynthesizeStream that accepts text incrementally."""
        return SynthesizeStream(self._ensure_session(), self._opts)
|
135
|
+
|
136
|
+
|
137
|
+
class ChunkedStream(tts.ChunkedStream):
    """Synthesize using the chunked api endpoint"""

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        """Store the request parameters; the HTTP request starts on first read."""
        self._opts = opts
        self._text = text
        self._session = session
        # created lazily by __anext__ so an unused stream never issues a request
        self._task: asyncio.Task | None = None
        # None is the end-of-stream marker
        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()

    def _synthesize_url(self) -> str:
        """Build the streaming text-to-speech URL for the configured voice."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        sample_rate = self._opts.sample_rate
        latency = self._opts.streaming_latency
        url = (
            f"{base_url}/text-to-speech/{voice_id}/stream?"
            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
        )
        return url

    async def _main_task(self):
        """Run the request; always terminate the queue, even on failure."""
        try:
            await self._run()
        except Exception:
            logger.exception("11labs main task failed in chunked stream")
        finally:
            self._queue.put_nowait(None)

    def _push_frame(self, pcm: bytearray) -> None:
        """Wrap `pcm` (16-bit mono samples) in an audio frame and queue it."""
        self._queue.put_nowait(
            tts.SynthesizedAudio(
                text=self._text,
                data=rtc.AudioFrame(
                    data=pcm,
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    samples_per_channel=len(pcm) // 2,
                ),
            )
        )

    async def _run(self) -> None:
        """POST the text and queue the PCM response as 10ms frames.

        Raises:
            aiohttp.ClientResponseError: if the API answers with an HTTP
                error status (logged by _main_task).
        """
        async with self._session.post(
            self._synthesize_url(),
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
            json=dict(
                text=self._text,
                model_id=self._opts.model_id,
                voice_settings=(
                    dataclasses.asdict(self._opts.voice.settings)
                    if self._opts.voice.settings
                    else None
                ),
            ),
        ) as resp:
            # an error response carries an error document, not PCM; without
            # this check its bytes would be emitted as audio frames
            resp.raise_for_status()

            # avoid very small frames. chunk by 10ms 16bits
            bytes_per_frame = (self._opts.sample_rate // 100) * 2
            buf = bytearray()
            async for data, _ in resp.content.iter_chunks():
                buf.extend(data)

                # emit every complete frame, then drop the consumed prefix
                # once instead of re-copying the remainder per frame
                offset = 0
                while len(buf) - offset >= bytes_per_frame:
                    self._push_frame(buf[offset : offset + bytes_per_frame])
                    offset += bytes_per_frame
                del buf[:offset]

            # send any remaining data
            if buf:
                self._push_frame(buf)

    async def __anext__(self) -> tts.SynthesizedAudio:
        """Return the next audio chunk, starting the request on first call."""
        if not self._task:
            self._task = asyncio.create_task(self._main_task())

        frame = await self._queue.get()
        if frame is None:
            raise StopAsyncIteration

        return frame

    async def aclose(self) -> None:
        """Cancel the request task, if one was started, and wait for it."""
        if not self._task:
            return

        self._task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await self._task
|
236
|
+
|
237
|
+
|
238
|
+
class SynthesizeStream(tts.SynthesizeStream):
    """Streamed API using websockets"""

    @dataclass
    class _SegmentConnection:
        # receiving end of the audio produced for one segment
        audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
        # task running the websocket session for that segment
        task: asyncio.Task

    def __init__(
        self,
        session: aiohttp.ClientSession,
        opts: _TTSOptions,
        max_retry_per_segment: int = 3,
    ):
        """Set up the stream state and start the background task.

        Args:
            session: aiohttp session used for the websocket connections.
            opts: resolved TTS options.
            max_retry_per_segment: connection attempts made per segment.
        """
        self._opts = opts
        self._session = session
        # None is the end-of-stream marker
        self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
        self._closed = False
        self._word_stream = opts.word_tokenizer.stream()
        # started last so every attribute _run reads already exists
        self._main_task = asyncio.create_task(self._run(max_retry_per_segment))

    def _stream_url(self) -> str:
        """Build the websocket (stream-input) URL for the configured voice."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        sample_rate = self._opts.sample_rate
        latency = self._opts.streaming_latency
        url = (
            f"{base_url}/text-to-speech/{voice_id}/stream-input?"
            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
        )

        return url

    def push_text(self, token: str | None) -> None:
        """Feed text to the stream; `None` marks the end of the current segment.

        Raises:
            ValueError: if the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push to a closed stream")

        if token is None:
            self._word_stream.mark_segment_end()
            return

        self._word_stream.push_text(token)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream.

        Args:
            wait: when True, wait for the background task to finish on its
                own; when False, cancel it first.
        """
        self._closed = True
        await self._word_stream.aclose()

        if not wait:
            self._main_task.cancel()

        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self, max_retry_per_segment: int) -> None:
        """Drive the stream: one websocket task per segment, events in order."""
        conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
        # every per-segment task, so they can be aborted on cancellation
        segment_tasks: list[asyncio.Task] = []

        async def _forward_events() -> None:
            """forward events from the ws connections to the event queue.
            This is used to keep the right order."""
            while True:
                c = await conns_q.get()
                if c is None:
                    break  # no more segment, stream closed

                self._event_queue.put_nowait(
                    tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
                )

                async for frame in c.audio_rx:
                    self._event_queue.put_nowait(
                        tts.SynthesisEvent(
                            type=tts.SynthesisEventType.AUDIO, audio=frame
                        )
                    )

                self._event_queue.put_nowait(
                    tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
                )

        async def _read_tokens() -> None:
            """read tokens from the word stream and create connections for each segment,
            (this also allows concurrent connections to 11labs)"""

            token_tx: aio.ChanSender[str] | None = None
            async for ev in self._word_stream:
                if ev.type == tokenize.TokenEventType.STARTED:
                    token_tx, token_rx = aio.channel()
                    audio_tx: aio.ChanSender[tts.SynthesizedAudio]
                    audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
                    audio_tx, audio_rx = aio.channel()
                    task = asyncio.create_task(
                        self._run_ws(max_retry_per_segment, audio_tx, token_rx)
                    )
                    segment_tasks.append(task)
                    conns_q.put_nowait(
                        SynthesizeStream._SegmentConnection(audio_rx, task)
                    )
                elif ev.type == tokenize.TokenEventType.TOKEN:
                    assert token_tx is not None
                    token_tx.send_nowait(ev.token)
                elif ev.type == tokenize.TokenEventType.FINISHED:
                    assert token_tx is not None
                    token_tx.close()
                    token_tx = None

            conns_q.put_nowait(None)

        try:
            await asyncio.gather(_forward_events(), _read_tokens())
        except asyncio.CancelledError:
            # aclose(wait=False): gather only cancels its own children, so the
            # per-segment websocket tasks must be aborted explicitly
            for task in segment_tasks:
                task.cancel()
            raise
        except Exception:
            logger.exception("11labs task failed")
        finally:
            # always terminate the event queue, including on cancellation,
            # otherwise a consumer blocked in __anext__ never wakes up
            self._event_queue.put_nowait(None)

    async def _connect_ws(
        self, max_retry: int
    ) -> aiohttp.ClientWebSocketResponse | None:
        """Open a websocket and send the init packet, retrying on failure.

        Returns:
            The initialized connection, or None when every attempt failed
            (the failure has already been logged).
        """
        for try_i in range(max_retry):
            ws_conn: aiohttp.ClientWebSocketResponse | None = None
            try:
                ws_conn = await self._session.ws_connect(
                    self._stream_url(),
                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
                )

                voice_settings = None
                if self._opts.voice.settings is not None:
                    voice_settings = dataclasses.asdict(self._opts.voice.settings)

                init_pkt = dict(
                    text=" ",
                    try_trigger_generation=True,
                    voice_settings=voice_settings,
                    generation_config=dict(
                        chunk_length_schedule=self._opts.chunk_length_schedule,
                    ),
                )
                await ws_conn.send_str(json.dumps(init_pkt))
                # success: stop here, do not open another connection
                return ws_conn
            except Exception:
                if ws_conn is not None:
                    # connected but the init packet failed; drop this socket
                    # before retrying so it is not leaked
                    with contextlib.suppress(Exception):
                        await ws_conn.close()

                if try_i + 1 == max_retry:
                    logger.exception(
                        f"failed to connect to 11labs after {max_retry} retries"
                    )
                    return None

                retry_delay = min(try_i * 5, 5)  # max 5s
                logger.warning(
                    f"failed to connect to 11labs, retrying in {retry_delay}s"
                )
                await asyncio.sleep(retry_delay)

        return None

    async def _run_ws(
        self,
        max_retry: int,
        audio_tx: aio.ChanSender[tts.SynthesizedAudio],
        token_rx: aio.ChanReceiver[str],
    ) -> None:
        """Synthesize one segment over its own websocket connection.

        Tokens read from `token_rx` are sent to 11labs and decoded audio is
        written to `audio_tx`. `audio_tx` is closed on every exit path so the
        reader of this segment always terminates.
        """
        ws_conn: aiohttp.ClientWebSocketResponse | None = None
        all_tokens_consumed = False

        async def send_task():
            nonlocal all_tokens_consumed

            async for token in token_rx:
                if token == "":
                    continue  # empty token is closing the stream in 11labs protocol

                # try_trigger_generation=True is a bad practice, we expose
                # chunk_length_schedule instead
                data_pkt = dict(
                    text=f"{token} ",  # must always end with a space
                    try_trigger_generation=False,
                )
                await ws_conn.send_str(json.dumps(data_pkt))

            # no more token, mark eos
            flush_pkt = dict(
                text="",
            )
            await ws_conn.send_str(json.dumps(flush_pkt))

            all_tokens_consumed = True

        async def recv_task():
            while True:
                msg = await ws_conn.receive()
                if msg.type in (
                    aiohttp.WSMsgType.CLOSED,
                    aiohttp.WSMsgType.CLOSE,
                    aiohttp.WSMsgType.CLOSING,
                ):
                    if all_tokens_consumed:
                        return  # close is expected

                    raise Exception(
                        "11labs connection closed unexpectedly, not all tokens have been consumed"
                    )

                if msg.type != aiohttp.WSMsgType.TEXT:
                    # audio frames are serialized in base64..
                    logger.warning("unexpected 11labs message type %s", msg.type)
                    continue

                data: dict = json.loads(msg.data)
                if data.get("audio"):
                    b64data = base64.b64decode(data["audio"])

                    frame = rtc.AudioFrame(
                        data=b64data,
                        sample_rate=self._opts.sample_rate,
                        num_channels=1,
                        samples_per_channel=len(b64data) // 2,
                    )

                    text = ""
                    if data.get("alignment"):
                        text = data["alignment"].get("chars", "")

                    audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
                    continue
                elif data.get("isFinal"):
                    return  # last message

                logger.error("unexpected 11labs message %s", data)

        try:
            # try to connect to 11labs
            ws_conn = await self._connect_ws(max_retry)
            if ws_conn is None:
                return  # already logged; the finally below ends the segment

            await asyncio.gather(send_task(), recv_task())
        except Exception:
            logger.exception("11labs ws connection failed")
        finally:
            # end the segment first so the FINISHED event is not delayed by
            # the websocket close handshake
            audio_tx.close()
            if ws_conn is not None:
                await ws_conn.close()

    async def __anext__(self) -> tts.SynthesisEvent:
        """Return the next synthesis event; stop once the stream has ended."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
|
474
|
+
|
475
|
+
|
476
|
+
def _dict_to_voices_list(data: dict) -> List[Voice]:
    """Convert a decoded voices response into a list of Voice objects.

    Reads `data["voices"]` and, for each entry, its "voice_id", "name" and
    "category" keys. The returned voices carry no settings.
    """
    return [
        Voice(
            id=entry["voice_id"],
            name=entry["name"],
            category=entry["category"],
            settings=None,
        )
        for entry in data["voices"]
    ]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.4.dev2
|
3
|
+
Version: 0.5.dev0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
22
|
Requires-Dist: livekit~=0.11
|
23
|
-
Requires-Dist: livekit-agents~=0.
|
23
|
+
Requires-Dist: livekit-agents~=0.7.dev0
|
24
24
|
Requires-Dist: aiohttp>=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -1,392 +0,0 @@
|
|
1
|
-
# Copyright 2023 LiveKit, Inc.
|
2
|
-
#
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
# you may not use this file except in compliance with the License.
|
5
|
-
# You may obtain a copy of the License at
|
6
|
-
#
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
#
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
# See the License for the specific language governing permissions and
|
13
|
-
# limitations under the License.
|
14
|
-
|
15
|
-
import asyncio
|
16
|
-
import base64
|
17
|
-
import contextlib
|
18
|
-
import dataclasses
|
19
|
-
import json
|
20
|
-
import os
|
21
|
-
from dataclasses import dataclass
|
22
|
-
from typing import AsyncIterable, List
|
23
|
-
|
24
|
-
import aiohttp
|
25
|
-
from livekit import rtc
|
26
|
-
from livekit.agents import aio, tts
|
27
|
-
|
28
|
-
from .log import logger
|
29
|
-
from .models import TTSModels
|
30
|
-
|
31
|
-
|
32
|
-
@dataclass
|
33
|
-
class VoiceSettings:
|
34
|
-
stability: float # [0.0 - 1.0]
|
35
|
-
similarity_boost: float # [0.0 - 1.0]
|
36
|
-
style: float | None = None # [0.0 - 1.0]
|
37
|
-
use_speaker_boost: bool | None = False
|
38
|
-
|
39
|
-
|
40
|
-
@dataclass
|
41
|
-
class Voice:
|
42
|
-
id: str
|
43
|
-
name: str
|
44
|
-
category: str
|
45
|
-
settings: VoiceSettings | None = None
|
46
|
-
|
47
|
-
|
48
|
-
DEFAULT_VOICE = Voice(
|
49
|
-
id="EXAVITQu4vr4xnSDxMaL",
|
50
|
-
name="Bella",
|
51
|
-
category="premade",
|
52
|
-
settings=VoiceSettings(
|
53
|
-
stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
|
54
|
-
),
|
55
|
-
)
|
56
|
-
|
57
|
-
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
|
58
|
-
AUTHORIZATION_HEADER = "xi-api-key"
|
59
|
-
|
60
|
-
|
61
|
-
@dataclass
|
62
|
-
class TTSOptions:
|
63
|
-
api_key: str
|
64
|
-
voice: Voice
|
65
|
-
model_id: TTSModels
|
66
|
-
base_url: str
|
67
|
-
sample_rate: int
|
68
|
-
latency: int
|
69
|
-
|
70
|
-
|
71
|
-
class TTS(tts.TTS):
|
72
|
-
def __init__(
|
73
|
-
self,
|
74
|
-
*,
|
75
|
-
voice: Voice = DEFAULT_VOICE,
|
76
|
-
model_id: TTSModels = "eleven_turbo_v2",
|
77
|
-
api_key: str | None = None,
|
78
|
-
base_url: str | None = None,
|
79
|
-
sample_rate: int = 24000,
|
80
|
-
latency: int = 3,
|
81
|
-
) -> None:
|
82
|
-
super().__init__(
|
83
|
-
streaming_supported=True, sample_rate=sample_rate, num_channels=1
|
84
|
-
)
|
85
|
-
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
86
|
-
if not api_key:
|
87
|
-
raise ValueError("ELEVEN_API_KEY must be set")
|
88
|
-
|
89
|
-
self._session = aiohttp.ClientSession()
|
90
|
-
self._opts = TTSOptions(
|
91
|
-
voice=voice,
|
92
|
-
model_id=model_id,
|
93
|
-
api_key=api_key,
|
94
|
-
base_url=base_url or API_BASE_URL_V1,
|
95
|
-
sample_rate=sample_rate,
|
96
|
-
latency=latency,
|
97
|
-
)
|
98
|
-
|
99
|
-
async def list_voices(self) -> List[Voice]:
|
100
|
-
async with self._session.get(
|
101
|
-
f"{self._opts.base_url}/voices",
|
102
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
103
|
-
) as resp:
|
104
|
-
data = await resp.json()
|
105
|
-
return dict_to_voices_list(data)
|
106
|
-
|
107
|
-
def synthesize(
|
108
|
-
self,
|
109
|
-
text: str,
|
110
|
-
) -> AsyncIterable[tts.SynthesizedAudio]:
|
111
|
-
voice = self._opts.voice
|
112
|
-
url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
|
113
|
-
|
114
|
-
async def generator():
|
115
|
-
try:
|
116
|
-
async with self._session.post(
|
117
|
-
url,
|
118
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
119
|
-
json=dict(
|
120
|
-
text=text,
|
121
|
-
model_id=self._opts.model_id,
|
122
|
-
voice_settings=dataclasses.asdict(voice.settings)
|
123
|
-
if voice.settings
|
124
|
-
else None,
|
125
|
-
),
|
126
|
-
) as resp:
|
127
|
-
data = await resp.read()
|
128
|
-
yield tts.SynthesizedAudio(
|
129
|
-
text=text,
|
130
|
-
data=rtc.AudioFrame(
|
131
|
-
data=data,
|
132
|
-
sample_rate=self._opts.sample_rate,
|
133
|
-
num_channels=1,
|
134
|
-
samples_per_channel=len(data) // 2, # 16-bit
|
135
|
-
),
|
136
|
-
)
|
137
|
-
except Exception as e:
|
138
|
-
logger.error(f"failed to synthesize: {e}")
|
139
|
-
|
140
|
-
return generator()
|
141
|
-
|
142
|
-
def stream(
|
143
|
-
self,
|
144
|
-
) -> "SynthesizeStream":
|
145
|
-
return SynthesizeStream(self._session, self._opts)
|
146
|
-
|
147
|
-
|
148
|
-
class SynthesizeStream(tts.SynthesizeStream):
|
149
|
-
_STREAM_EOS = ""
|
150
|
-
|
151
|
-
def __init__(
|
152
|
-
self,
|
153
|
-
session: aiohttp.ClientSession,
|
154
|
-
opts: TTSOptions,
|
155
|
-
max_retry: int = 32,
|
156
|
-
):
|
157
|
-
self._opts = opts
|
158
|
-
self._session = session
|
159
|
-
|
160
|
-
self._queue = asyncio.Queue[str | None]()
|
161
|
-
self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
|
162
|
-
self._closed = False
|
163
|
-
self._text = ""
|
164
|
-
|
165
|
-
self._main_task = asyncio.create_task(self._run(max_retry))
|
166
|
-
|
167
|
-
def _stream_url(self) -> str:
|
168
|
-
base_url = self._opts.base_url
|
169
|
-
voice_id = self._opts.voice.id
|
170
|
-
model_id = self._opts.model_id
|
171
|
-
sample_rate = self._opts.sample_rate
|
172
|
-
latency = self._opts.latency
|
173
|
-
return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
|
174
|
-
|
175
|
-
def push_text(self, token: str | None) -> None:
|
176
|
-
if self._closed:
|
177
|
-
raise ValueError("cannot push to a closed stream")
|
178
|
-
|
179
|
-
if token is None:
|
180
|
-
self._flush_if_needed()
|
181
|
-
return
|
182
|
-
|
183
|
-
if len(token) == 0:
|
184
|
-
# 11labs marks the EOS with an empty string, avoid users from pushing empty strings
|
185
|
-
return
|
186
|
-
|
187
|
-
# TODO: Naive word boundary detection may not be good enough for all languages
|
188
|
-
# fmt: off
|
189
|
-
splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
|
190
|
-
# fmt: on
|
191
|
-
|
192
|
-
self._text += token
|
193
|
-
|
194
|
-
while True:
|
195
|
-
last_split = -1
|
196
|
-
for i, c in enumerate(self._text):
|
197
|
-
if c in splitters:
|
198
|
-
last_split = i
|
199
|
-
break
|
200
|
-
|
201
|
-
if last_split == -1:
|
202
|
-
break
|
203
|
-
|
204
|
-
seg = self._text[: last_split + 1]
|
205
|
-
seg = seg.strip() + " " # 11labs expects a space at the end
|
206
|
-
self._queue.put_nowait(seg)
|
207
|
-
self._text = self._text[last_split + 1 :]
|
208
|
-
|
209
|
-
async def aclose(self, *, wait: bool = True) -> None:
|
210
|
-
self._flush_if_needed()
|
211
|
-
self._queue.put_nowait(None)
|
212
|
-
self._closed = True
|
213
|
-
|
214
|
-
if not wait:
|
215
|
-
self._main_task.cancel()
|
216
|
-
|
217
|
-
with contextlib.suppress(asyncio.CancelledError):
|
218
|
-
await self._main_task
|
219
|
-
|
220
|
-
def _flush_if_needed(self) -> None:
|
221
|
-
seg = self._text.strip()
|
222
|
-
if len(seg) > 0:
|
223
|
-
self._queue.put_nowait(seg + " ")
|
224
|
-
|
225
|
-
self._text = ""
|
226
|
-
self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
|
227
|
-
|
228
|
-
async def _run(self, max_retry: int) -> None:
|
229
|
-
retry_count = 0
|
230
|
-
ws: aiohttp.ClientWebSocketResponse | None = None
|
231
|
-
ws_task: asyncio.Task | None = None
|
232
|
-
data_tx: aio.ChanSender[str] | None = None
|
233
|
-
|
234
|
-
try:
|
235
|
-
while True:
|
236
|
-
ws_connected = ws is not None and not ws.closed
|
237
|
-
try:
|
238
|
-
data = await self._queue.get()
|
239
|
-
|
240
|
-
if data is None:
|
241
|
-
if ws_task is not None:
|
242
|
-
await ws_task
|
243
|
-
break
|
244
|
-
|
245
|
-
if not ws_connected:
|
246
|
-
if data == SynthesizeStream._STREAM_EOS:
|
247
|
-
continue
|
248
|
-
|
249
|
-
with contextlib.suppress(asyncio.CancelledError):
|
250
|
-
if ws_task is not None:
|
251
|
-
await ws_task
|
252
|
-
|
253
|
-
ws = await self._session.ws_connect(
|
254
|
-
self._stream_url(),
|
255
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
256
|
-
)
|
257
|
-
data_tx, data_rx = aio.channel()
|
258
|
-
ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
|
259
|
-
|
260
|
-
assert data_tx is not None
|
261
|
-
assert ws_task is not None
|
262
|
-
assert ws is not None
|
263
|
-
|
264
|
-
data_tx.send_nowait(data)
|
265
|
-
|
266
|
-
except Exception:
|
267
|
-
if retry_count >= max_retry:
|
268
|
-
logger.exception(
|
269
|
-
f"failed to connect to 11labs after {max_retry} retries"
|
270
|
-
)
|
271
|
-
break
|
272
|
-
|
273
|
-
retry_delay = min(retry_count * 5, 5) # max 5s
|
274
|
-
retry_count += 1
|
275
|
-
|
276
|
-
logger.warning(
|
277
|
-
f"failed to connect to 11labs, retrying in {retry_delay}s"
|
278
|
-
)
|
279
|
-
await asyncio.sleep(retry_delay)
|
280
|
-
|
281
|
-
except Exception:
|
282
|
-
logger.exception("11labs task failed")
|
283
|
-
finally:
|
284
|
-
with contextlib.suppress(asyncio.CancelledError):
|
285
|
-
if ws_task is not None:
|
286
|
-
ws_task.cancel()
|
287
|
-
await ws_task
|
288
|
-
|
289
|
-
self._event_queue.put_nowait(None)
|
290
|
-
|
291
|
-
async def _run_ws(
|
292
|
-
self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
|
293
|
-
) -> None:
|
294
|
-
closing_ws = False
|
295
|
-
|
296
|
-
self._event_queue.put_nowait(
|
297
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
|
298
|
-
)
|
299
|
-
|
300
|
-
async def send_task():
|
301
|
-
nonlocal closing_ws
|
302
|
-
|
303
|
-
# 11labs stream must be initialized with a space
|
304
|
-
voice = self._opts.voice
|
305
|
-
voice_settings = (
|
306
|
-
dataclasses.asdict(voice.settings) if voice.settings else None
|
307
|
-
)
|
308
|
-
init_pkt = dict(
|
309
|
-
text=" ",
|
310
|
-
voice_settings=voice_settings,
|
311
|
-
)
|
312
|
-
await ws.send_str(json.dumps(init_pkt))
|
313
|
-
|
314
|
-
while True:
|
315
|
-
data = await data_rx.recv()
|
316
|
-
data_pkt = dict(
|
317
|
-
text=data,
|
318
|
-
try_trigger_generation=True,
|
319
|
-
)
|
320
|
-
if data == SynthesizeStream._STREAM_EOS:
|
321
|
-
closing_ws = True
|
322
|
-
|
323
|
-
await ws.send_str(json.dumps(data_pkt))
|
324
|
-
|
325
|
-
if closing_ws:
|
326
|
-
return
|
327
|
-
|
328
|
-
async def recv_task():
|
329
|
-
nonlocal closing_ws
|
330
|
-
while True:
|
331
|
-
msg = await ws.receive()
|
332
|
-
if msg.type in (
|
333
|
-
aiohttp.WSMsgType.CLOSED,
|
334
|
-
aiohttp.WSMsgType.CLOSE,
|
335
|
-
aiohttp.WSMsgType.CLOSING,
|
336
|
-
):
|
337
|
-
if closing_ws: # close is expected
|
338
|
-
return
|
339
|
-
|
340
|
-
raise Exception("11labs connection closed unexpectedly")
|
341
|
-
|
342
|
-
if msg.type != aiohttp.WSMsgType.TEXT:
|
343
|
-
logger.warning("unexpected 11labs message type %s", msg.type)
|
344
|
-
continue
|
345
|
-
|
346
|
-
data: dict = json.loads(msg.data)
|
347
|
-
if data.get("audio"):
|
348
|
-
b64data = base64.b64decode(data["audio"])
|
349
|
-
frame = rtc.AudioFrame(
|
350
|
-
data=b64data,
|
351
|
-
sample_rate=self._opts.sample_rate,
|
352
|
-
num_channels=1,
|
353
|
-
samples_per_channel=len(b64data) // 2,
|
354
|
-
)
|
355
|
-
self._event_queue.put_nowait(
|
356
|
-
tts.SynthesisEvent(
|
357
|
-
type=tts.SynthesisEventType.AUDIO,
|
358
|
-
audio=tts.SynthesizedAudio(text="", data=frame),
|
359
|
-
)
|
360
|
-
)
|
361
|
-
elif data.get("isFinal"):
|
362
|
-
return
|
363
|
-
|
364
|
-
try:
|
365
|
-
await asyncio.gather(send_task(), recv_task())
|
366
|
-
except Exception:
|
367
|
-
logger.exception("11labs connection failed")
|
368
|
-
finally:
|
369
|
-
self._event_queue.put_nowait(
|
370
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
|
371
|
-
)
|
372
|
-
|
373
|
-
async def __anext__(self) -> tts.SynthesisEvent:
|
374
|
-
evt = await self._event_queue.get()
|
375
|
-
if evt is None:
|
376
|
-
raise StopAsyncIteration
|
377
|
-
|
378
|
-
return evt
|
379
|
-
|
380
|
-
|
381
|
-
def dict_to_voices_list(data: dict) -> List[Voice]:
|
382
|
-
voices = []
|
383
|
-
for voice in data["voices"]:
|
384
|
-
voices.append(
|
385
|
-
Voice(
|
386
|
-
id=voice["voice_id"],
|
387
|
-
name=voice["name"],
|
388
|
-
category=voice["category"],
|
389
|
-
settings=None,
|
390
|
-
)
|
391
|
-
)
|
392
|
-
return voices
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|