livekit-plugins-elevenlabs 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/models.py +9 -1
- livekit/plugins/elevenlabs/tts.py +146 -154
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.8.0.dist-info → livekit_plugins_elevenlabs-0.8.2.dist-info}/METADATA +2 -2
- livekit_plugins_elevenlabs-0.8.2.dist-info/RECORD +10 -0
- {livekit_plugins_elevenlabs-0.8.0.dist-info → livekit_plugins_elevenlabs-0.8.2.dist-info}/WHEEL +1 -1
- livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.8.0.dist-info → livekit_plugins_elevenlabs-0.8.2.dist-info}/top_level.txt +0 -0
@@ -10,4 +10,12 @@ TTSModels = Literal[
|
|
10
10
|
"eleven_flash_v2",
|
11
11
|
]
|
12
12
|
|
13
|
-
TTSEncoding = Literal[
|
13
|
+
TTSEncoding = Literal[
|
14
|
+
"mp3_22050_32",
|
15
|
+
"mp3_44100",
|
16
|
+
"mp3_44100_32",
|
17
|
+
"mp3_44100_64",
|
18
|
+
"mp3_44100_96",
|
19
|
+
"mp3_44100_128",
|
20
|
+
"mp3_44100_192",
|
21
|
+
]
|
@@ -37,7 +37,9 @@ from livekit.agents import (
|
|
37
37
|
from .log import logger
|
38
38
|
from .models import TTSEncoding, TTSModels
|
39
39
|
|
40
|
-
|
40
|
+
# by default, use 22.05kHz sample rate at 32kbps
|
41
|
+
# in our testing, reduce TTFB by about ~110ms
|
42
|
+
_DefaultEncoding: TTSEncoding = "mp3_22050_32"
|
41
43
|
|
42
44
|
|
43
45
|
def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
@@ -102,6 +104,7 @@ class TTS(tts.TTS):
|
|
102
104
|
*,
|
103
105
|
voice: Voice = DEFAULT_VOICE,
|
104
106
|
model: TTSModels | str = "eleven_flash_v2_5",
|
107
|
+
encoding: TTSEncoding | None = None,
|
105
108
|
api_key: str | None = None,
|
106
109
|
base_url: str | None = None,
|
107
110
|
streaming_latency: int = 0,
|
@@ -131,11 +134,14 @@ class TTS(tts.TTS):
|
|
131
134
|
language (str | None): Language code for the TTS model, as of 10/24/24 only valid for "eleven_turbo_v2_5". Optional.
|
132
135
|
"""
|
133
136
|
|
137
|
+
if not encoding:
|
138
|
+
encoding = _DefaultEncoding
|
139
|
+
|
134
140
|
super().__init__(
|
135
141
|
capabilities=tts.TTSCapabilities(
|
136
142
|
streaming=True,
|
137
143
|
),
|
138
|
-
sample_rate=_sample_rate_from_format(
|
144
|
+
sample_rate=_sample_rate_from_format(encoding),
|
139
145
|
num_channels=1,
|
140
146
|
)
|
141
147
|
|
@@ -161,7 +167,7 @@ class TTS(tts.TTS):
|
|
161
167
|
model=model,
|
162
168
|
api_key=api_key,
|
163
169
|
base_url=base_url or API_BASE_URL_V1,
|
164
|
-
encoding=
|
170
|
+
encoding=encoding,
|
165
171
|
sample_rate=self.sample_rate,
|
166
172
|
streaming_latency=streaming_latency,
|
167
173
|
word_tokenizer=word_tokenizer,
|
@@ -171,36 +177,14 @@ class TTS(tts.TTS):
|
|
171
177
|
inactivity_timeout=inactivity_timeout,
|
172
178
|
)
|
173
179
|
self._session = http_session
|
174
|
-
self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
|
175
|
-
connect_cb=self._connect_ws,
|
176
|
-
close_cb=self._close_ws,
|
177
|
-
max_session_duration=inactivity_timeout,
|
178
|
-
mark_refreshed_on_get=True,
|
179
|
-
)
|
180
180
|
self._streams = weakref.WeakSet[SynthesizeStream]()
|
181
181
|
|
182
|
-
async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
|
183
|
-
session = self._ensure_session()
|
184
|
-
return await asyncio.wait_for(
|
185
|
-
session.ws_connect(
|
186
|
-
_stream_url(self._opts),
|
187
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
188
|
-
),
|
189
|
-
self._conn_options.timeout,
|
190
|
-
)
|
191
|
-
|
192
|
-
async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
|
193
|
-
await ws.close()
|
194
|
-
|
195
182
|
def _ensure_session(self) -> aiohttp.ClientSession:
|
196
183
|
if not self._session:
|
197
184
|
self._session = utils.http_context.http_session()
|
198
185
|
|
199
186
|
return self._session
|
200
187
|
|
201
|
-
def prewarm(self) -> None:
|
202
|
-
self._pool.prewarm()
|
203
|
-
|
204
188
|
async def list_voices(self) -> List[Voice]:
|
205
189
|
async with self._ensure_session().get(
|
206
190
|
f"{self._opts.base_url}/voices",
|
@@ -242,7 +226,12 @@ class TTS(tts.TTS):
|
|
242
226
|
def stream(
|
243
227
|
self, *, conn_options: Optional[APIConnectOptions] = None
|
244
228
|
) -> "SynthesizeStream":
|
245
|
-
stream = SynthesizeStream(
|
229
|
+
stream = SynthesizeStream(
|
230
|
+
tts=self,
|
231
|
+
conn_options=conn_options,
|
232
|
+
opts=self._opts,
|
233
|
+
session=self._ensure_session(),
|
234
|
+
)
|
246
235
|
self._streams.add(stream)
|
247
236
|
return stream
|
248
237
|
|
@@ -250,7 +239,6 @@ class TTS(tts.TTS):
|
|
250
239
|
for stream in list(self._streams):
|
251
240
|
await stream.aclose()
|
252
241
|
self._streams.clear()
|
253
|
-
await self._pool.aclose()
|
254
242
|
await super().aclose()
|
255
243
|
|
256
244
|
|
@@ -338,11 +326,12 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
338
326
|
self,
|
339
327
|
*,
|
340
328
|
tts: TTS,
|
341
|
-
|
329
|
+
session: aiohttp.ClientSession,
|
342
330
|
opts: _TTSOptions,
|
331
|
+
conn_options: Optional[APIConnectOptions] = None,
|
343
332
|
):
|
344
|
-
super().__init__(tts=tts)
|
345
|
-
self._opts, self.
|
333
|
+
super().__init__(tts=tts, conn_options=conn_options)
|
334
|
+
self._opts, self._session = opts, session
|
346
335
|
|
347
336
|
async def _run(self) -> None:
|
348
337
|
request_id = utils.shortuuid()
|
@@ -397,140 +386,143 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
397
386
|
word_stream: tokenize.WordStream,
|
398
387
|
request_id: str,
|
399
388
|
) -> None:
|
400
|
-
|
401
|
-
|
402
|
-
|
389
|
+
ws_conn = await self._session.ws_connect(
|
390
|
+
_stream_url(self._opts),
|
391
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
392
|
+
)
|
403
393
|
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
394
|
+
segment_id = utils.shortuuid()
|
395
|
+
decoder = utils.codecs.AudioStreamDecoder(
|
396
|
+
sample_rate=self._opts.sample_rate,
|
397
|
+
num_channels=1,
|
398
|
+
)
|
408
399
|
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
400
|
+
# 11labs protocol expects the first message to be an "init msg"
|
401
|
+
init_pkt = dict(
|
402
|
+
text=" ",
|
403
|
+
voice_settings=_strip_nones(dataclasses.asdict(self._opts.voice.settings))
|
404
|
+
if self._opts.voice.settings
|
405
|
+
else None,
|
406
|
+
generation_config=dict(
|
407
|
+
chunk_length_schedule=self._opts.chunk_length_schedule
|
408
|
+
),
|
409
|
+
)
|
410
|
+
await ws_conn.send_str(json.dumps(init_pkt))
|
411
|
+
eos_sent = False
|
412
|
+
|
413
|
+
@utils.log_exceptions(logger=logger)
|
414
|
+
async def send_task():
|
415
|
+
nonlocal eos_sent
|
416
|
+
xml_content = []
|
417
|
+
async for data in word_stream:
|
418
|
+
text = data.token
|
419
|
+
# send the xml phoneme in one go
|
420
|
+
if (
|
421
|
+
self._opts.enable_ssml_parsing
|
422
|
+
and data.token.startswith("<phoneme")
|
423
|
+
or xml_content
|
424
|
+
):
|
425
|
+
xml_content.append(text)
|
426
|
+
if data.token.find("</phoneme>") > -1:
|
427
|
+
text = self._opts.word_tokenizer.format_words(xml_content)
|
428
|
+
xml_content = []
|
429
|
+
else:
|
430
|
+
continue
|
431
|
+
|
432
|
+
data_pkt = dict(text=f"{text} ") # must always end with a space
|
433
|
+
self._mark_started()
|
434
|
+
await ws_conn.send_str(json.dumps(data_pkt))
|
435
|
+
if xml_content:
|
436
|
+
logger.warning("11labs stream ended with incomplete xml content")
|
437
|
+
|
438
|
+
# no more token, mark eos
|
439
|
+
eos_pkt = dict(text="")
|
440
|
+
await ws_conn.send_str(json.dumps(eos_pkt))
|
441
|
+
eos_sent = True
|
442
|
+
|
443
|
+
# consumes from decoder and generates events
|
444
|
+
@utils.log_exceptions(logger=logger)
|
445
|
+
async def generate_task():
|
446
|
+
emitter = tts.SynthesizedAudioEmitter(
|
447
|
+
event_ch=self._event_ch,
|
448
|
+
request_id=request_id,
|
449
|
+
segment_id=segment_id,
|
420
450
|
)
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
async def send_task():
|
425
|
-
nonlocal expected_text
|
426
|
-
xml_content = []
|
427
|
-
async for data in word_stream:
|
428
|
-
text = data.token
|
429
|
-
expected_text += text
|
430
|
-
# send the xml phoneme in one go
|
431
|
-
if (
|
432
|
-
self._opts.enable_ssml_parsing
|
433
|
-
and data.token.startswith("<phoneme")
|
434
|
-
or xml_content
|
435
|
-
):
|
436
|
-
xml_content.append(text)
|
437
|
-
if text.find("</phoneme>") > -1:
|
438
|
-
text = self._opts.word_tokenizer.format_words(xml_content)
|
439
|
-
xml_content = []
|
440
|
-
else:
|
441
|
-
continue
|
442
|
-
|
443
|
-
data_pkt = dict(text=f"{text} ") # must always end with a space
|
444
|
-
self._mark_started()
|
445
|
-
await ws_conn.send_str(json.dumps(data_pkt))
|
446
|
-
if xml_content:
|
447
|
-
logger.warning("11labs stream ended with incomplete xml content")
|
448
|
-
await ws_conn.send_str(json.dumps({"flush": True}))
|
449
|
-
|
450
|
-
# consumes from decoder and generates events
|
451
|
-
@utils.log_exceptions(logger=logger)
|
452
|
-
async def generate_task():
|
453
|
-
emitter = tts.SynthesizedAudioEmitter(
|
454
|
-
event_ch=self._event_ch,
|
455
|
-
request_id=request_id,
|
456
|
-
segment_id=segment_id,
|
457
|
-
)
|
458
|
-
async for frame in decoder:
|
459
|
-
emitter.push(frame)
|
460
|
-
emitter.flush()
|
451
|
+
async for frame in decoder:
|
452
|
+
emitter.push(frame)
|
453
|
+
emitter.flush()
|
461
454
|
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
455
|
+
# receives from ws and decodes audio
|
456
|
+
@utils.log_exceptions(logger=logger)
|
457
|
+
async def recv_task():
|
458
|
+
nonlocal eos_sent
|
459
|
+
|
460
|
+
while True:
|
461
|
+
msg = await ws_conn.receive()
|
462
|
+
if msg.type in (
|
463
|
+
aiohttp.WSMsgType.CLOSED,
|
464
|
+
aiohttp.WSMsgType.CLOSE,
|
465
|
+
aiohttp.WSMsgType.CLOSING,
|
466
|
+
):
|
467
|
+
if not eos_sent:
|
475
468
|
raise APIStatusError(
|
476
469
|
"11labs connection closed unexpectedly, not all tokens have been consumed",
|
477
470
|
request_id=request_id,
|
478
471
|
)
|
472
|
+
return
|
479
473
|
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
status_code=500,
|
507
|
-
request_id=request_id,
|
508
|
-
body=None,
|
509
|
-
)
|
474
|
+
if msg.type != aiohttp.WSMsgType.TEXT:
|
475
|
+
logger.warning("unexpected 11labs message type %s", msg.type)
|
476
|
+
continue
|
477
|
+
|
478
|
+
data = json.loads(msg.data)
|
479
|
+
if data.get("audio"):
|
480
|
+
b64data = base64.b64decode(data["audio"])
|
481
|
+
decoder.push(b64data)
|
482
|
+
|
483
|
+
elif data.get("isFinal"):
|
484
|
+
decoder.end_input()
|
485
|
+
break
|
486
|
+
elif data.get("error"):
|
487
|
+
raise APIStatusError(
|
488
|
+
message=data["error"],
|
489
|
+
status_code=500,
|
490
|
+
request_id=request_id,
|
491
|
+
body=None,
|
492
|
+
)
|
493
|
+
else:
|
494
|
+
raise APIStatusError(
|
495
|
+
message=f"unexpected 11labs message {data}",
|
496
|
+
status_code=500,
|
497
|
+
request_id=request_id,
|
498
|
+
body=None,
|
499
|
+
)
|
510
500
|
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
501
|
+
tasks = [
|
502
|
+
asyncio.create_task(send_task()),
|
503
|
+
asyncio.create_task(recv_task()),
|
504
|
+
asyncio.create_task(generate_task()),
|
505
|
+
]
|
506
|
+
try:
|
507
|
+
await asyncio.gather(*tasks)
|
508
|
+
except asyncio.TimeoutError as e:
|
509
|
+
raise APITimeoutError() from e
|
510
|
+
except aiohttp.ClientResponseError as e:
|
511
|
+
raise APIStatusError(
|
512
|
+
message=e.message,
|
513
|
+
status_code=e.status,
|
514
|
+
request_id=request_id,
|
515
|
+
body=None,
|
516
|
+
) from e
|
517
|
+
except APIStatusError:
|
518
|
+
raise
|
519
|
+
except Exception as e:
|
520
|
+
raise APIConnectionError() from e
|
521
|
+
finally:
|
522
|
+
await utils.aio.gracefully_cancel(*tasks)
|
523
|
+
await decoder.aclose()
|
524
|
+
if ws_conn is not None:
|
525
|
+
await ws_conn.close()
|
534
526
|
|
535
527
|
|
536
528
|
def _dict_to_voices_list(data: dict[str, Any]):
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=p_wHEz15bdsNEqwzN831ysm70PNWQ-xeN__BKvGPZxA,401
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=ipXJdSYMKBd8mzTL3JfvYdRc2sJJRASOPPh2Ppy8NBk,19529
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=qwktN8wnyHMjA3ewh43aDIBBwMd3jorNpCaoGTqBDrw,600
|
7
|
+
livekit_plugins_elevenlabs-0.8.2.dist-info/METADATA,sha256=mOzHe4OynY-A7OK1hi1OK4eXTnMRvKGG0CYjW_kXz0s,1529
|
8
|
+
livekit_plugins_elevenlabs-0.8.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
9
|
+
livekit_plugins_elevenlabs-0.8.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_elevenlabs-0.8.2.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
|
7
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
|
8
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
9
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,
|
File without changes
|