livekit-plugins-elevenlabs 0.7.12__py3-none-any.whl → 0.7.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/tts.py +175 -159
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.7.12.dist-info → livekit_plugins_elevenlabs-0.7.14.dist-info}/METADATA +1 -1
- livekit_plugins_elevenlabs-0.7.14.dist-info/RECORD +10 -0
- {livekit_plugins_elevenlabs-0.7.12.dist-info → livekit_plugins_elevenlabs-0.7.14.dist-info}/WHEEL +1 -1
- livekit_plugins_elevenlabs-0.7.12.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.7.12.dist-info → livekit_plugins_elevenlabs-0.7.14.dist-info}/top_level.txt +0 -0
@@ -19,13 +19,13 @@ import base64
|
|
19
19
|
import dataclasses
|
20
20
|
import json
|
21
21
|
import os
|
22
|
+
import weakref
|
22
23
|
from dataclasses import dataclass
|
23
|
-
from typing import Any, List, Literal
|
24
|
+
from typing import Any, List, Literal, Optional
|
24
25
|
|
25
26
|
import aiohttp
|
26
27
|
from livekit import rtc
|
27
28
|
from livekit.agents import (
|
28
|
-
DEFAULT_API_CONNECT_OPTIONS,
|
29
29
|
APIConnectionError,
|
30
30
|
APIConnectOptions,
|
31
31
|
APIStatusError,
|
@@ -170,6 +170,24 @@ class TTS(tts.TTS):
|
|
170
170
|
language=language,
|
171
171
|
)
|
172
172
|
self._session = http_session
|
173
|
+
self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
|
174
|
+
connect_cb=self._connect_ws,
|
175
|
+
close_cb=self._close_ws,
|
176
|
+
)
|
177
|
+
self._streams = weakref.WeakSet[SynthesizeStream]()
|
178
|
+
|
179
|
+
async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
|
180
|
+
session = self._ensure_session()
|
181
|
+
return await asyncio.wait_for(
|
182
|
+
session.ws_connect(
|
183
|
+
_stream_url(self._opts),
|
184
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
185
|
+
),
|
186
|
+
self._conn_options.timeout,
|
187
|
+
)
|
188
|
+
|
189
|
+
async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
|
190
|
+
await ws.close()
|
173
191
|
|
174
192
|
def _ensure_session(self) -> aiohttp.ClientSession:
|
175
193
|
if not self._session:
|
@@ -205,7 +223,7 @@ class TTS(tts.TTS):
|
|
205
223
|
self,
|
206
224
|
text: str,
|
207
225
|
*,
|
208
|
-
conn_options: APIConnectOptions =
|
226
|
+
conn_options: Optional[APIConnectOptions] = None,
|
209
227
|
) -> "ChunkedStream":
|
210
228
|
return ChunkedStream(
|
211
229
|
tts=self,
|
@@ -216,14 +234,18 @@ class TTS(tts.TTS):
|
|
216
234
|
)
|
217
235
|
|
218
236
|
def stream(
|
219
|
-
self, *, conn_options: APIConnectOptions =
|
237
|
+
self, *, conn_options: Optional[APIConnectOptions] = None
|
220
238
|
) -> "SynthesizeStream":
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
)
|
239
|
+
stream = SynthesizeStream(tts=self, pool=self._pool, opts=self._opts)
|
240
|
+
self._streams.add(stream)
|
241
|
+
return stream
|
242
|
+
|
243
|
+
async def aclose(self) -> None:
|
244
|
+
for stream in list(self._streams):
|
245
|
+
await stream.aclose()
|
246
|
+
self._streams.clear()
|
247
|
+
await self._pool.aclose()
|
248
|
+
await super().aclose()
|
227
249
|
|
228
250
|
|
229
251
|
class ChunkedStream(tts.ChunkedStream):
|
@@ -235,7 +257,7 @@ class ChunkedStream(tts.ChunkedStream):
|
|
235
257
|
tts: TTS,
|
236
258
|
input_text: str,
|
237
259
|
opts: _TTSOptions,
|
238
|
-
conn_options: APIConnectOptions,
|
260
|
+
conn_options: Optional[APIConnectOptions] = None,
|
239
261
|
session: aiohttp.ClientSession,
|
240
262
|
) -> None:
|
241
263
|
super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
|
@@ -317,15 +339,15 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
317
339
|
self,
|
318
340
|
*,
|
319
341
|
tts: TTS,
|
320
|
-
|
321
|
-
conn_options: APIConnectOptions,
|
342
|
+
pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
|
322
343
|
opts: _TTSOptions,
|
323
344
|
):
|
324
|
-
super().__init__(tts=tts
|
325
|
-
self._opts, self.
|
345
|
+
super().__init__(tts=tts)
|
346
|
+
self._opts, self._pool = opts, pool
|
326
347
|
self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
|
327
348
|
|
328
349
|
async def _run(self) -> None:
|
350
|
+
request_id = utils.shortuuid()
|
329
351
|
self._segments_ch = utils.aio.Chan[tokenize.WordStream]()
|
330
352
|
|
331
353
|
@utils.log_exceptions(logger=logger)
|
@@ -343,183 +365,177 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
343
365
|
elif isinstance(input, self._FlushSentinel):
|
344
366
|
if word_stream is not None:
|
345
367
|
word_stream.end_input()
|
346
|
-
|
347
368
|
word_stream = None
|
348
|
-
|
349
369
|
self._segments_ch.close()
|
350
370
|
|
351
371
|
@utils.log_exceptions(logger=logger)
|
352
|
-
async def
|
372
|
+
async def _process_segments():
|
353
373
|
async for word_stream in self._segments_ch:
|
354
|
-
await self._run_ws(word_stream)
|
374
|
+
await self._run_ws(word_stream, request_id)
|
355
375
|
|
356
376
|
tasks = [
|
357
377
|
asyncio.create_task(_tokenize_input()),
|
358
|
-
asyncio.create_task(
|
378
|
+
asyncio.create_task(_process_segments()),
|
359
379
|
]
|
360
380
|
try:
|
361
381
|
await asyncio.gather(*tasks)
|
382
|
+
except asyncio.TimeoutError as e:
|
383
|
+
raise APITimeoutError() from e
|
384
|
+
except aiohttp.ClientResponseError as e:
|
385
|
+
raise APIStatusError(
|
386
|
+
message=e.message,
|
387
|
+
status_code=e.status,
|
388
|
+
request_id=request_id,
|
389
|
+
body=None,
|
390
|
+
) from e
|
391
|
+
except Exception as e:
|
392
|
+
raise APIConnectionError() from e
|
362
393
|
finally:
|
363
394
|
await utils.aio.gracefully_cancel(*tasks)
|
364
395
|
|
365
396
|
async def _run_ws(
|
366
397
|
self,
|
367
398
|
word_stream: tokenize.WordStream,
|
368
|
-
|
399
|
+
request_id: str,
|
369
400
|
) -> None:
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
380
|
-
)
|
381
|
-
break
|
382
|
-
except Exception as e:
|
383
|
-
logger.warning(
|
384
|
-
f"failed to connect to 11labs, retrying in {retry_delay}s",
|
385
|
-
exc_info=e,
|
401
|
+
async with self._pool.connection() as ws_conn:
|
402
|
+
segment_id = utils.shortuuid()
|
403
|
+
expected_text = "" # accumulate all tokens sent
|
404
|
+
|
405
|
+
# 11labs protocol expects the first message to be an "init msg"
|
406
|
+
init_pkt = dict(
|
407
|
+
text=" ",
|
408
|
+
voice_settings=_strip_nones(
|
409
|
+
dataclasses.asdict(self._opts.voice.settings)
|
386
410
|
)
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
segment_id = utils.shortuuid()
|
393
|
-
|
394
|
-
# 11labs protocol expects the first message to be an "init msg"
|
395
|
-
init_pkt = dict(
|
396
|
-
text=" ",
|
397
|
-
try_trigger_generation=True,
|
398
|
-
voice_settings=_strip_nones(dataclasses.asdict(self._opts.voice.settings))
|
399
|
-
if self._opts.voice.settings
|
400
|
-
else None,
|
401
|
-
generation_config=dict(
|
402
|
-
chunk_length_schedule=self._opts.chunk_length_schedule
|
403
|
-
),
|
404
|
-
)
|
405
|
-
await ws_conn.send_str(json.dumps(init_pkt))
|
406
|
-
eos_sent = False
|
407
|
-
|
408
|
-
async def send_task():
|
409
|
-
nonlocal eos_sent
|
410
|
-
|
411
|
-
xml_content = []
|
412
|
-
async for data in word_stream:
|
413
|
-
text = data.token
|
414
|
-
|
415
|
-
# send the xml phoneme in one go
|
416
|
-
if (
|
417
|
-
self._opts.enable_ssml_parsing
|
418
|
-
and data.token.startswith("<phoneme")
|
419
|
-
or xml_content
|
420
|
-
):
|
421
|
-
xml_content.append(text)
|
422
|
-
if data.token.find("</phoneme>") > -1:
|
423
|
-
text = self._opts.word_tokenizer.format_words(xml_content)
|
424
|
-
xml_content = []
|
425
|
-
else:
|
426
|
-
continue
|
427
|
-
|
428
|
-
# try_trigger_generation=True is a bad practice, we expose
|
429
|
-
# chunk_length_schedule instead
|
430
|
-
data_pkt = dict(
|
431
|
-
text=f"{text} ", # must always end with a space
|
432
|
-
try_trigger_generation=False,
|
433
|
-
)
|
434
|
-
self._mark_started()
|
435
|
-
await ws_conn.send_str(json.dumps(data_pkt))
|
436
|
-
|
437
|
-
if xml_content:
|
438
|
-
logger.warning("11labs stream ended with incomplete xml content")
|
439
|
-
|
440
|
-
# no more token, mark eos
|
441
|
-
eos_pkt = dict(text="")
|
442
|
-
await ws_conn.send_str(json.dumps(eos_pkt))
|
443
|
-
eos_sent = True
|
444
|
-
|
445
|
-
async def recv_task():
|
446
|
-
nonlocal eos_sent
|
447
|
-
audio_bstream = utils.audio.AudioByteStream(
|
448
|
-
sample_rate=self._opts.sample_rate,
|
449
|
-
num_channels=1,
|
411
|
+
if self._opts.voice.settings
|
412
|
+
else None,
|
413
|
+
generation_config=dict(
|
414
|
+
chunk_length_schedule=self._opts.chunk_length_schedule
|
415
|
+
),
|
450
416
|
)
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
417
|
+
await ws_conn.send_str(json.dumps(init_pkt))
|
418
|
+
|
419
|
+
async def send_task():
|
420
|
+
nonlocal expected_text
|
421
|
+
xml_content = []
|
422
|
+
async for data in word_stream:
|
423
|
+
text = data.token
|
424
|
+
expected_text += text
|
425
|
+
# send the xml phoneme in one go
|
426
|
+
if (
|
427
|
+
self._opts.enable_ssml_parsing
|
428
|
+
and data.token.startswith("<phoneme")
|
429
|
+
or xml_content
|
430
|
+
):
|
431
|
+
xml_content.append(text)
|
432
|
+
if text.find("</phoneme>") > -1:
|
433
|
+
text = self._opts.word_tokenizer.format_words(xml_content)
|
434
|
+
xml_content = []
|
435
|
+
else:
|
436
|
+
continue
|
437
|
+
|
438
|
+
data_pkt = dict(text=f"{text} ") # must always end with a space
|
439
|
+
self._mark_started()
|
440
|
+
await ws_conn.send_str(json.dumps(data_pkt))
|
441
|
+
if xml_content:
|
442
|
+
logger.warning("11labs stream ended with incomplete xml content")
|
443
|
+
await ws_conn.send_str(json.dumps({"flush": True}))
|
444
|
+
|
445
|
+
async def recv_task():
|
446
|
+
nonlocal expected_text
|
447
|
+
received_text = ""
|
448
|
+
audio_bstream = utils.audio.AudioByteStream(
|
449
|
+
sample_rate=self._opts.sample_rate,
|
450
|
+
num_channels=1,
|
451
|
+
)
|
452
|
+
last_frame: rtc.AudioFrame | None = None
|
453
|
+
|
454
|
+
def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
|
455
|
+
nonlocal last_frame
|
456
|
+
if last_frame is not None:
|
457
|
+
self._event_ch.send_nowait(
|
458
|
+
tts.SynthesizedAudio(
|
459
|
+
request_id=request_id,
|
460
|
+
segment_id=segment_id,
|
461
|
+
frame=last_frame,
|
462
|
+
is_final=is_final,
|
463
|
+
)
|
463
464
|
)
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
aiohttp.WSMsgType.CLOSING,
|
474
|
-
):
|
475
|
-
if not eos_sent:
|
465
|
+
last_frame = None
|
466
|
+
|
467
|
+
while True:
|
468
|
+
msg = await ws_conn.receive()
|
469
|
+
if msg.type in (
|
470
|
+
aiohttp.WSMsgType.CLOSED,
|
471
|
+
aiohttp.WSMsgType.CLOSE,
|
472
|
+
aiohttp.WSMsgType.CLOSING,
|
473
|
+
):
|
476
474
|
raise APIStatusError(
|
477
475
|
"11labs connection closed unexpectedly, not all tokens have been consumed",
|
478
476
|
request_id=request_id,
|
479
477
|
)
|
480
|
-
return
|
481
478
|
|
482
|
-
|
483
|
-
|
484
|
-
|
479
|
+
if msg.type != aiohttp.WSMsgType.TEXT:
|
480
|
+
logger.warning("unexpected 11labs message type %s", msg.type)
|
481
|
+
continue
|
485
482
|
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
483
|
+
data = json.loads(msg.data)
|
484
|
+
encoding = _encoding_from_format(self._opts.encoding)
|
485
|
+
if data.get("audio"):
|
486
|
+
b64data = base64.b64decode(data["audio"])
|
487
|
+
if encoding == "mp3":
|
488
|
+
for frame in self._mp3_decoder.decode_chunk(b64data):
|
489
|
+
for frame in audio_bstream.write(frame.data.tobytes()):
|
490
|
+
_send_last_frame(
|
491
|
+
segment_id=segment_id, is_final=False
|
492
|
+
)
|
493
|
+
last_frame = frame
|
494
|
+
else:
|
495
|
+
for frame in audio_bstream.write(b64data):
|
493
496
|
_send_last_frame(segment_id=segment_id, is_final=False)
|
494
497
|
last_frame = frame
|
495
|
-
|
496
|
-
|
497
|
-
for frame in audio_bstream.write(b64data):
|
498
|
+
elif data.get("isFinal"):
|
499
|
+
for frame in audio_bstream.flush():
|
498
500
|
_send_last_frame(segment_id=segment_id, is_final=False)
|
499
501
|
last_frame = frame
|
502
|
+
_send_last_frame(segment_id=segment_id, is_final=True)
|
503
|
+
break
|
504
|
+
elif data.get("error"):
|
505
|
+
logger.error("11labs reported an error: %s", data["error"])
|
506
|
+
else:
|
507
|
+
logger.error("unexpected 11labs message %s", data)
|
500
508
|
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
else:
|
512
|
-
logger.error("unexpected 11labs message %s", data)
|
513
|
-
|
514
|
-
tasks = [
|
515
|
-
asyncio.create_task(send_task()),
|
516
|
-
asyncio.create_task(recv_task()),
|
517
|
-
]
|
509
|
+
if alignment := data.get("normalizedAlignment"):
|
510
|
+
received_text += "".join(alignment.get("chars", [])).replace(
|
511
|
+
" ", ""
|
512
|
+
)
|
513
|
+
if received_text == expected_text:
|
514
|
+
for frame in audio_bstream.flush():
|
515
|
+
_send_last_frame(segment_id=segment_id, is_final=False)
|
516
|
+
last_frame = frame
|
517
|
+
_send_last_frame(segment_id=segment_id, is_final=True)
|
518
|
+
break
|
518
519
|
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
520
|
+
tasks = [
|
521
|
+
asyncio.create_task(send_task()),
|
522
|
+
asyncio.create_task(recv_task()),
|
523
|
+
]
|
524
|
+
try:
|
525
|
+
await asyncio.gather(*tasks)
|
526
|
+
except asyncio.TimeoutError as e:
|
527
|
+
raise APITimeoutError() from e
|
528
|
+
except aiohttp.ClientResponseError as e:
|
529
|
+
raise APIStatusError(
|
530
|
+
message=e.message,
|
531
|
+
status_code=e.status,
|
532
|
+
request_id=request_id,
|
533
|
+
body=None,
|
534
|
+
) from e
|
535
|
+
except Exception as e:
|
536
|
+
raise APIConnectionError() from e
|
537
|
+
finally:
|
538
|
+
await utils.aio.gracefully_cancel(*tasks)
|
523
539
|
|
524
540
|
|
525
541
|
def _dict_to_voices_list(data: dict[str, Any]):
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=_d8V_YLx1tuScKtmDipoKHhqF3y68lXg03phixEHU3M,21419
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=1Trenk6kp4J1gdS0z55hdro60GNOnD1s0F3-AoNr4VM,601
|
7
|
+
livekit_plugins_elevenlabs-0.7.14.dist-info/METADATA,sha256=WGgcKpZb9PYymh1pNvF7B5dhLXUlQj3n0ALlwJmfYfE,1523
|
8
|
+
livekit_plugins_elevenlabs-0.7.14.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
9
|
+
livekit_plugins_elevenlabs-0.7.14.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_elevenlabs-0.7.14.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=jZsn-5s2qLMmrYYGYabV7l3qX99mjtdsA7mYgcJqQcw,19670
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=gGZFCPXFCJwzV7-seSYe-yQUCcH8Tyf19JtzUUCdylU,601
|
7
|
-
livekit_plugins_elevenlabs-0.7.12.dist-info/METADATA,sha256=FtC3EwKcOkiliplwxo_DmfnCexOCtseyAzzLKj3fk-I,1523
|
8
|
-
livekit_plugins_elevenlabs-0.7.12.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
9
|
-
livekit_plugins_elevenlabs-0.7.12.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.7.12.dist-info/RECORD,,
|
File without changes
|