livekit-plugins-elevenlabs 0.7.13__py3-none-any.whl → 0.7.14__py3-none-any.whl

This diff shows the differences in content between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -19,13 +19,13 @@ import base64
19
19
  import dataclasses
20
20
  import json
21
21
  import os
22
+ import weakref
22
23
  from dataclasses import dataclass
23
- from typing import Any, List, Literal
24
+ from typing import Any, List, Literal, Optional
24
25
 
25
26
  import aiohttp
26
27
  from livekit import rtc
27
28
  from livekit.agents import (
28
- DEFAULT_API_CONNECT_OPTIONS,
29
29
  APIConnectionError,
30
30
  APIConnectOptions,
31
31
  APIStatusError,
@@ -170,6 +170,24 @@ class TTS(tts.TTS):
170
170
  language=language,
171
171
  )
172
172
  self._session = http_session
173
+ self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
174
+ connect_cb=self._connect_ws,
175
+ close_cb=self._close_ws,
176
+ )
177
+ self._streams = weakref.WeakSet[SynthesizeStream]()
178
+
179
+ async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
180
+ session = self._ensure_session()
181
+ return await asyncio.wait_for(
182
+ session.ws_connect(
183
+ _stream_url(self._opts),
184
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
185
+ ),
186
+ self._conn_options.timeout,
187
+ )
188
+
189
+ async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
190
+ await ws.close()
173
191
 
174
192
  def _ensure_session(self) -> aiohttp.ClientSession:
175
193
  if not self._session:
@@ -205,7 +223,7 @@ class TTS(tts.TTS):
205
223
  self,
206
224
  text: str,
207
225
  *,
208
- conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
226
+ conn_options: Optional[APIConnectOptions] = None,
209
227
  ) -> "ChunkedStream":
210
228
  return ChunkedStream(
211
229
  tts=self,
@@ -216,14 +234,18 @@ class TTS(tts.TTS):
216
234
  )
217
235
 
218
236
  def stream(
219
- self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
237
+ self, *, conn_options: Optional[APIConnectOptions] = None
220
238
  ) -> "SynthesizeStream":
221
- return SynthesizeStream(
222
- tts=self,
223
- conn_options=conn_options,
224
- opts=self._opts,
225
- session=self._ensure_session(),
226
- )
239
+ stream = SynthesizeStream(tts=self, pool=self._pool, opts=self._opts)
240
+ self._streams.add(stream)
241
+ return stream
242
+
243
+ async def aclose(self) -> None:
244
+ for stream in list(self._streams):
245
+ await stream.aclose()
246
+ self._streams.clear()
247
+ await self._pool.aclose()
248
+ await super().aclose()
227
249
 
228
250
 
229
251
  class ChunkedStream(tts.ChunkedStream):
@@ -235,7 +257,7 @@ class ChunkedStream(tts.ChunkedStream):
235
257
  tts: TTS,
236
258
  input_text: str,
237
259
  opts: _TTSOptions,
238
- conn_options: APIConnectOptions,
260
+ conn_options: Optional[APIConnectOptions] = None,
239
261
  session: aiohttp.ClientSession,
240
262
  ) -> None:
241
263
  super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
@@ -317,15 +339,15 @@ class SynthesizeStream(tts.SynthesizeStream):
317
339
  self,
318
340
  *,
319
341
  tts: TTS,
320
- session: aiohttp.ClientSession,
321
- conn_options: APIConnectOptions,
342
+ pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
322
343
  opts: _TTSOptions,
323
344
  ):
324
- super().__init__(tts=tts, conn_options=conn_options)
325
- self._opts, self._session = opts, session
345
+ super().__init__(tts=tts)
346
+ self._opts, self._pool = opts, pool
326
347
  self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
327
348
 
328
349
  async def _run(self) -> None:
350
+ request_id = utils.shortuuid()
329
351
  self._segments_ch = utils.aio.Chan[tokenize.WordStream]()
330
352
 
331
353
  @utils.log_exceptions(logger=logger)
@@ -343,185 +365,177 @@ class SynthesizeStream(tts.SynthesizeStream):
343
365
  elif isinstance(input, self._FlushSentinel):
344
366
  if word_stream is not None:
345
367
  word_stream.end_input()
346
-
347
368
  word_stream = None
348
-
349
369
  self._segments_ch.close()
350
370
 
351
371
  @utils.log_exceptions(logger=logger)
352
- async def _run():
372
+ async def _process_segments():
353
373
  async for word_stream in self._segments_ch:
354
- await self._run_ws(word_stream)
374
+ await self._run_ws(word_stream, request_id)
355
375
 
356
376
  tasks = [
357
377
  asyncio.create_task(_tokenize_input()),
358
- asyncio.create_task(_run()),
378
+ asyncio.create_task(_process_segments()),
359
379
  ]
360
380
  try:
361
381
  await asyncio.gather(*tasks)
382
+ except asyncio.TimeoutError as e:
383
+ raise APITimeoutError() from e
384
+ except aiohttp.ClientResponseError as e:
385
+ raise APIStatusError(
386
+ message=e.message,
387
+ status_code=e.status,
388
+ request_id=request_id,
389
+ body=None,
390
+ ) from e
391
+ except Exception as e:
392
+ raise APIConnectionError() from e
362
393
  finally:
363
394
  await utils.aio.gracefully_cancel(*tasks)
364
395
 
365
396
  async def _run_ws(
366
397
  self,
367
398
  word_stream: tokenize.WordStream,
368
- max_retry: int = 3,
399
+ request_id: str,
369
400
  ) -> None:
370
- ws_conn: aiohttp.ClientWebSocketResponse | None = None
371
- for try_i in range(max_retry):
372
- retry_delay = 5
373
- try:
374
- if try_i > 0:
375
- await asyncio.sleep(retry_delay)
376
-
377
- ws_conn = await self._session.ws_connect(
378
- _stream_url(self._opts),
379
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
380
- )
381
- break
382
- except Exception as e:
383
- logger.warning(
384
- f"failed to connect to 11labs, retrying in {retry_delay}s",
385
- exc_info=e,
401
+ async with self._pool.connection() as ws_conn:
402
+ segment_id = utils.shortuuid()
403
+ expected_text = "" # accumulate all tokens sent
404
+
405
+ # 11labs protocol expects the first message to be an "init msg"
406
+ init_pkt = dict(
407
+ text=" ",
408
+ voice_settings=_strip_nones(
409
+ dataclasses.asdict(self._opts.voice.settings)
386
410
  )
387
-
388
- if ws_conn is None:
389
- raise Exception(f"failed to connect to 11labs after {max_retry} retries")
390
-
391
- request_id = utils.shortuuid()
392
- segment_id = utils.shortuuid()
393
-
394
- # 11labs protocol expects the first message to be an "init msg"
395
- init_pkt = dict(
396
- text=" ",
397
- try_trigger_generation=True,
398
- voice_settings=_strip_nones(dataclasses.asdict(self._opts.voice.settings))
399
- if self._opts.voice.settings
400
- else None,
401
- generation_config=dict(
402
- chunk_length_schedule=self._opts.chunk_length_schedule
403
- ),
404
- )
405
- await ws_conn.send_str(json.dumps(init_pkt))
406
- eos_sent = False
407
-
408
- async def send_task():
409
- nonlocal eos_sent
410
-
411
- xml_content = []
412
- async for data in word_stream:
413
- text = data.token
414
-
415
- # send the xml phoneme in one go
416
- if (
417
- self._opts.enable_ssml_parsing
418
- and data.token.startswith("<phoneme")
419
- or xml_content
420
- ):
421
- xml_content.append(text)
422
- if data.token.find("</phoneme>") > -1:
423
- text = self._opts.word_tokenizer.format_words(xml_content)
424
- xml_content = []
425
- else:
426
- continue
427
-
428
- # try_trigger_generation=True is a bad practice, we expose
429
- # chunk_length_schedule instead
430
- data_pkt = dict(
431
- text=f"{text} ", # must always end with a space
432
- try_trigger_generation=False,
433
- )
434
- self._mark_started()
435
- await ws_conn.send_str(json.dumps(data_pkt))
436
-
437
- if xml_content:
438
- logger.warning("11labs stream ended with incomplete xml content")
439
-
440
- # no more token, mark eos
441
- eos_pkt = dict(text="")
442
- await ws_conn.send_str(json.dumps(eos_pkt))
443
- eos_sent = True
444
-
445
- async def recv_task():
446
- nonlocal eos_sent
447
- audio_bstream = utils.audio.AudioByteStream(
448
- sample_rate=self._opts.sample_rate,
449
- num_channels=1,
411
+ if self._opts.voice.settings
412
+ else None,
413
+ generation_config=dict(
414
+ chunk_length_schedule=self._opts.chunk_length_schedule
415
+ ),
450
416
  )
451
-
452
- last_frame: rtc.AudioFrame | None = None
453
-
454
- def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
455
- nonlocal last_frame
456
- if last_frame is not None:
457
- self._event_ch.send_nowait(
458
- tts.SynthesizedAudio(
459
- request_id=request_id,
460
- segment_id=segment_id,
461
- frame=last_frame,
462
- is_final=is_final,
417
+ await ws_conn.send_str(json.dumps(init_pkt))
418
+
419
+ async def send_task():
420
+ nonlocal expected_text
421
+ xml_content = []
422
+ async for data in word_stream:
423
+ text = data.token
424
+ expected_text += text
425
+ # send the xml phoneme in one go
426
+ if (
427
+ self._opts.enable_ssml_parsing
428
+ and data.token.startswith("<phoneme")
429
+ or xml_content
430
+ ):
431
+ xml_content.append(text)
432
+ if text.find("</phoneme>") > -1:
433
+ text = self._opts.word_tokenizer.format_words(xml_content)
434
+ xml_content = []
435
+ else:
436
+ continue
437
+
438
+ data_pkt = dict(text=f"{text} ") # must always end with a space
439
+ self._mark_started()
440
+ await ws_conn.send_str(json.dumps(data_pkt))
441
+ if xml_content:
442
+ logger.warning("11labs stream ended with incomplete xml content")
443
+ await ws_conn.send_str(json.dumps({"flush": True}))
444
+
445
+ async def recv_task():
446
+ nonlocal expected_text
447
+ received_text = ""
448
+ audio_bstream = utils.audio.AudioByteStream(
449
+ sample_rate=self._opts.sample_rate,
450
+ num_channels=1,
451
+ )
452
+ last_frame: rtc.AudioFrame | None = None
453
+
454
+ def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
455
+ nonlocal last_frame
456
+ if last_frame is not None:
457
+ self._event_ch.send_nowait(
458
+ tts.SynthesizedAudio(
459
+ request_id=request_id,
460
+ segment_id=segment_id,
461
+ frame=last_frame,
462
+ is_final=is_final,
463
+ )
463
464
  )
464
- )
465
-
466
- last_frame = None
467
-
468
- while True:
469
- msg = await ws_conn.receive()
470
- if msg.type in (
471
- aiohttp.WSMsgType.CLOSED,
472
- aiohttp.WSMsgType.CLOSE,
473
- aiohttp.WSMsgType.CLOSING,
474
- ):
475
- if not eos_sent:
465
+ last_frame = None
466
+
467
+ while True:
468
+ msg = await ws_conn.receive()
469
+ if msg.type in (
470
+ aiohttp.WSMsgType.CLOSED,
471
+ aiohttp.WSMsgType.CLOSE,
472
+ aiohttp.WSMsgType.CLOSING,
473
+ ):
476
474
  raise APIStatusError(
477
475
  "11labs connection closed unexpectedly, not all tokens have been consumed",
478
476
  request_id=request_id,
479
477
  )
480
- return
481
478
 
482
- if msg.type != aiohttp.WSMsgType.TEXT:
483
- logger.warning("unexpected 11labs message type %s", msg.type)
484
- continue
479
+ if msg.type != aiohttp.WSMsgType.TEXT:
480
+ logger.warning("unexpected 11labs message type %s", msg.type)
481
+ continue
485
482
 
486
- data = json.loads(msg.data)
487
- encoding = _encoding_from_format(self._opts.encoding)
488
- if data.get("audio"):
489
- b64data = base64.b64decode(data["audio"])
490
- if encoding == "mp3":
491
- for frame in self._mp3_decoder.decode_chunk(b64data):
492
- for frame in audio_bstream.write(frame.data.tobytes()):
483
+ data = json.loads(msg.data)
484
+ encoding = _encoding_from_format(self._opts.encoding)
485
+ if data.get("audio"):
486
+ b64data = base64.b64decode(data["audio"])
487
+ if encoding == "mp3":
488
+ for frame in self._mp3_decoder.decode_chunk(b64data):
489
+ for frame in audio_bstream.write(frame.data.tobytes()):
490
+ _send_last_frame(
491
+ segment_id=segment_id, is_final=False
492
+ )
493
+ last_frame = frame
494
+ else:
495
+ for frame in audio_bstream.write(b64data):
493
496
  _send_last_frame(segment_id=segment_id, is_final=False)
494
497
  last_frame = frame
495
-
496
- else:
497
- for frame in audio_bstream.write(b64data):
498
+ elif data.get("isFinal"):
499
+ for frame in audio_bstream.flush():
498
500
  _send_last_frame(segment_id=segment_id, is_final=False)
499
501
  last_frame = frame
502
+ _send_last_frame(segment_id=segment_id, is_final=True)
503
+ break
504
+ elif data.get("error"):
505
+ logger.error("11labs reported an error: %s", data["error"])
506
+ else:
507
+ logger.error("unexpected 11labs message %s", data)
500
508
 
501
- elif data.get("isFinal"):
502
- for frame in audio_bstream.flush():
503
- _send_last_frame(segment_id=segment_id, is_final=False)
504
- last_frame = frame
505
-
506
- _send_last_frame(segment_id=segment_id, is_final=True)
507
-
508
- pass
509
- elif data.get("error"):
510
- logger.error("11labs reported an error: %s", data["error"])
511
- else:
512
- logger.error("unexpected 11labs message %s", data)
513
-
514
- tasks = [
515
- asyncio.create_task(send_task()),
516
- asyncio.create_task(recv_task()),
517
- ]
509
+ if alignment := data.get("normalizedAlignment"):
510
+ received_text += "".join(alignment.get("chars", [])).replace(
511
+ " ", ""
512
+ )
513
+ if received_text == expected_text:
514
+ for frame in audio_bstream.flush():
515
+ _send_last_frame(segment_id=segment_id, is_final=False)
516
+ last_frame = frame
517
+ _send_last_frame(segment_id=segment_id, is_final=True)
518
+ break
518
519
 
519
- try:
520
- await asyncio.gather(*tasks)
521
- finally:
522
- await utils.aio.gracefully_cancel(*tasks)
523
- if ws_conn is not None:
524
- await ws_conn.close()
520
+ tasks = [
521
+ asyncio.create_task(send_task()),
522
+ asyncio.create_task(recv_task()),
523
+ ]
524
+ try:
525
+ await asyncio.gather(*tasks)
526
+ except asyncio.TimeoutError as e:
527
+ raise APITimeoutError() from e
528
+ except aiohttp.ClientResponseError as e:
529
+ raise APIStatusError(
530
+ message=e.message,
531
+ status_code=e.status,
532
+ request_id=request_id,
533
+ body=None,
534
+ ) from e
535
+ except Exception as e:
536
+ raise APIConnectionError() from e
537
+ finally:
538
+ await utils.aio.gracefully_cancel(*tasks)
525
539
 
526
540
 
527
541
  def _dict_to_voices_list(data: dict[str, Any]):
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.7.13"
15
+ __version__ = "0.7.14"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.7.13
3
+ Version: 0.7.14
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=_d8V_YLx1tuScKtmDipoKHhqF3y68lXg03phixEHU3M,21419
6
+ livekit/plugins/elevenlabs/version.py,sha256=1Trenk6kp4J1gdS0z55hdro60GNOnD1s0F3-AoNr4VM,601
7
+ livekit_plugins_elevenlabs-0.7.14.dist-info/METADATA,sha256=WGgcKpZb9PYymh1pNvF7B5dhLXUlQj3n0ALlwJmfYfE,1523
8
+ livekit_plugins_elevenlabs-0.7.14.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
9
+ livekit_plugins_elevenlabs-0.7.14.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.7.14.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=dFeEtnNYR0sIDjQZARvUb6cG3VUD4gUNU3hpbFGpyNo,19744
6
- livekit/plugins/elevenlabs/version.py,sha256=ePihhrwb0N1YVz4mZBMdwgECen0up-RbS8yrvmQGHt4,601
7
- livekit_plugins_elevenlabs-0.7.13.dist-info/METADATA,sha256=en67LviRFvRsErhZ5qvb8UvbmQc2nLcYSijszgdMj1Q,1523
8
- livekit_plugins_elevenlabs-0.7.13.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
9
- livekit_plugins_elevenlabs-0.7.13.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.7.13.dist-info/RECORD,,