livekit-plugins-elevenlabs 0.8.0__py3-none-any.whl → 0.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,4 +10,12 @@ TTSModels = Literal[
10
10
  "eleven_flash_v2",
11
11
  ]
12
12
 
13
- TTSEncoding = Literal["mp3_44100",]
13
+ TTSEncoding = Literal[
14
+ "mp3_22050_32",
15
+ "mp3_44100",
16
+ "mp3_44100_32",
17
+ "mp3_44100_64",
18
+ "mp3_44100_96",
19
+ "mp3_44100_128",
20
+ "mp3_44100_192",
21
+ ]
@@ -37,7 +37,9 @@ from livekit.agents import (
37
37
  from .log import logger
38
38
  from .models import TTSEncoding, TTSModels
39
39
 
40
- _DefaultEncoding: TTSEncoding = "mp3_44100"
40
+ # by default, use 22.05kHz sample rate at 32kbps
41
+ # in our testing, reduce TTFB by about ~110ms
42
+ _DefaultEncoding: TTSEncoding = "mp3_22050_32"
41
43
 
42
44
 
43
45
  def _sample_rate_from_format(output_format: TTSEncoding) -> int:
@@ -102,6 +104,7 @@ class TTS(tts.TTS):
102
104
  *,
103
105
  voice: Voice = DEFAULT_VOICE,
104
106
  model: TTSModels | str = "eleven_flash_v2_5",
107
+ encoding: TTSEncoding | None = None,
105
108
  api_key: str | None = None,
106
109
  base_url: str | None = None,
107
110
  streaming_latency: int = 0,
@@ -131,11 +134,14 @@ class TTS(tts.TTS):
131
134
  language (str | None): Language code for the TTS model, as of 10/24/24 only valid for "eleven_turbo_v2_5". Optional.
132
135
  """
133
136
 
137
+ if not encoding:
138
+ encoding = _DefaultEncoding
139
+
134
140
  super().__init__(
135
141
  capabilities=tts.TTSCapabilities(
136
142
  streaming=True,
137
143
  ),
138
- sample_rate=_sample_rate_from_format(_DefaultEncoding),
144
+ sample_rate=_sample_rate_from_format(encoding),
139
145
  num_channels=1,
140
146
  )
141
147
 
@@ -161,7 +167,7 @@ class TTS(tts.TTS):
161
167
  model=model,
162
168
  api_key=api_key,
163
169
  base_url=base_url or API_BASE_URL_V1,
164
- encoding=_DefaultEncoding,
170
+ encoding=encoding,
165
171
  sample_rate=self.sample_rate,
166
172
  streaming_latency=streaming_latency,
167
173
  word_tokenizer=word_tokenizer,
@@ -171,36 +177,14 @@ class TTS(tts.TTS):
171
177
  inactivity_timeout=inactivity_timeout,
172
178
  )
173
179
  self._session = http_session
174
- self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
175
- connect_cb=self._connect_ws,
176
- close_cb=self._close_ws,
177
- max_session_duration=inactivity_timeout,
178
- mark_refreshed_on_get=True,
179
- )
180
180
  self._streams = weakref.WeakSet[SynthesizeStream]()
181
181
 
182
- async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
183
- session = self._ensure_session()
184
- return await asyncio.wait_for(
185
- session.ws_connect(
186
- _stream_url(self._opts),
187
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
188
- ),
189
- self._conn_options.timeout,
190
- )
191
-
192
- async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
193
- await ws.close()
194
-
195
182
  def _ensure_session(self) -> aiohttp.ClientSession:
196
183
  if not self._session:
197
184
  self._session = utils.http_context.http_session()
198
185
 
199
186
  return self._session
200
187
 
201
- def prewarm(self) -> None:
202
- self._pool.prewarm()
203
-
204
188
  async def list_voices(self) -> List[Voice]:
205
189
  async with self._ensure_session().get(
206
190
  f"{self._opts.base_url}/voices",
@@ -242,7 +226,12 @@ class TTS(tts.TTS):
242
226
  def stream(
243
227
  self, *, conn_options: Optional[APIConnectOptions] = None
244
228
  ) -> "SynthesizeStream":
245
- stream = SynthesizeStream(tts=self, pool=self._pool, opts=self._opts)
229
+ stream = SynthesizeStream(
230
+ tts=self,
231
+ conn_options=conn_options,
232
+ opts=self._opts,
233
+ session=self._ensure_session(),
234
+ )
246
235
  self._streams.add(stream)
247
236
  return stream
248
237
 
@@ -250,7 +239,6 @@ class TTS(tts.TTS):
250
239
  for stream in list(self._streams):
251
240
  await stream.aclose()
252
241
  self._streams.clear()
253
- await self._pool.aclose()
254
242
  await super().aclose()
255
243
 
256
244
 
@@ -338,11 +326,12 @@ class SynthesizeStream(tts.SynthesizeStream):
338
326
  self,
339
327
  *,
340
328
  tts: TTS,
341
- pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
329
+ session: aiohttp.ClientSession,
342
330
  opts: _TTSOptions,
331
+ conn_options: Optional[APIConnectOptions] = None,
343
332
  ):
344
- super().__init__(tts=tts)
345
- self._opts, self._pool = opts, pool
333
+ super().__init__(tts=tts, conn_options=conn_options)
334
+ self._opts, self._session = opts, session
346
335
 
347
336
  async def _run(self) -> None:
348
337
  request_id = utils.shortuuid()
@@ -397,140 +386,143 @@ class SynthesizeStream(tts.SynthesizeStream):
397
386
  word_stream: tokenize.WordStream,
398
387
  request_id: str,
399
388
  ) -> None:
400
- async with self._pool.connection() as ws_conn:
401
- segment_id = utils.shortuuid()
402
- expected_text = "" # accumulate all tokens sent
389
+ ws_conn = await self._session.ws_connect(
390
+ _stream_url(self._opts),
391
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
392
+ )
403
393
 
404
- decoder = utils.codecs.AudioStreamDecoder(
405
- sample_rate=self._opts.sample_rate,
406
- num_channels=1,
407
- )
394
+ segment_id = utils.shortuuid()
395
+ decoder = utils.codecs.AudioStreamDecoder(
396
+ sample_rate=self._opts.sample_rate,
397
+ num_channels=1,
398
+ )
408
399
 
409
- # 11labs protocol expects the first message to be an "init msg"
410
- init_pkt = dict(
411
- text=" ",
412
- voice_settings=_strip_nones(
413
- dataclasses.asdict(self._opts.voice.settings)
414
- )
415
- if self._opts.voice.settings
416
- else None,
417
- generation_config=dict(
418
- chunk_length_schedule=self._opts.chunk_length_schedule
419
- ),
400
+ # 11labs protocol expects the first message to be an "init msg"
401
+ init_pkt = dict(
402
+ text=" ",
403
+ voice_settings=_strip_nones(dataclasses.asdict(self._opts.voice.settings))
404
+ if self._opts.voice.settings
405
+ else None,
406
+ generation_config=dict(
407
+ chunk_length_schedule=self._opts.chunk_length_schedule
408
+ ),
409
+ )
410
+ await ws_conn.send_str(json.dumps(init_pkt))
411
+ eos_sent = False
412
+
413
+ @utils.log_exceptions(logger=logger)
414
+ async def send_task():
415
+ nonlocal eos_sent
416
+ xml_content = []
417
+ async for data in word_stream:
418
+ text = data.token
419
+ # send the xml phoneme in one go
420
+ if (
421
+ self._opts.enable_ssml_parsing
422
+ and data.token.startswith("<phoneme")
423
+ or xml_content
424
+ ):
425
+ xml_content.append(text)
426
+ if data.token.find("</phoneme>") > -1:
427
+ text = self._opts.word_tokenizer.format_words(xml_content)
428
+ xml_content = []
429
+ else:
430
+ continue
431
+
432
+ data_pkt = dict(text=f"{text} ") # must always end with a space
433
+ self._mark_started()
434
+ await ws_conn.send_str(json.dumps(data_pkt))
435
+ if xml_content:
436
+ logger.warning("11labs stream ended with incomplete xml content")
437
+
438
+ # no more token, mark eos
439
+ eos_pkt = dict(text="")
440
+ await ws_conn.send_str(json.dumps(eos_pkt))
441
+ eos_sent = True
442
+
443
+ # consumes from decoder and generates events
444
+ @utils.log_exceptions(logger=logger)
445
+ async def generate_task():
446
+ emitter = tts.SynthesizedAudioEmitter(
447
+ event_ch=self._event_ch,
448
+ request_id=request_id,
449
+ segment_id=segment_id,
420
450
  )
421
- await ws_conn.send_str(json.dumps(init_pkt))
422
-
423
- @utils.log_exceptions(logger=logger)
424
- async def send_task():
425
- nonlocal expected_text
426
- xml_content = []
427
- async for data in word_stream:
428
- text = data.token
429
- expected_text += text
430
- # send the xml phoneme in one go
431
- if (
432
- self._opts.enable_ssml_parsing
433
- and data.token.startswith("<phoneme")
434
- or xml_content
435
- ):
436
- xml_content.append(text)
437
- if text.find("</phoneme>") > -1:
438
- text = self._opts.word_tokenizer.format_words(xml_content)
439
- xml_content = []
440
- else:
441
- continue
442
-
443
- data_pkt = dict(text=f"{text} ") # must always end with a space
444
- self._mark_started()
445
- await ws_conn.send_str(json.dumps(data_pkt))
446
- if xml_content:
447
- logger.warning("11labs stream ended with incomplete xml content")
448
- await ws_conn.send_str(json.dumps({"flush": True}))
449
-
450
- # consumes from decoder and generates events
451
- @utils.log_exceptions(logger=logger)
452
- async def generate_task():
453
- emitter = tts.SynthesizedAudioEmitter(
454
- event_ch=self._event_ch,
455
- request_id=request_id,
456
- segment_id=segment_id,
457
- )
458
- async for frame in decoder:
459
- emitter.push(frame)
460
- emitter.flush()
451
+ async for frame in decoder:
452
+ emitter.push(frame)
453
+ emitter.flush()
461
454
 
462
- # receives from ws and decodes audio
463
- @utils.log_exceptions(logger=logger)
464
- async def recv_task():
465
- nonlocal expected_text
466
- received_text = ""
467
-
468
- while True:
469
- msg = await ws_conn.receive()
470
- if msg.type in (
471
- aiohttp.WSMsgType.CLOSED,
472
- aiohttp.WSMsgType.CLOSE,
473
- aiohttp.WSMsgType.CLOSING,
474
- ):
455
+ # receives from ws and decodes audio
456
+ @utils.log_exceptions(logger=logger)
457
+ async def recv_task():
458
+ nonlocal eos_sent
459
+
460
+ while True:
461
+ msg = await ws_conn.receive()
462
+ if msg.type in (
463
+ aiohttp.WSMsgType.CLOSED,
464
+ aiohttp.WSMsgType.CLOSE,
465
+ aiohttp.WSMsgType.CLOSING,
466
+ ):
467
+ if not eos_sent:
475
468
  raise APIStatusError(
476
469
  "11labs connection closed unexpectedly, not all tokens have been consumed",
477
470
  request_id=request_id,
478
471
  )
472
+ return
479
473
 
480
- if msg.type != aiohttp.WSMsgType.TEXT:
481
- logger.warning("unexpected 11labs message type %s", msg.type)
482
- continue
483
-
484
- data = json.loads(msg.data)
485
- if data.get("audio"):
486
- b64data = base64.b64decode(data["audio"])
487
- decoder.push(b64data)
488
-
489
- if alignment := data.get("normalizedAlignment"):
490
- received_text += "".join(
491
- alignment.get("chars", [])
492
- ).replace(" ", "")
493
- if received_text == expected_text:
494
- decoder.end_input()
495
- break
496
- elif data.get("error"):
497
- raise APIStatusError(
498
- message=data["error"],
499
- status_code=500,
500
- request_id=request_id,
501
- body=None,
502
- )
503
- else:
504
- raise APIStatusError(
505
- message=f"unexpected 11labs message {data}",
506
- status_code=500,
507
- request_id=request_id,
508
- body=None,
509
- )
474
+ if msg.type != aiohttp.WSMsgType.TEXT:
475
+ logger.warning("unexpected 11labs message type %s", msg.type)
476
+ continue
477
+
478
+ data = json.loads(msg.data)
479
+ if data.get("audio"):
480
+ b64data = base64.b64decode(data["audio"])
481
+ decoder.push(b64data)
482
+
483
+ elif data.get("isFinal"):
484
+ decoder.end_input()
485
+ break
486
+ elif data.get("error"):
487
+ raise APIStatusError(
488
+ message=data["error"],
489
+ status_code=500,
490
+ request_id=request_id,
491
+ body=None,
492
+ )
493
+ else:
494
+ raise APIStatusError(
495
+ message=f"unexpected 11labs message {data}",
496
+ status_code=500,
497
+ request_id=request_id,
498
+ body=None,
499
+ )
510
500
 
511
- tasks = [
512
- asyncio.create_task(send_task()),
513
- asyncio.create_task(recv_task()),
514
- asyncio.create_task(generate_task()),
515
- ]
516
- try:
517
- await asyncio.gather(*tasks)
518
- except asyncio.TimeoutError as e:
519
- raise APITimeoutError() from e
520
- except aiohttp.ClientResponseError as e:
521
- raise APIStatusError(
522
- message=e.message,
523
- status_code=e.status,
524
- request_id=request_id,
525
- body=None,
526
- ) from e
527
- except APIStatusError:
528
- raise
529
- except Exception as e:
530
- raise APIConnectionError() from e
531
- finally:
532
- await utils.aio.gracefully_cancel(*tasks)
533
- await decoder.aclose()
501
+ tasks = [
502
+ asyncio.create_task(send_task()),
503
+ asyncio.create_task(recv_task()),
504
+ asyncio.create_task(generate_task()),
505
+ ]
506
+ try:
507
+ await asyncio.gather(*tasks)
508
+ except asyncio.TimeoutError as e:
509
+ raise APITimeoutError() from e
510
+ except aiohttp.ClientResponseError as e:
511
+ raise APIStatusError(
512
+ message=e.message,
513
+ status_code=e.status,
514
+ request_id=request_id,
515
+ body=None,
516
+ ) from e
517
+ except APIStatusError:
518
+ raise
519
+ except Exception as e:
520
+ raise APIConnectionError() from e
521
+ finally:
522
+ await utils.aio.gracefully_cancel(*tasks)
523
+ await decoder.aclose()
524
+ if ws_conn is not None:
525
+ await ws_conn.close()
534
526
 
535
527
 
536
528
  def _dict_to_voices_list(data: dict[str, Any]):
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.8.0"
15
+ __version__ = "0.8.2"
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.8.0
3
+ Version: 0.8.2
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=p_wHEz15bdsNEqwzN831ysm70PNWQ-xeN__BKvGPZxA,401
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=ipXJdSYMKBd8mzTL3JfvYdRc2sJJRASOPPh2Ppy8NBk,19529
6
+ livekit/plugins/elevenlabs/version.py,sha256=qwktN8wnyHMjA3ewh43aDIBBwMd3jorNpCaoGTqBDrw,600
7
+ livekit_plugins_elevenlabs-0.8.2.dist-info/METADATA,sha256=mOzHe4OynY-A7OK1hi1OK4eXTnMRvKGG0CYjW_kXz0s,1529
8
+ livekit_plugins_elevenlabs-0.8.2.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
9
+ livekit_plugins_elevenlabs-0.8.2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.8.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: setuptools (78.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
6
- livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
7
- livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
8
- livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
9
- livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,