livekit-plugins-elevenlabs 0.7.14__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,14 +10,4 @@ TTSModels = Literal[
10
10
  "eleven_flash_v2",
11
11
  ]
12
12
 
13
- TTSEncoding = Literal[
14
- "mp3_22050_32",
15
- "mp3_44100_32",
16
- "mp3_44100_64",
17
- "mp3_44100_96",
18
- "mp3_44100_128",
19
- "mp3_44100_192",
20
- "pcm_16000",
21
- "pcm_22050",
22
- "pcm_44100",
23
- ]
13
+ TTSEncoding = Literal["mp3_44100",]
@@ -21,10 +21,9 @@ import json
21
21
  import os
22
22
  import weakref
23
23
  from dataclasses import dataclass
24
- from typing import Any, List, Literal, Optional
24
+ from typing import Any, List, Optional
25
25
 
26
26
  import aiohttp
27
- from livekit import rtc
28
27
  from livekit.agents import (
29
28
  APIConnectionError,
30
29
  APIConnectOptions,
@@ -38,28 +37,20 @@ from livekit.agents import (
38
37
  from .log import logger
39
38
  from .models import TTSEncoding, TTSModels
40
39
 
41
- _Encoding = Literal["mp3", "pcm"]
40
+ _DefaultEncoding: TTSEncoding = "mp3_44100"
42
41
 
43
42
 
44
43
  def _sample_rate_from_format(output_format: TTSEncoding) -> int:
45
- split = output_format.split("_") # e.g: mp3_22050_32
44
+ split = output_format.split("_") # e.g: mp3_44100
46
45
  return int(split[1])
47
46
 
48
47
 
49
- def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
50
- if output_format.startswith("mp3"):
51
- return "mp3"
52
- elif output_format.startswith("pcm"):
53
- return "pcm"
54
-
55
- raise ValueError(f"Unknown format: {output_format}")
56
-
57
-
58
48
  @dataclass
59
49
  class VoiceSettings:
60
50
  stability: float # [0.0 - 1.0]
61
51
  similarity_boost: float # [0.0 - 1.0]
62
52
  style: float | None = None # [0.0 - 1.0]
53
+ speed: float | None = 1.0 # [0.8 - 1.2]
63
54
  use_speaker_boost: bool | None = False
64
55
 
65
56
 
@@ -76,12 +67,17 @@ DEFAULT_VOICE = Voice(
76
67
  name="Bella",
77
68
  category="premade",
78
69
  settings=VoiceSettings(
79
- stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
70
+ stability=0.71,
71
+ speed=1.0,
72
+ similarity_boost=0.5,
73
+ style=0.0,
74
+ use_speaker_boost=True,
80
75
  ),
81
76
  )
82
77
 
83
78
  API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
84
79
  AUTHORIZATION_HEADER = "xi-api-key"
80
+ WS_INACTIVITY_TIMEOUT = 300
85
81
 
86
82
 
87
83
  @dataclass
@@ -97,6 +93,7 @@ class _TTSOptions:
97
93
  word_tokenizer: tokenize.WordTokenizer
98
94
  chunk_length_schedule: list[int]
99
95
  enable_ssml_parsing: bool
96
+ inactivity_timeout: int
100
97
 
101
98
 
102
99
  class TTS(tts.TTS):
@@ -107,11 +104,9 @@ class TTS(tts.TTS):
107
104
  model: TTSModels | str = "eleven_flash_v2_5",
108
105
  api_key: str | None = None,
109
106
  base_url: str | None = None,
110
- encoding: TTSEncoding = "mp3_22050_32",
111
- streaming_latency: int = 3,
112
- word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
113
- ignore_punctuation=False # punctuation can help for intonation
114
- ),
107
+ streaming_latency: int = 0,
108
+ inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
109
+ word_tokenizer: Optional[tokenize.WordTokenizer] = None,
115
110
  enable_ssml_parsing: bool = False,
116
111
  chunk_length_schedule: list[int] = [80, 120, 200, 260], # range is [50, 500]
117
112
  http_session: aiohttp.ClientSession | None = None,
@@ -127,8 +122,8 @@ class TTS(tts.TTS):
127
122
  model (TTSModels | str): TTS model to use. Defaults to "eleven_flash_v2_5".
128
123
  api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
129
124
  base_url (str | None): Custom base URL for the API. Optional.
130
- encoding (TTSEncoding): Audio encoding format. Defaults to "mp3_22050_32".
131
- streaming_latency (int): Latency in seconds for streaming. Defaults to 3.
125
+ streaming_latency (int): Deprecated. Streaming latency optimization level. Defaults to 0 (disabled, parameter omitted from the request); 4 applies the maximum latency optimizations.
126
+ inactivity_timeout (int): Inactivity timeout in seconds for the websocket connection. Defaults to 300.
132
127
  word_tokenizer (tokenize.WordTokenizer | None): Tokenizer for processing text. Defaults to None, in which case a basic WordTokenizer that keeps punctuation is used.
133
128
  enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
134
129
  chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
@@ -140,7 +135,7 @@ class TTS(tts.TTS):
140
135
  capabilities=tts.TTSCapabilities(
141
136
  streaming=True,
142
137
  ),
143
- sample_rate=_sample_rate_from_format(encoding),
138
+ sample_rate=_sample_rate_from_format(_DefaultEncoding),
144
139
  num_channels=1,
145
140
  )
146
141
 
@@ -156,23 +151,31 @@ class TTS(tts.TTS):
156
151
  "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
157
152
  )
158
153
 
154
+ if word_tokenizer is None:
155
+ word_tokenizer = tokenize.basic.WordTokenizer(
156
+ ignore_punctuation=False # punctuation can help for intonation
157
+ )
158
+
159
159
  self._opts = _TTSOptions(
160
160
  voice=voice,
161
161
  model=model,
162
162
  api_key=api_key,
163
163
  base_url=base_url or API_BASE_URL_V1,
164
- encoding=encoding,
164
+ encoding=_DefaultEncoding,
165
165
  sample_rate=self.sample_rate,
166
166
  streaming_latency=streaming_latency,
167
167
  word_tokenizer=word_tokenizer,
168
168
  chunk_length_schedule=chunk_length_schedule,
169
169
  enable_ssml_parsing=enable_ssml_parsing,
170
170
  language=language,
171
+ inactivity_timeout=inactivity_timeout,
171
172
  )
172
173
  self._session = http_session
173
174
  self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
174
175
  connect_cb=self._connect_ws,
175
176
  close_cb=self._close_ws,
177
+ max_session_duration=inactivity_timeout,
178
+ mark_refreshed_on_get=True,
176
179
  )
177
180
  self._streams = weakref.WeakSet[SynthesizeStream]()
178
181
 
@@ -195,6 +198,9 @@ class TTS(tts.TTS):
195
198
 
196
199
  return self._session
197
200
 
201
+ def prewarm(self) -> None:
202
+ self._pool.prewarm()
203
+
198
204
  async def list_voices(self) -> List[Voice]:
199
205
  async with self._ensure_session().get(
200
206
  f"{self._opts.base_url}/voices",
@@ -262,15 +268,9 @@ class ChunkedStream(tts.ChunkedStream):
262
268
  ) -> None:
263
269
  super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
264
270
  self._opts, self._session = opts, session
265
- if _encoding_from_format(self._opts.encoding) == "mp3":
266
- self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
267
271
 
268
272
  async def _run(self) -> None:
269
273
  request_id = utils.shortuuid()
270
- bstream = utils.audio.AudioByteStream(
271
- sample_rate=self._opts.sample_rate, num_channels=1
272
- )
273
-
274
274
  voice_settings = (
275
275
  _strip_nones(dataclasses.asdict(self._opts.voice.settings))
276
276
  if self._opts.voice.settings
@@ -282,6 +282,12 @@ class ChunkedStream(tts.ChunkedStream):
282
282
  "voice_settings": voice_settings,
283
283
  }
284
284
 
285
+ decoder = utils.codecs.AudioStreamDecoder(
286
+ sample_rate=self._opts.sample_rate,
287
+ num_channels=1,
288
+ )
289
+
290
+ decode_task: asyncio.Task | None = None
285
291
  try:
286
292
  async with self._session.post(
287
293
  _synthesize_url(self._opts),
@@ -293,32 +299,21 @@ class ChunkedStream(tts.ChunkedStream):
293
299
  logger.error("11labs returned non-audio data: %s", content)
294
300
  return
295
301
 
296
- encoding = _encoding_from_format(self._opts.encoding)
297
- if encoding == "mp3":
298
- async for bytes_data, _ in resp.content.iter_chunks():
299
- for frame in self._mp3_decoder.decode_chunk(bytes_data):
300
- for frame in bstream.write(frame.data.tobytes()):
301
- self._event_ch.send_nowait(
302
- tts.SynthesizedAudio(
303
- request_id=request_id,
304
- frame=frame,
305
- )
306
- )
307
- else:
308
- async for bytes_data, _ in resp.content.iter_chunks():
309
- for frame in bstream.write(bytes_data):
310
- self._event_ch.send_nowait(
311
- tts.SynthesizedAudio(
312
- request_id=request_id,
313
- frame=frame,
314
- )
315
- )
316
-
317
- for frame in bstream.flush():
318
- self._event_ch.send_nowait(
319
- tts.SynthesizedAudio(request_id=request_id, frame=frame)
320
- )
302
+ async def _decode_loop():
303
+ try:
304
+ async for bytes_data, _ in resp.content.iter_chunks():
305
+ decoder.push(bytes_data)
306
+ finally:
307
+ decoder.end_input()
321
308
 
309
+ decode_task = asyncio.create_task(_decode_loop())
310
+ emitter = tts.SynthesizedAudioEmitter(
311
+ event_ch=self._event_ch,
312
+ request_id=request_id,
313
+ )
314
+ async for frame in decoder:
315
+ emitter.push(frame)
316
+ emitter.flush()
322
317
  except asyncio.TimeoutError as e:
323
318
  raise APITimeoutError() from e
324
319
  except aiohttp.ClientResponseError as e:
@@ -330,6 +325,10 @@ class ChunkedStream(tts.ChunkedStream):
330
325
  ) from e
331
326
  except Exception as e:
332
327
  raise APIConnectionError() from e
328
+ finally:
329
+ if decode_task:
330
+ await utils.aio.gracefully_cancel(decode_task)
331
+ await decoder.aclose()
333
332
 
334
333
 
335
334
  class SynthesizeStream(tts.SynthesizeStream):
@@ -344,7 +343,6 @@ class SynthesizeStream(tts.SynthesizeStream):
344
343
  ):
345
344
  super().__init__(tts=tts)
346
345
  self._opts, self._pool = opts, pool
347
- self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
348
346
 
349
347
  async def _run(self) -> None:
350
348
  request_id = utils.shortuuid()
@@ -360,12 +358,13 @@ class SynthesizeStream(tts.SynthesizeStream):
360
358
  # new segment (after flush for e.g)
361
359
  word_stream = self._opts.word_tokenizer.stream()
362
360
  self._segments_ch.send_nowait(word_stream)
363
-
364
361
  word_stream.push_text(input)
365
362
  elif isinstance(input, self._FlushSentinel):
366
363
  if word_stream is not None:
367
364
  word_stream.end_input()
368
365
  word_stream = None
366
+ if word_stream is not None:
367
+ word_stream.end_input()
369
368
  self._segments_ch.close()
370
369
 
371
370
  @utils.log_exceptions(logger=logger)
@@ -402,6 +401,11 @@ class SynthesizeStream(tts.SynthesizeStream):
402
401
  segment_id = utils.shortuuid()
403
402
  expected_text = "" # accumulate all tokens sent
404
403
 
404
+ decoder = utils.codecs.AudioStreamDecoder(
405
+ sample_rate=self._opts.sample_rate,
406
+ num_channels=1,
407
+ )
408
+
405
409
  # 11labs protocol expects the first message to be an "init msg"
406
410
  init_pkt = dict(
407
411
  text=" ",
@@ -416,6 +420,7 @@ class SynthesizeStream(tts.SynthesizeStream):
416
420
  )
417
421
  await ws_conn.send_str(json.dumps(init_pkt))
418
422
 
423
+ @utils.log_exceptions(logger=logger)
419
424
  async def send_task():
420
425
  nonlocal expected_text
421
426
  xml_content = []
@@ -442,27 +447,23 @@ class SynthesizeStream(tts.SynthesizeStream):
442
447
  logger.warning("11labs stream ended with incomplete xml content")
443
448
  await ws_conn.send_str(json.dumps({"flush": True}))
444
449
 
450
+ # consumes from decoder and generates events
451
+ @utils.log_exceptions(logger=logger)
452
+ async def generate_task():
453
+ emitter = tts.SynthesizedAudioEmitter(
454
+ event_ch=self._event_ch,
455
+ request_id=request_id,
456
+ segment_id=segment_id,
457
+ )
458
+ async for frame in decoder:
459
+ emitter.push(frame)
460
+ emitter.flush()
461
+
462
+ # receives from ws and decodes audio
463
+ @utils.log_exceptions(logger=logger)
445
464
  async def recv_task():
446
465
  nonlocal expected_text
447
466
  received_text = ""
448
- audio_bstream = utils.audio.AudioByteStream(
449
- sample_rate=self._opts.sample_rate,
450
- num_channels=1,
451
- )
452
- last_frame: rtc.AudioFrame | None = None
453
-
454
- def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
455
- nonlocal last_frame
456
- if last_frame is not None:
457
- self._event_ch.send_nowait(
458
- tts.SynthesizedAudio(
459
- request_id=request_id,
460
- segment_id=segment_id,
461
- frame=last_frame,
462
- is_final=is_final,
463
- )
464
- )
465
- last_frame = None
466
467
 
467
468
  while True:
468
469
  msg = await ws_conn.receive()
@@ -481,45 +482,36 @@ class SynthesizeStream(tts.SynthesizeStream):
481
482
  continue
482
483
 
483
484
  data = json.loads(msg.data)
484
- encoding = _encoding_from_format(self._opts.encoding)
485
485
  if data.get("audio"):
486
486
  b64data = base64.b64decode(data["audio"])
487
- if encoding == "mp3":
488
- for frame in self._mp3_decoder.decode_chunk(b64data):
489
- for frame in audio_bstream.write(frame.data.tobytes()):
490
- _send_last_frame(
491
- segment_id=segment_id, is_final=False
492
- )
493
- last_frame = frame
494
- else:
495
- for frame in audio_bstream.write(b64data):
496
- _send_last_frame(segment_id=segment_id, is_final=False)
497
- last_frame = frame
498
- elif data.get("isFinal"):
499
- for frame in audio_bstream.flush():
500
- _send_last_frame(segment_id=segment_id, is_final=False)
501
- last_frame = frame
502
- _send_last_frame(segment_id=segment_id, is_final=True)
503
- break
487
+ decoder.push(b64data)
488
+
489
+ if alignment := data.get("normalizedAlignment"):
490
+ received_text += "".join(
491
+ alignment.get("chars", [])
492
+ ).replace(" ", "")
493
+ if received_text == expected_text:
494
+ decoder.end_input()
495
+ break
504
496
  elif data.get("error"):
505
- logger.error("11labs reported an error: %s", data["error"])
497
+ raise APIStatusError(
498
+ message=data["error"],
499
+ status_code=500,
500
+ request_id=request_id,
501
+ body=None,
502
+ )
506
503
  else:
507
- logger.error("unexpected 11labs message %s", data)
508
-
509
- if alignment := data.get("normalizedAlignment"):
510
- received_text += "".join(alignment.get("chars", [])).replace(
511
- " ", ""
504
+ raise APIStatusError(
505
+ message=f"unexpected 11labs message {data}",
506
+ status_code=500,
507
+ request_id=request_id,
508
+ body=None,
512
509
  )
513
- if received_text == expected_text:
514
- for frame in audio_bstream.flush():
515
- _send_last_frame(segment_id=segment_id, is_final=False)
516
- last_frame = frame
517
- _send_last_frame(segment_id=segment_id, is_final=True)
518
- break
519
510
 
520
511
  tasks = [
521
512
  asyncio.create_task(send_task()),
522
513
  asyncio.create_task(recv_task()),
514
+ asyncio.create_task(generate_task()),
523
515
  ]
524
516
  try:
525
517
  await asyncio.gather(*tasks)
@@ -532,10 +524,13 @@ class SynthesizeStream(tts.SynthesizeStream):
532
524
  request_id=request_id,
533
525
  body=None,
534
526
  ) from e
527
+ except APIStatusError:
528
+ raise
535
529
  except Exception as e:
536
530
  raise APIConnectionError() from e
537
531
  finally:
538
532
  await utils.aio.gracefully_cancel(*tasks)
533
+ await decoder.aclose()
539
534
 
540
535
 
541
536
  def _dict_to_voices_list(data: dict[str, Any]):
@@ -561,11 +556,13 @@ def _synthesize_url(opts: _TTSOptions) -> str:
561
556
  voice_id = opts.voice.id
562
557
  model_id = opts.model
563
558
  output_format = opts.encoding
564
- latency = opts.streaming_latency
565
- return (
559
+ url = (
566
560
  f"{base_url}/text-to-speech/{voice_id}/stream?"
567
- f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
561
+ f"model_id={model_id}&output_format={output_format}"
568
562
  )
563
+ if opts.streaming_latency:
564
+ url += f"&optimize_streaming_latency={opts.streaming_latency}"
565
+ return url
569
566
 
570
567
 
571
568
  def _stream_url(opts: _TTSOptions) -> str:
@@ -573,14 +570,16 @@ def _stream_url(opts: _TTSOptions) -> str:
573
570
  voice_id = opts.voice.id
574
571
  model_id = opts.model
575
572
  output_format = opts.encoding
576
- latency = opts.streaming_latency
577
573
  enable_ssml = str(opts.enable_ssml_parsing).lower()
578
574
  language = opts.language
575
+ inactivity_timeout = opts.inactivity_timeout
579
576
  url = (
580
577
  f"{base_url}/text-to-speech/{voice_id}/stream-input?"
581
- f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}&"
582
- f"enable_ssml_parsing={enable_ssml}"
578
+ f"model_id={model_id}&output_format={output_format}&"
579
+ f"enable_ssml_parsing={enable_ssml}&inactivity_timeout={inactivity_timeout}"
583
580
  )
584
581
  if language is not None:
585
582
  url += f"&language_code={language}"
583
+ if opts.streaming_latency:
584
+ url += f"&optimize_streaming_latency={opts.streaming_latency}"
586
585
  return url
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.7.14"
15
+ __version__ = "0.8.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.7.14
3
+ Version: 0.8.0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit-agents[codecs]>=0.12.11
22
+ Requires-Dist: livekit-agents[codecs]<1.0.0,>=0.12.16
23
23
  Dynamic: classifier
24
24
  Dynamic: description
25
25
  Dynamic: description-content-type
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
6
+ livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
7
+ livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
8
+ livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
9
+ livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=_d8V_YLx1tuScKtmDipoKHhqF3y68lXg03phixEHU3M,21419
6
- livekit/plugins/elevenlabs/version.py,sha256=1Trenk6kp4J1gdS0z55hdro60GNOnD1s0F3-AoNr4VM,601
7
- livekit_plugins_elevenlabs-0.7.14.dist-info/METADATA,sha256=WGgcKpZb9PYymh1pNvF7B5dhLXUlQj3n0ALlwJmfYfE,1523
8
- livekit_plugins_elevenlabs-0.7.14.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
9
- livekit_plugins_elevenlabs-0.7.14.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.7.14.dist-info/RECORD,,