livekit-plugins-elevenlabs 0.7.13__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff covers publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
--- livekit/plugins/elevenlabs/models.py
+++ livekit/plugins/elevenlabs/models.py
@@ -10,14 +10,4 @@ TTSModels = Literal[
     "eleven_flash_v2",
 ]
 
-TTSEncoding = Literal[
-    "mp3_22050_32",
-    "mp3_44100_32",
-    "mp3_44100_64",
-    "mp3_44100_96",
-    "mp3_44100_128",
-    "mp3_44100_192",
-    "pcm_16000",
-    "pcm_22050",
-    "pcm_44100",
-]
+TTSEncoding = Literal["mp3_44100",]
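The plugin derives the output sample rate from the second field of this encoding string (see the `_sample_rate_from_format` change in tts.py below), so narrowing the Literal to `mp3_44100` pins the output at 44.1 kHz. A standalone sketch of that parsing, not imported from the plugin:

```python
from typing import Literal

TTSEncoding = Literal["mp3_44100",]


def sample_rate_from_format(output_format: TTSEncoding) -> int:
    # "mp3_44100" -> ["mp3", "44100"] -> 44100
    return int(output_format.split("_")[1])


assert sample_rate_from_format("mp3_44100") == 44100
```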
--- livekit/plugins/elevenlabs/tts.py
+++ livekit/plugins/elevenlabs/tts.py
@@ -19,13 +19,12 @@ import base64
 import dataclasses
 import json
 import os
+import weakref
 from dataclasses import dataclass
-from typing import Any, List, Literal
+from typing import Any, List, Optional
 
 import aiohttp
-from livekit import rtc
 from livekit.agents import (
-    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
     APIConnectOptions,
     APIStatusError,
@@ -38,28 +37,20 @@ from livekit.agents import (
 from .log import logger
 from .models import TTSEncoding, TTSModels
 
-_Encoding = Literal["mp3", "pcm"]
+_DefaultEncoding: TTSEncoding = "mp3_44100"
 
 
 def _sample_rate_from_format(output_format: TTSEncoding) -> int:
-    split = output_format.split("_")  # e.g: mp3_22050_32
+    split = output_format.split("_")  # e.g: mp3_44100
     return int(split[1])
 
 
-def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
-    if output_format.startswith("mp3"):
-        return "mp3"
-    elif output_format.startswith("pcm"):
-        return "pcm"
-
-    raise ValueError(f"Unknown format: {output_format}")
-
-
 @dataclass
 class VoiceSettings:
     stability: float  # [0.0 - 1.0]
     similarity_boost: float  # [0.0 - 1.0]
     style: float | None = None  # [0.0 - 1.0]
+    speed: float | None = 1.0  # [0.8 - 1.2]
     use_speaker_boost: bool | None = False
 
 
@@ -76,12 +67,17 @@ DEFAULT_VOICE = Voice(
     name="Bella",
     category="premade",
     settings=VoiceSettings(
-        stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
+        stability=0.71,
+        speed=1.0,
+        similarity_boost=0.5,
+        style=0.0,
+        use_speaker_boost=True,
     ),
 )
 
 API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
 AUTHORIZATION_HEADER = "xi-api-key"
+WS_INACTIVITY_TIMEOUT = 300
 
 
 @dataclass
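The new `speed` field sits alongside the existing voice settings. A hedged sketch of overriding it, assuming `Voice` and `VoiceSettings` are still re-exported from the package root as in earlier releases (the voice id below is a placeholder):

```python
from livekit.plugins.elevenlabs import Voice, VoiceSettings

# Placeholder voice id/name; only the field names and documented ranges
# come from the diff above.
custom_voice = Voice(
    id="your-voice-id",
    name="Bella",
    category="premade",
    settings=VoiceSettings(
        stability=0.71,
        speed=1.1,  # new in 0.8.0, documented range [0.8 - 1.2]
        similarity_boost=0.5,
        style=0.0,
        use_speaker_boost=True,
    ),
)
```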
@@ -97,6 +93,7 @@ class _TTSOptions:
     word_tokenizer: tokenize.WordTokenizer
     chunk_length_schedule: list[int]
     enable_ssml_parsing: bool
+    inactivity_timeout: int
 
 
 class TTS(tts.TTS):
@@ -107,11 +104,9 @@ class TTS(tts.TTS):
         model: TTSModels | str = "eleven_flash_v2_5",
         api_key: str | None = None,
         base_url: str | None = None,
-        encoding: TTSEncoding = "mp3_22050_32",
-        streaming_latency: int = 3,
-        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
-            ignore_punctuation=False  # punctuation can help for intonation
-        ),
+        streaming_latency: int = 0,
+        inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
+        word_tokenizer: Optional[tokenize.WordTokenizer] = None,
         enable_ssml_parsing: bool = False,
         chunk_length_schedule: list[int] = [80, 120, 200, 260],  # range is [50, 500]
         http_session: aiohttp.ClientSession | None = None,
@@ -127,8 +122,8 @@ class TTS(tts.TTS):
             model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
             api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
             base_url (str | None): Custom base URL for the API. Optional.
-            encoding (TTSEncoding): Audio encoding format. Defaults to "mp3_22050_32".
-            streaming_latency (int): Latency in seconds for streaming. Defaults to 3.
+            streaming_latency (int): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
+            inactivity_timeout (int): Inactivity timeout in seconds for the websocket connection. Defaults to 300.
             word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
             enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
             chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
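Since the output format is now pinned to `mp3_44100`, callers that passed `encoding=` (or relied on the old `streaming_latency=3` default) have to drop or adjust those arguments when upgrading. A hedged sketch of a 0.8.0-style construction; the environment variable is the one named in the docstring above:

```python
import os

from livekit.plugins import elevenlabs

# encoding= no longer exists in 0.8.0; output is always mp3_44100.
tts = elevenlabs.TTS(
    model="eleven_flash_v2_5",
    api_key=os.environ["ELEVEN_API_KEY"],
    inactivity_timeout=300,  # new: websocket inactivity timeout in seconds
    streaming_latency=0,     # deprecated upstream; 0 leaves latency optimization off
)
```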
@@ -140,7 +135,7 @@ class TTS(tts.TTS):
             capabilities=tts.TTSCapabilities(
                 streaming=True,
             ),
-            sample_rate=_sample_rate_from_format(encoding),
+            sample_rate=_sample_rate_from_format(_DefaultEncoding),
             num_channels=1,
         )
 
@@ -156,20 +151,46 @@ class TTS(tts.TTS):
                 "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
             )
 
+        if word_tokenizer is None:
+            word_tokenizer = tokenize.basic.WordTokenizer(
+                ignore_punctuation=False  # punctuation can help for intonation
+            )
+
         self._opts = _TTSOptions(
             voice=voice,
             model=model,
             api_key=api_key,
             base_url=base_url or API_BASE_URL_V1,
-            encoding=encoding,
+            encoding=_DefaultEncoding,
             sample_rate=self.sample_rate,
             streaming_latency=streaming_latency,
             word_tokenizer=word_tokenizer,
             chunk_length_schedule=chunk_length_schedule,
             enable_ssml_parsing=enable_ssml_parsing,
             language=language,
+            inactivity_timeout=inactivity_timeout,
         )
         self._session = http_session
+        self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
+            connect_cb=self._connect_ws,
+            close_cb=self._close_ws,
+            max_session_duration=inactivity_timeout,
+            mark_refreshed_on_get=True,
+        )
+        self._streams = weakref.WeakSet[SynthesizeStream]()
+
+    async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
+        session = self._ensure_session()
+        return await asyncio.wait_for(
+            session.ws_connect(
+                _stream_url(self._opts),
+                headers={AUTHORIZATION_HEADER: self._opts.api_key},
+            ),
+            self._conn_options.timeout,
+        )
+
+    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
+        await ws.close()
 
     def _ensure_session(self) -> aiohttp.ClientSession:
         if not self._session:
@@ -177,6 +198,9 @@ class TTS(tts.TTS):
 
         return self._session
 
+    def prewarm(self) -> None:
+        self._pool.prewarm()
+
     async def list_voices(self) -> List[Voice]:
         async with self._ensure_session().get(
             f"{self._opts.base_url}/voices",
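The new `prewarm()` simply asks the connection pool added above to open a websocket before the first synthesis request. A hedged usage sketch:

```python
from livekit.plugins import elevenlabs

tts = elevenlabs.TTS()  # assumes ELEVEN_API_KEY is set in the environment
tts.prewarm()           # optional: open a pooled websocket ahead of the first stream()
```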
@@ -205,7 +229,7 @@ class TTS(tts.TTS):
         self,
         text: str,
         *,
-        conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> "ChunkedStream":
         return ChunkedStream(
             tts=self,
@@ -216,14 +240,18 @@ class TTS(tts.TTS):
         )
 
     def stream(
-        self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
+        self, *, conn_options: Optional[APIConnectOptions] = None
     ) -> "SynthesizeStream":
-        return SynthesizeStream(
-            tts=self,
-            conn_options=conn_options,
-            opts=self._opts,
-            session=self._ensure_session(),
-        )
+        stream = SynthesizeStream(tts=self, pool=self._pool, opts=self._opts)
+        self._streams.add(stream)
+        return stream
+
+    async def aclose(self) -> None:
+        for stream in list(self._streams):
+            await stream.aclose()
+        self._streams.clear()
+        await self._pool.aclose()
+        await super().aclose()
 
 
 class ChunkedStream(tts.ChunkedStream):
@@ -235,20 +263,14 @@ class ChunkedStream(tts.ChunkedStream):
         tts: TTS,
         input_text: str,
         opts: _TTSOptions,
-        conn_options: APIConnectOptions,
+        conn_options: Optional[APIConnectOptions] = None,
         session: aiohttp.ClientSession,
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._session = opts, session
-        if _encoding_from_format(self._opts.encoding) == "mp3":
-            self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
 
     async def _run(self) -> None:
         request_id = utils.shortuuid()
-        bstream = utils.audio.AudioByteStream(
-            sample_rate=self._opts.sample_rate, num_channels=1
-        )
-
         voice_settings = (
             _strip_nones(dataclasses.asdict(self._opts.voice.settings))
             if self._opts.voice.settings
@@ -260,6 +282,12 @@ class ChunkedStream(tts.ChunkedStream):
             "voice_settings": voice_settings,
         }
 
+        decoder = utils.codecs.AudioStreamDecoder(
+            sample_rate=self._opts.sample_rate,
+            num_channels=1,
+        )
+
+        decode_task: asyncio.Task | None = None
         try:
             async with self._session.post(
                 _synthesize_url(self._opts),
@@ -271,32 +299,21 @@ class ChunkedStream(tts.ChunkedStream):
                     logger.error("11labs returned non-audio data: %s", content)
                     return
 
-                encoding = _encoding_from_format(self._opts.encoding)
-                if encoding == "mp3":
-                    async for bytes_data, _ in resp.content.iter_chunks():
-                        for frame in self._mp3_decoder.decode_chunk(bytes_data):
-                            for frame in bstream.write(frame.data.tobytes()):
-                                self._event_ch.send_nowait(
-                                    tts.SynthesizedAudio(
-                                        request_id=request_id,
-                                        frame=frame,
-                                    )
-                                )
-                else:
-                    async for bytes_data, _ in resp.content.iter_chunks():
-                        for frame in bstream.write(bytes_data):
-                            self._event_ch.send_nowait(
-                                tts.SynthesizedAudio(
-                                    request_id=request_id,
-                                    frame=frame,
-                                )
-                            )
-
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
-
+                async def _decode_loop():
+                    try:
+                        async for bytes_data, _ in resp.content.iter_chunks():
+                            decoder.push(bytes_data)
+                    finally:
+                        decoder.end_input()
+
+                decode_task = asyncio.create_task(_decode_loop())
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
         except asyncio.TimeoutError as e:
             raise APITimeoutError() from e
         except aiohttp.ClientResponseError as e:
@@ -308,6 +325,10 @@ class ChunkedStream(tts.ChunkedStream):
             ) from e
         except Exception as e:
             raise APIConnectionError() from e
+        finally:
+            if decode_task:
+                await utils.aio.gracefully_cancel(decode_task)
+            await decoder.aclose()
 
 
 class SynthesizeStream(tts.SynthesizeStream):
@@ -317,15 +338,14 @@ class SynthesizeStream(tts.SynthesizeStream):
         self,
         *,
         tts: TTS,
-        session: aiohttp.ClientSession,
-        conn_options: APIConnectOptions,
+        pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
         opts: _TTSOptions,
     ):
-        super().__init__(tts=tts, conn_options=conn_options)
-        self._opts, self._session = opts, session
-        self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
+        super().__init__(tts=tts)
+        self._opts, self._pool = opts, pool
 
     async def _run(self) -> None:
+        request_id = utils.shortuuid()
         self._segments_ch = utils.aio.Chan[tokenize.WordStream]()
 
         @utils.log_exceptions(logger=logger)
@@ -338,190 +358,179 @@ class SynthesizeStream(tts.SynthesizeStream):
                         # new segment (after flush for e.g)
                         word_stream = self._opts.word_tokenizer.stream()
                         self._segments_ch.send_nowait(word_stream)
-
                     word_stream.push_text(input)
                 elif isinstance(input, self._FlushSentinel):
                     if word_stream is not None:
                         word_stream.end_input()
-
                     word_stream = None
-
+            if word_stream is not None:
+                word_stream.end_input()
             self._segments_ch.close()
 
         @utils.log_exceptions(logger=logger)
-        async def _run():
+        async def _process_segments():
             async for word_stream in self._segments_ch:
-                await self._run_ws(word_stream)
+                await self._run_ws(word_stream, request_id)
 
         tasks = [
             asyncio.create_task(_tokenize_input()),
-            asyncio.create_task(_run()),
+            asyncio.create_task(_process_segments()),
         ]
         try:
             await asyncio.gather(*tasks)
+        except asyncio.TimeoutError as e:
+            raise APITimeoutError() from e
+        except aiohttp.ClientResponseError as e:
+            raise APIStatusError(
+                message=e.message,
+                status_code=e.status,
+                request_id=request_id,
+                body=None,
+            ) from e
+        except Exception as e:
+            raise APIConnectionError() from e
         finally:
             await utils.aio.gracefully_cancel(*tasks)
 
     async def _run_ws(
         self,
         word_stream: tokenize.WordStream,
-        max_retry: int = 3,
+        request_id: str,
     ) -> None:
-        ws_conn: aiohttp.ClientWebSocketResponse | None = None
-        for try_i in range(max_retry):
-            retry_delay = 5
-            try:
-                if try_i > 0:
-                    await asyncio.sleep(retry_delay)
+        async with self._pool.connection() as ws_conn:
+            segment_id = utils.shortuuid()
+            expected_text = ""  # accumulate all tokens sent
 
-                ws_conn = await self._session.ws_connect(
-                    _stream_url(self._opts),
-                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
-                )
-                break
-            except Exception as e:
-                logger.warning(
-                    f"failed to connect to 11labs, retrying in {retry_delay}s",
-                    exc_info=e,
-                )
-
-        if ws_conn is None:
-            raise Exception(f"failed to connect to 11labs after {max_retry} retries")
-
-        request_id = utils.shortuuid()
-        segment_id = utils.shortuuid()
-
-        # 11labs protocol expects the first message to be an "init msg"
-        init_pkt = dict(
-            text=" ",
-            try_trigger_generation=True,
-            voice_settings=_strip_nones(dataclasses.asdict(self._opts.voice.settings))
-            if self._opts.voice.settings
-            else None,
-            generation_config=dict(
-                chunk_length_schedule=self._opts.chunk_length_schedule
-            ),
-        )
-        await ws_conn.send_str(json.dumps(init_pkt))
-        eos_sent = False
-
-        async def send_task():
-            nonlocal eos_sent
-
-            xml_content = []
-            async for data in word_stream:
-                text = data.token
-
-                # send the xml phoneme in one go
-                if (
-                    self._opts.enable_ssml_parsing
-                    and data.token.startswith("<phoneme")
-                    or xml_content
-                ):
-                    xml_content.append(text)
-                    if data.token.find("</phoneme>") > -1:
-                        text = self._opts.word_tokenizer.format_words(xml_content)
-                        xml_content = []
-                    else:
-                        continue
-
-                # try_trigger_generation=True is a bad practice, we expose
-                # chunk_length_schedule instead
-                data_pkt = dict(
-                    text=f"{text} ",  # must always end with a space
-                    try_trigger_generation=False,
-                )
-                self._mark_started()
-                await ws_conn.send_str(json.dumps(data_pkt))
-
-            if xml_content:
-                logger.warning("11labs stream ended with incomplete xml content")
-
-            # no more token, mark eos
-            eos_pkt = dict(text="")
-            await ws_conn.send_str(json.dumps(eos_pkt))
-            eos_sent = True
-
-        async def recv_task():
-            nonlocal eos_sent
-            audio_bstream = utils.audio.AudioByteStream(
+            decoder = utils.codecs.AudioStreamDecoder(
                 sample_rate=self._opts.sample_rate,
                 num_channels=1,
             )
 
-            last_frame: rtc.AudioFrame | None = None
-
-            def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
-                nonlocal last_frame
-                if last_frame is not None:
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(
-                            request_id=request_id,
-                            segment_id=segment_id,
-                            frame=last_frame,
-                            is_final=is_final,
-                        )
-                    )
-
-                    last_frame = None
-
-            while True:
-                msg = await ws_conn.receive()
-                if msg.type in (
-                    aiohttp.WSMsgType.CLOSED,
-                    aiohttp.WSMsgType.CLOSE,
-                    aiohttp.WSMsgType.CLOSING,
-                ):
-                    if not eos_sent:
+            # 11labs protocol expects the first message to be an "init msg"
+            init_pkt = dict(
+                text=" ",
+                voice_settings=_strip_nones(
+                    dataclasses.asdict(self._opts.voice.settings)
+                )
+                if self._opts.voice.settings
+                else None,
+                generation_config=dict(
+                    chunk_length_schedule=self._opts.chunk_length_schedule
+                ),
+            )
+            await ws_conn.send_str(json.dumps(init_pkt))
+
+            @utils.log_exceptions(logger=logger)
+            async def send_task():
+                nonlocal expected_text
+                xml_content = []
+                async for data in word_stream:
+                    text = data.token
+                    expected_text += text
+                    # send the xml phoneme in one go
+                    if (
+                        self._opts.enable_ssml_parsing
+                        and data.token.startswith("<phoneme")
+                        or xml_content
+                    ):
+                        xml_content.append(text)
+                        if text.find("</phoneme>") > -1:
+                            text = self._opts.word_tokenizer.format_words(xml_content)
+                            xml_content = []
+                        else:
+                            continue
+
+                    data_pkt = dict(text=f"{text} ")  # must always end with a space
+                    self._mark_started()
+                    await ws_conn.send_str(json.dumps(data_pkt))
+                if xml_content:
+                    logger.warning("11labs stream ended with incomplete xml content")
+                await ws_conn.send_str(json.dumps({"flush": True}))
+
+            # consumes from decoder and generates events
+            @utils.log_exceptions(logger=logger)
+            async def generate_task():
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                    segment_id=segment_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+
+            # receives from ws and decodes audio
+            @utils.log_exceptions(logger=logger)
+            async def recv_task():
+                nonlocal expected_text
+                received_text = ""
+
+                while True:
+                    msg = await ws_conn.receive()
+                    if msg.type in (
+                        aiohttp.WSMsgType.CLOSED,
+                        aiohttp.WSMsgType.CLOSE,
+                        aiohttp.WSMsgType.CLOSING,
+                    ):
                         raise APIStatusError(
                             "11labs connection closed unexpectedly, not all tokens have been consumed",
                             request_id=request_id,
                         )
-                    return
 
-                if msg.type != aiohttp.WSMsgType.TEXT:
-                    logger.warning("unexpected 11labs message type %s", msg.type)
-                    continue
-
-                data = json.loads(msg.data)
-                encoding = _encoding_from_format(self._opts.encoding)
-                if data.get("audio"):
-                    b64data = base64.b64decode(data["audio"])
-                    if encoding == "mp3":
-                        for frame in self._mp3_decoder.decode_chunk(b64data):
-                            for frame in audio_bstream.write(frame.data.tobytes()):
-                                _send_last_frame(segment_id=segment_id, is_final=False)
-                                last_frame = frame
+                    if msg.type != aiohttp.WSMsgType.TEXT:
+                        logger.warning("unexpected 11labs message type %s", msg.type)
+                        continue
 
+                    data = json.loads(msg.data)
+                    if data.get("audio"):
+                        b64data = base64.b64decode(data["audio"])
+                        decoder.push(b64data)
+
+                        if alignment := data.get("normalizedAlignment"):
+                            received_text += "".join(
+                                alignment.get("chars", [])
+                            ).replace(" ", "")
+                            if received_text == expected_text:
+                                decoder.end_input()
+                                break
+                    elif data.get("error"):
+                        raise APIStatusError(
+                            message=data["error"],
+                            status_code=500,
+                            request_id=request_id,
+                            body=None,
+                        )
                     else:
-                        for frame in audio_bstream.write(b64data):
-                            _send_last_frame(segment_id=segment_id, is_final=False)
-                            last_frame = frame
-
-                elif data.get("isFinal"):
-                    for frame in audio_bstream.flush():
-                        _send_last_frame(segment_id=segment_id, is_final=False)
-                        last_frame = frame
-
-                    _send_last_frame(segment_id=segment_id, is_final=True)
-
-                    pass
-                elif data.get("error"):
-                    logger.error("11labs reported an error: %s", data["error"])
-                else:
-                    logger.error("unexpected 11labs message %s", data)
-
-        tasks = [
-            asyncio.create_task(send_task()),
-            asyncio.create_task(recv_task()),
-        ]
+                        raise APIStatusError(
+                            message=f"unexpected 11labs message {data}",
+                            status_code=500,
+                            request_id=request_id,
+                            body=None,
+                        )
 
-        try:
-            await asyncio.gather(*tasks)
-        finally:
-            await utils.aio.gracefully_cancel(*tasks)
-            if ws_conn is not None:
-                await ws_conn.close()
+            tasks = [
+                asyncio.create_task(send_task()),
+                asyncio.create_task(recv_task()),
+                asyncio.create_task(generate_task()),
+            ]
+            try:
+                await asyncio.gather(*tasks)
+            except asyncio.TimeoutError as e:
+                raise APITimeoutError() from e
+            except aiohttp.ClientResponseError as e:
+                raise APIStatusError(
+                    message=e.message,
+                    status_code=e.status,
+                    request_id=request_id,
+                    body=None,
+                ) from e
+            except APIStatusError:
+                raise
+            except Exception as e:
+                raise APIConnectionError() from e
+            finally:
+                await utils.aio.gracefully_cancel(*tasks)
+                await decoder.aclose()
 
 
 def _dict_to_voices_list(data: dict[str, Any]):
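The rewritten receive loop decides that a segment is finished by comparing the text it sent against the `normalizedAlignment` characters echoed back by ElevenLabs, with spaces stripped. A standalone illustration of that bookkeeping (the message payloads below are invented for the example; only the comparison mirrors the code above):

```python
# Tokens pushed by send_task(); expected_text accumulates them without spaces.
sent_tokens = ["Hello", "world"]
expected_text = "".join(sent_tokens)

# Invented stand-ins for websocket messages carrying normalizedAlignment.
alignment_messages = [
    {"normalizedAlignment": {"chars": ["H", "e", "l", "l", "o", " "]}},
    {"normalizedAlignment": {"chars": ["w", "o", "r", "l", "d"]}},
]

received_text = ""
for msg in alignment_messages:
    received_text += "".join(msg["normalizedAlignment"]["chars"]).replace(" ", "")
    if received_text == expected_text:
        print("all sent tokens accounted for; close the decoder input")
        break
```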
@@ -547,11 +556,13 @@ def _synthesize_url(opts: _TTSOptions) -> str:
     voice_id = opts.voice.id
     model_id = opts.model
     output_format = opts.encoding
-    latency = opts.streaming_latency
-    return (
+    url = (
         f"{base_url}/text-to-speech/{voice_id}/stream?"
-        f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
+        f"model_id={model_id}&output_format={output_format}"
     )
+    if opts.streaming_latency:
+        url += f"&optimize_streaming_latency={opts.streaming_latency}"
+    return url
 
 
 def _stream_url(opts: _TTSOptions) -> str:
@@ -559,14 +570,16 @@ def _stream_url(opts: _TTSOptions) -> str:
     voice_id = opts.voice.id
     model_id = opts.model
     output_format = opts.encoding
-    latency = opts.streaming_latency
     enable_ssml = str(opts.enable_ssml_parsing).lower()
     language = opts.language
+    inactivity_timeout = opts.inactivity_timeout
     url = (
         f"{base_url}/text-to-speech/{voice_id}/stream-input?"
-        f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}&"
-        f"enable_ssml_parsing={enable_ssml}"
+        f"model_id={model_id}&output_format={output_format}&"
+        f"enable_ssml_parsing={enable_ssml}&inactivity_timeout={inactivity_timeout}"
     )
     if language is not None:
         url += f"&language_code={language}"
+    if opts.streaming_latency:
+        url += f"&optimize_streaming_latency={opts.streaming_latency}"
     return url
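With these changes the stream-input URL always carries `inactivity_timeout` and only appends `optimize_streaming_latency` when it is non-zero. A rough sketch of the resulting URL for default options (placeholder voice id; query order follows the code above):

```python
base_url = "https://api.elevenlabs.io/v1"
voice_id = "your-voice-id"  # placeholder
model_id = "eleven_flash_v2_5"
streaming_latency = 0  # default in 0.8.0

url = (
    f"{base_url}/text-to-speech/{voice_id}/stream-input?"
    f"model_id={model_id}&output_format=mp3_44100&"
    f"enable_ssml_parsing=false&inactivity_timeout=300"
)
if streaming_latency:
    url += f"&optimize_streaming_latency={streaming_latency}"
print(url)
```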
--- livekit/plugins/elevenlabs/version.py
+++ livekit/plugins/elevenlabs/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.7.13"
+__version__ = "0.8.0"
--- livekit_plugins_elevenlabs-0.7.13.dist-info/METADATA
+++ livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-elevenlabs
-Version: 0.7.13
+Version: 0.8.0
 Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit-agents[codecs]>=0.12.11
+Requires-Dist: livekit-agents[codecs]<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
--- /dev/null
+++ livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD
@@ -0,0 +1,10 @@
+livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
+livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
+livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
+livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
+livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
+livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
+livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,
--- livekit_plugins_elevenlabs-0.7.13.dist-info/WHEEL
+++ livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.8.0)
+Generator: setuptools (75.8.2)
 Root-Is-Purelib: true
 Tag: py3-none-any
 
--- livekit_plugins_elevenlabs-0.7.13.dist-info/RECORD
+++ /dev/null
@@ -1,10 +0,0 @@
-livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
-livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
-livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
-livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/elevenlabs/tts.py,sha256=dFeEtnNYR0sIDjQZARvUb6cG3VUD4gUNU3hpbFGpyNo,19744
-livekit/plugins/elevenlabs/version.py,sha256=ePihhrwb0N1YVz4mZBMdwgECen0up-RbS8yrvmQGHt4,601
-livekit_plugins_elevenlabs-0.7.13.dist-info/METADATA,sha256=en67LviRFvRsErhZ5qvb8UvbmQc2nLcYSijszgdMj1Q,1523
-livekit_plugins_elevenlabs-0.7.13.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_elevenlabs-0.7.13.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_elevenlabs-0.7.13.dist-info/RECORD,,