livekit-plugins-elevenlabs 0.7.4__py3-none-any.whl → 0.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,3 +37,12 @@ class ElevenLabsPlugin(Plugin):
37
37
 
38
38
 
39
39
  Plugin.register_plugin(ElevenLabsPlugin())
40
+
41
+ # Cleanup docs of unexported modules
42
+ _module = dir()
43
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
44
+
45
+ __pdoc__ = {}
46
+
47
+ for n in NOT_IN_ALL:
48
+ __pdoc__[n] = False
@@ -24,7 +24,14 @@ from typing import Any, List, Literal
24
24
 
25
25
  import aiohttp
26
26
  from livekit import rtc
27
- from livekit.agents import tokenize, tts, utils
27
+ from livekit.agents import (
28
+ APIConnectionError,
29
+ APIStatusError,
30
+ APITimeoutError,
31
+ tokenize,
32
+ tts,
33
+ utils,
34
+ )
28
35
 
29
36
  from .log import logger
30
37
  from .models import TTSEncoding, TTSModels
@@ -79,7 +86,7 @@ AUTHORIZATION_HEADER = "xi-api-key"
79
86
  class _TTSOptions:
80
87
  api_key: str
81
88
  voice: Voice
82
- model_id: TTSModels
89
+ model: TTSModels | str
83
90
  base_url: str
84
91
  encoding: TTSEncoding
85
92
  sample_rate: int
@@ -94,7 +101,7 @@ class TTS(tts.TTS):
94
101
  self,
95
102
  *,
96
103
  voice: Voice = DEFAULT_VOICE,
97
- model_id: TTSModels = "eleven_turbo_v2_5",
104
+ model: TTSModels | str = "eleven_turbo_v2_5",
98
105
  api_key: str | None = None,
99
106
  base_url: str | None = None,
100
107
  encoding: TTSEncoding = "mp3_22050_32",
@@ -105,12 +112,23 @@ class TTS(tts.TTS):
105
112
  enable_ssml_parsing: bool = False,
106
113
  chunk_length_schedule: list[int] = [80, 120, 200, 260], # range is [50, 500]
107
114
  http_session: aiohttp.ClientSession | None = None,
115
+ # deprecated
116
+ model_id: TTSModels | str | None = None,
108
117
  ) -> None:
109
118
  """
110
119
  Create a new instance of ElevenLabs TTS.
111
120
 
112
- ``api_key`` must be set to your ElevenLabs API key, either using the argument or by setting
113
- the ``ELEVEN_API_KEY`` environmental variable.
121
+ Args:
122
+ voice (Voice): Voice configuration. Defaults to `DEFAULT_VOICE`.
123
+ model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
124
+ api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
125
+ base_url (str | None): Custom base URL for the API. Optional.
126
+ encoding (TTSEncoding): Audio encoding format. Defaults to "mp3_22050_32".
127
+ streaming_latency (int): Latency in seconds for streaming. Defaults to 3.
128
+ word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
129
+ enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
130
+ chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
131
+ http_session (aiohttp.ClientSession | None): Custom HTTP session for API requests. Optional.
114
132
  """
115
133
 
116
134
  super().__init__(
@@ -120,13 +138,22 @@ class TTS(tts.TTS):
120
138
  sample_rate=_sample_rate_from_format(encoding),
121
139
  num_channels=1,
122
140
  )
141
+
142
+ if model_id is not None:
143
+ logger.warning(
144
+ "model_id is deprecated and will be removed in 1.5.0, use model instead",
145
+ )
146
+ model = model_id
147
+
123
148
  api_key = api_key or os.environ.get("ELEVEN_API_KEY")
124
149
  if not api_key:
125
- raise ValueError("ELEVEN_API_KEY must be set")
150
+ raise ValueError(
151
+ "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
152
+ )
126
153
 
127
154
  self._opts = _TTSOptions(
128
155
  voice=voice,
129
- model_id=model_id,
156
+ model=model,
130
157
  api_key=api_key,
131
158
  base_url=base_url or API_BASE_URL_V1,
132
159
  encoding=encoding,
@@ -151,31 +178,43 @@ class TTS(tts.TTS):
151
178
  ) as resp:
152
179
  return _dict_to_voices_list(await resp.json())
153
180
 
181
+ def update_options(
182
+ self,
183
+ *,
184
+ voice: Voice = DEFAULT_VOICE,
185
+ model: TTSModels | str = "eleven_turbo_v2_5",
186
+ ) -> None:
187
+ """
188
+ Args:
189
+ voice (Voice): Voice configuration. Defaults to `DEFAULT_VOICE`.
190
+ model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
191
+ """
192
+ self._opts.model = model or self._opts.model
193
+ self._opts.voice = voice or self._opts.voice
194
+
154
195
  def synthesize(self, text: str) -> "ChunkedStream":
155
- return ChunkedStream(text, self._opts, self._ensure_session())
196
+ return ChunkedStream(self, text, self._opts, self._ensure_session())
156
197
 
157
198
  def stream(self) -> "SynthesizeStream":
158
- return SynthesizeStream(self._ensure_session(), self._opts)
199
+ return SynthesizeStream(self, self._ensure_session(), self._opts)
159
200
 
160
201
 
161
202
  class ChunkedStream(tts.ChunkedStream):
162
203
  """Synthesize using the chunked api endpoint"""
163
204
 
164
205
  def __init__(
165
- self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
206
+ self, tts: TTS, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
166
207
  ) -> None:
167
- super().__init__()
168
- self._text, self._opts, self._session = text, opts, session
208
+ super().__init__(tts, text)
209
+ self._opts, self._session = opts, session
169
210
  if _encoding_from_format(self._opts.encoding) == "mp3":
170
211
  self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
171
212
 
172
- @utils.log_exceptions(logger=logger)
173
213
  async def _main_task(self) -> None:
214
+ request_id = utils.shortuuid()
174
215
  bstream = utils.audio.AudioByteStream(
175
216
  sample_rate=self._opts.sample_rate, num_channels=1
176
217
  )
177
- request_id = utils.shortuuid()
178
- segment_id = utils.shortuuid()
179
218
 
180
219
  voice_settings = (
181
220
  _strip_nones(dataclasses.asdict(self._opts.voice.settings))
@@ -183,59 +222,71 @@ class ChunkedStream(tts.ChunkedStream):
183
222
  else None
184
223
  )
185
224
  data = {
186
- "text": self._text,
187
- "model_id": self._opts.model_id,
225
+ "text": self._input_text,
226
+ "model_id": self._opts.model,
188
227
  "voice_settings": voice_settings,
189
228
  }
190
229
 
191
- async with self._session.post(
192
- _synthesize_url(self._opts),
193
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
194
- json=data,
195
- ) as resp:
196
- if not resp.content_type.startswith("audio/"):
197
- content = await resp.text()
198
- logger.error("11labs returned non-audio data: %s", content)
199
- return
200
- encoding = _encoding_from_format(self._opts.encoding)
201
- if encoding == "mp3":
202
- async for bytes_data, _ in resp.content.iter_chunks():
203
- for frame in self._mp3_decoder.decode_chunk(bytes_data):
204
- self._event_ch.send_nowait(
205
- tts.SynthesizedAudio(
206
- request_id=request_id,
207
- segment_id=segment_id,
208
- frame=frame,
209
- )
210
- )
211
- else:
212
- async for bytes_data, _ in resp.content.iter_chunks():
213
- for frame in bstream.write(bytes_data):
214
- self._event_ch.send_nowait(
215
- tts.SynthesizedAudio(
216
- request_id=request_id,
217
- segment_id=segment_id,
218
- frame=frame,
230
+ try:
231
+ async with self._session.post(
232
+ _synthesize_url(self._opts),
233
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
234
+ json=data,
235
+ ) as resp:
236
+ if not resp.content_type.startswith("audio/"):
237
+ content = await resp.text()
238
+ logger.error("11labs returned non-audio data: %s", content)
239
+ return
240
+
241
+ encoding = _encoding_from_format(self._opts.encoding)
242
+ if encoding == "mp3":
243
+ async for bytes_data, _ in resp.content.iter_chunks():
244
+ for frame in self._mp3_decoder.decode_chunk(bytes_data):
245
+ for frame in bstream.write(frame.data.tobytes()):
246
+ self._event_ch.send_nowait(
247
+ tts.SynthesizedAudio(
248
+ request_id=request_id,
249
+ frame=frame,
250
+ )
251
+ )
252
+ else:
253
+ async for bytes_data, _ in resp.content.iter_chunks():
254
+ for frame in bstream.write(bytes_data):
255
+ self._event_ch.send_nowait(
256
+ tts.SynthesizedAudio(
257
+ request_id=request_id,
258
+ frame=frame,
259
+ )
219
260
  )
220
- )
221
261
 
222
262
  for frame in bstream.flush():
223
263
  self._event_ch.send_nowait(
224
- tts.SynthesizedAudio(
225
- request_id=request_id, segment_id=segment_id, frame=frame
226
- )
264
+ tts.SynthesizedAudio(request_id=request_id, frame=frame)
227
265
  )
228
266
 
267
+ except asyncio.TimeoutError as e:
268
+ raise APITimeoutError() from e
269
+ except aiohttp.ClientResponseError as e:
270
+ raise APIStatusError(
271
+ message=e.message,
272
+ status_code=e.status,
273
+ request_id=None,
274
+ body=None,
275
+ ) from e
276
+ except Exception as e:
277
+ raise APIConnectionError() from e
278
+
229
279
 
230
280
  class SynthesizeStream(tts.SynthesizeStream):
231
281
  """Streamed API using websockets"""
232
282
 
233
283
  def __init__(
234
284
  self,
285
+ tts: TTS,
235
286
  session: aiohttp.ClientSession,
236
287
  opts: _TTSOptions,
237
288
  ):
238
- super().__init__()
289
+ super().__init__(tts)
239
290
  self._opts, self._session = opts, session
240
291
  self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
241
292
 
@@ -323,15 +374,34 @@ class SynthesizeStream(tts.SynthesizeStream):
323
374
  async def send_task():
324
375
  nonlocal eos_sent
325
376
 
377
+ xml_content = []
326
378
  async for data in word_stream:
379
+ text = data.token
380
+
381
+ # send the xml phoneme in one go
382
+ if (
383
+ self._opts.enable_ssml_parsing
384
+ and data.token.startswith("<phoneme")
385
+ or xml_content
386
+ ):
387
+ xml_content.append(text)
388
+ if data.token.find("</phoneme>") > -1:
389
+ text = self._opts.word_tokenizer.format_words(xml_content)
390
+ xml_content = []
391
+ else:
392
+ continue
393
+
327
394
  # try_trigger_generation=True is a bad practice, we expose
328
395
  # chunk_length_schedule instead
329
396
  data_pkt = dict(
330
- text=f"{data.token} ", # must always end with a space
397
+ text=f"{text} ", # must always end with a space
331
398
  try_trigger_generation=False,
332
399
  )
333
400
  await ws_conn.send_str(json.dumps(data_pkt))
334
401
 
402
+ if xml_content:
403
+ logger.warning("11labs stream ended with incomplete xml content")
404
+
335
405
  # no more token, mark eos
336
406
  eos_pkt = dict(text="")
337
407
  await ws_conn.send_str(json.dumps(eos_pkt))
@@ -339,6 +409,26 @@ class SynthesizeStream(tts.SynthesizeStream):
339
409
 
340
410
  async def recv_task():
341
411
  nonlocal eos_sent
412
+ audio_bstream = utils.audio.AudioByteStream(
413
+ sample_rate=self._opts.sample_rate,
414
+ num_channels=1,
415
+ )
416
+
417
+ last_frame: rtc.AudioFrame | None = None
418
+
419
+ def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
420
+ nonlocal last_frame
421
+ if last_frame is not None:
422
+ self._event_ch.send_nowait(
423
+ tts.SynthesizedAudio(
424
+ request_id=request_id,
425
+ segment_id=segment_id,
426
+ frame=last_frame,
427
+ is_final=is_final,
428
+ )
429
+ )
430
+
431
+ last_frame = None
342
432
 
343
433
  while True:
344
434
  msg = await ws_conn.receive()
@@ -357,11 +447,33 @@ class SynthesizeStream(tts.SynthesizeStream):
357
447
  logger.warning("unexpected 11labs message type %s", msg.type)
358
448
  continue
359
449
 
360
- self._process_stream_event(
361
- data=json.loads(msg.data),
362
- request_id=request_id,
363
- segment_id=segment_id,
364
- )
450
+ data = json.loads(msg.data)
451
+ encoding = _encoding_from_format(self._opts.encoding)
452
+ if data.get("audio"):
453
+ b64data = base64.b64decode(data["audio"])
454
+ if encoding == "mp3":
455
+ for frame in self._mp3_decoder.decode_chunk(b64data):
456
+ for frame in audio_bstream.write(frame.data.tobytes()):
457
+ _send_last_frame(segment_id=segment_id, is_final=False)
458
+ last_frame = frame
459
+
460
+ else:
461
+ for frame in audio_bstream.write(b64data):
462
+ _send_last_frame(segment_id=segment_id, is_final=False)
463
+ last_frame = frame
464
+
465
+ elif data.get("isFinal"):
466
+ for frame in audio_bstream.flush():
467
+ _send_last_frame(segment_id=segment_id, is_final=False)
468
+ last_frame = frame
469
+
470
+ _send_last_frame(segment_id=segment_id, is_final=True)
471
+
472
+ pass
473
+ elif data.get("error"):
474
+ logger.error("11labs reported an error: %s", data["error"])
475
+ else:
476
+ logger.error("unexpected 11labs message %s", data)
365
477
 
366
478
  tasks = [
367
479
  asyncio.create_task(send_task()),
@@ -373,40 +485,6 @@ class SynthesizeStream(tts.SynthesizeStream):
373
485
  finally:
374
486
  await utils.aio.gracefully_cancel(*tasks)
375
487
 
376
- def _process_stream_event(
377
- self, *, data: dict, request_id: str, segment_id: str
378
- ) -> None:
379
- encoding = _encoding_from_format(self._opts.encoding)
380
- if data.get("audio"):
381
- b64data = base64.b64decode(data["audio"])
382
- if encoding == "mp3":
383
- for frame in self._mp3_decoder.decode_chunk(b64data):
384
- self._event_ch.send_nowait(
385
- tts.SynthesizedAudio(
386
- request_id=request_id,
387
- segment_id=segment_id,
388
- frame=frame,
389
- )
390
- )
391
- else:
392
- chunk_frame = rtc.AudioFrame(
393
- data=b64data,
394
- sample_rate=self._opts.sample_rate,
395
- num_channels=1,
396
- samples_per_channel=len(b64data) // 2,
397
- )
398
- self._event_ch.send_nowait(
399
- tts.SynthesizedAudio(
400
- request_id=request_id,
401
- segment_id=segment_id,
402
- frame=chunk_frame,
403
- )
404
- )
405
- elif data.get("error"):
406
- logger.error("11labs reported an error: %s", data["error"])
407
- elif not data.get("isFinal"):
408
- logger.error("unexpected 11labs message %s", data)
409
-
410
488
 
411
489
  def _dict_to_voices_list(data: dict[str, Any]):
412
490
  voices: List[Voice] = []
@@ -429,7 +507,7 @@ def _strip_nones(data: dict[str, Any]):
429
507
  def _synthesize_url(opts: _TTSOptions) -> str:
430
508
  base_url = opts.base_url
431
509
  voice_id = opts.voice.id
432
- model_id = opts.model_id
510
+ model_id = opts.model
433
511
  output_format = opts.encoding
434
512
  latency = opts.streaming_latency
435
513
  return (
@@ -441,7 +519,7 @@ def _synthesize_url(opts: _TTSOptions) -> str:
441
519
  def _stream_url(opts: _TTSOptions) -> str:
442
520
  base_url = opts.base_url
443
521
  voice_id = opts.voice.id
444
- model_id = opts.model_id
522
+ model_id = opts.model
445
523
  output_format = opts.encoding
446
524
  latency = opts.streaming_latency
447
525
  enable_ssml = str(opts.enable_ssml_parsing).lower()
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.7.4"
15
+ __version__ = "0.7.6"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.7.4
3
+ Version: 0.7.6
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit-agents[codecs] >=0.8.0.dev0
22
+ Requires-Dist: livekit-agents[codecs] >=0.11
23
23
 
24
24
  # LiveKit Plugins Elevenlabs
25
25
 
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=ddBUlDT4707f64WDJASR0B60X0yQ-LRHK1ZpTuBJXK8,387
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=mxdypC-zSbS1R15FmztT49ssk_arkKGUPe_d5uVqOUk,18422
6
+ livekit/plugins/elevenlabs/version.py,sha256=vOFNGWowZUhIrmyHBGtCx5dGhCp1T2FPt0h7KU_XKJg,600
7
+ livekit_plugins_elevenlabs-0.7.6.dist-info/METADATA,sha256=DY1JbHdgfNivv0p0xA5ZRenYUGEYC33yX4TcNh__srg,1305
8
+ livekit_plugins_elevenlabs-0.7.6.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
9
+ livekit_plugins_elevenlabs-0.7.6.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.7.6.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (74.1.2)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=cYRVIPXkRvB3-jK9bKZ9rYiMBACytWlCSq6yoZXaSgA,1080
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=ddBUlDT4707f64WDJASR0B60X0yQ-LRHK1ZpTuBJXK8,387
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=D0NXdOU94vFkYznWkx-tDRi8eBBSj-uMtv-E6s4abds,15099
6
- livekit/plugins/elevenlabs/version.py,sha256=UblqPqnLJ1iqLGElaqb_uNunR14phGN2btPpCGRFrYk,600
7
- livekit_plugins_elevenlabs-0.7.4.dist-info/METADATA,sha256=v69Po6l03UZl89vzcNhM3Wd2yezLTHjcN1q5I-sbfgg,1311
8
- livekit_plugins_elevenlabs-0.7.4.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
9
- livekit_plugins_elevenlabs-0.7.4.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.7.4.dist-info/RECORD,,