livekit-plugins-elevenlabs 0.7.5__py3-none-any.whl → 0.7.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,3 +37,12 @@ class ElevenLabsPlugin(Plugin):
37
37
 
38
38
 
39
39
  Plugin.register_plugin(ElevenLabsPlugin())
40
+
41
+ # Cleanup docs of unexported modules
42
+ _module = dir()
43
+ NOT_IN_ALL = [m for m in _module if m not in __all__]
44
+
45
+ __pdoc__ = {}
46
+
47
+ for n in NOT_IN_ALL:
48
+ __pdoc__[n] = False
@@ -24,7 +24,14 @@ from typing import Any, List, Literal
24
24
 
25
25
  import aiohttp
26
26
  from livekit import rtc
27
- from livekit.agents import tokenize, tts, utils
27
+ from livekit.agents import (
28
+ APIConnectionError,
29
+ APIStatusError,
30
+ APITimeoutError,
31
+ tokenize,
32
+ tts,
33
+ utils,
34
+ )
28
35
 
29
36
  from .log import logger
30
37
  from .models import TTSEncoding, TTSModels
@@ -79,7 +86,8 @@ AUTHORIZATION_HEADER = "xi-api-key"
79
86
  class _TTSOptions:
80
87
  api_key: str
81
88
  voice: Voice
82
- model_id: TTSModels
89
+ model: TTSModels | str
90
+ language: str | None
83
91
  base_url: str
84
92
  encoding: TTSEncoding
85
93
  sample_rate: int
@@ -94,7 +102,7 @@ class TTS(tts.TTS):
94
102
  self,
95
103
  *,
96
104
  voice: Voice = DEFAULT_VOICE,
97
- model_id: TTSModels = "eleven_turbo_v2_5",
105
+ model: TTSModels | str = "eleven_turbo_v2_5",
98
106
  api_key: str | None = None,
99
107
  base_url: str | None = None,
100
108
  encoding: TTSEncoding = "mp3_22050_32",
@@ -105,12 +113,25 @@ class TTS(tts.TTS):
105
113
  enable_ssml_parsing: bool = False,
106
114
  chunk_length_schedule: list[int] = [80, 120, 200, 260], # range is [50, 500]
107
115
  http_session: aiohttp.ClientSession | None = None,
116
+ # deprecated: model_id only (use model instead); language below is a regular option
117
+ model_id: TTSModels | str | None = None,
118
+ language: str | None = None,
108
119
  ) -> None:
109
120
  """
110
121
  Create a new instance of ElevenLabs TTS.
111
122
 
112
- ``api_key`` must be set to your ElevenLabs API key, either using the argument or by setting
113
- the ``ELEVEN_API_KEY`` environmental variable.
123
+ Args:
124
+ voice (Voice): Voice configuration. Defaults to `DEFAULT_VOICE`.
125
+ model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
126
+ api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
127
+ base_url (str | None): Custom base URL for the API. Optional.
128
+ encoding (TTSEncoding): Audio encoding format. Defaults to "mp3_22050_32".
129
+ streaming_latency (int): Latency optimization level sent to the API as `optimize_streaming_latency` (not a duration in seconds). Defaults to 3.
130
+ word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
131
+ enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
132
+ chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
133
+ http_session (aiohttp.ClientSession | None): Custom HTTP session for API requests. Optional.
134
+ language (str | None): Language code for the TTS model, as of 10/24/24 only valid for "eleven_turbo_v2_5". Optional.
114
135
  """
115
136
 
116
137
  super().__init__(
@@ -120,13 +141,22 @@ class TTS(tts.TTS):
120
141
  sample_rate=_sample_rate_from_format(encoding),
121
142
  num_channels=1,
122
143
  )
144
+
145
+ if model_id is not None:
146
+ logger.warning(
147
+ "model_id is deprecated and will be removed in 1.5.0, use model instead",
148
+ )
149
+ model = model_id
150
+
123
151
  api_key = api_key or os.environ.get("ELEVEN_API_KEY")
124
152
  if not api_key:
125
- raise ValueError("ELEVEN_API_KEY must be set")
153
+ raise ValueError(
154
+ "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
155
+ )
126
156
 
127
157
  self._opts = _TTSOptions(
128
158
  voice=voice,
129
- model_id=model_id,
159
+ model=model,
130
160
  api_key=api_key,
131
161
  base_url=base_url or API_BASE_URL_V1,
132
162
  encoding=encoding,
@@ -135,6 +165,7 @@ class TTS(tts.TTS):
135
165
  word_tokenizer=word_tokenizer,
136
166
  chunk_length_schedule=chunk_length_schedule,
137
167
  enable_ssml_parsing=enable_ssml_parsing,
168
+ language=language,
138
169
  )
139
170
  self._session = http_session
140
171
 
@@ -151,31 +182,43 @@ class TTS(tts.TTS):
151
182
  ) as resp:
152
183
  return _dict_to_voices_list(await resp.json())
153
184
 
185
+ def update_options(
186
+ self,
187
+ *,
188
+ voice: Voice = DEFAULT_VOICE,
189
+ model: TTSModels | str = "eleven_turbo_v2_5",
190
+ ) -> None:
191
+ """
192
+ Args:
193
+ voice (Voice): Voice configuration. Defaults to `DEFAULT_VOICE`.
194
+ model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
195
+ """
196
+ self._opts.model = model or self._opts.model
197
+ self._opts.voice = voice or self._opts.voice
198
+
154
199
  def synthesize(self, text: str) -> "ChunkedStream":
155
- return ChunkedStream(text, self._opts, self._ensure_session())
200
+ return ChunkedStream(self, text, self._opts, self._ensure_session())
156
201
 
157
202
  def stream(self) -> "SynthesizeStream":
158
- return SynthesizeStream(self._ensure_session(), self._opts)
203
+ return SynthesizeStream(self, self._ensure_session(), self._opts)
159
204
 
160
205
 
161
206
  class ChunkedStream(tts.ChunkedStream):
162
207
  """Synthesize using the chunked api endpoint"""
163
208
 
164
209
  def __init__(
165
- self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
210
+ self, tts: TTS, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
166
211
  ) -> None:
167
- super().__init__()
168
- self._text, self._opts, self._session = text, opts, session
212
+ super().__init__(tts, text)
213
+ self._opts, self._session = opts, session
169
214
  if _encoding_from_format(self._opts.encoding) == "mp3":
170
215
  self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
171
216
 
172
- @utils.log_exceptions(logger=logger)
173
217
  async def _main_task(self) -> None:
218
+ request_id = utils.shortuuid()
174
219
  bstream = utils.audio.AudioByteStream(
175
220
  sample_rate=self._opts.sample_rate, num_channels=1
176
221
  )
177
- request_id = utils.shortuuid()
178
- segment_id = utils.shortuuid()
179
222
 
180
223
  voice_settings = (
181
224
  _strip_nones(dataclasses.asdict(self._opts.voice.settings))
@@ -183,50 +226,59 @@ class ChunkedStream(tts.ChunkedStream):
183
226
  else None
184
227
  )
185
228
  data = {
186
- "text": self._text,
187
- "model_id": self._opts.model_id,
229
+ "text": self._input_text,
230
+ "model_id": self._opts.model,
188
231
  "voice_settings": voice_settings,
189
232
  }
190
233
 
191
- async with self._session.post(
192
- _synthesize_url(self._opts),
193
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
194
- json=data,
195
- ) as resp:
196
- if not resp.content_type.startswith("audio/"):
197
- content = await resp.text()
198
- logger.error("11labs returned non-audio data: %s", content)
199
- return
200
-
201
- encoding = _encoding_from_format(self._opts.encoding)
202
- if encoding == "mp3":
203
- async for bytes_data, _ in resp.content.iter_chunks():
204
- for frame in self._mp3_decoder.decode_chunk(bytes_data):
205
- for frame in bstream.write(frame.data.tobytes()):
234
+ try:
235
+ async with self._session.post(
236
+ _synthesize_url(self._opts),
237
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
238
+ json=data,
239
+ ) as resp:
240
+ if not resp.content_type.startswith("audio/"):
241
+ content = await resp.text()
242
+ logger.error("11labs returned non-audio data: %s", content)
243
+ return
244
+
245
+ encoding = _encoding_from_format(self._opts.encoding)
246
+ if encoding == "mp3":
247
+ async for bytes_data, _ in resp.content.iter_chunks():
248
+ for frame in self._mp3_decoder.decode_chunk(bytes_data):
249
+ for frame in bstream.write(frame.data.tobytes()):
250
+ self._event_ch.send_nowait(
251
+ tts.SynthesizedAudio(
252
+ request_id=request_id,
253
+ frame=frame,
254
+ )
255
+ )
256
+ else:
257
+ async for bytes_data, _ in resp.content.iter_chunks():
258
+ for frame in bstream.write(bytes_data):
206
259
  self._event_ch.send_nowait(
207
260
  tts.SynthesizedAudio(
208
261
  request_id=request_id,
209
- segment_id=segment_id,
210
262
  frame=frame,
211
263
  )
212
264
  )
213
- else:
214
- async for bytes_data, _ in resp.content.iter_chunks():
215
- for frame in bstream.write(bytes_data):
216
- self._event_ch.send_nowait(
217
- tts.SynthesizedAudio(
218
- request_id=request_id,
219
- segment_id=segment_id,
220
- frame=frame,
221
- )
222
- )
223
265
 
224
- for frame in bstream.flush():
225
- self._event_ch.send_nowait(
226
- tts.SynthesizedAudio(
227
- request_id=request_id, segment_id=segment_id, frame=frame
266
+ for frame in bstream.flush():
267
+ self._event_ch.send_nowait(
268
+ tts.SynthesizedAudio(request_id=request_id, frame=frame)
228
269
  )
229
- )
270
+
271
+ except asyncio.TimeoutError as e:
272
+ raise APITimeoutError() from e
273
+ except aiohttp.ClientResponseError as e:
274
+ raise APIStatusError(
275
+ message=e.message,
276
+ status_code=e.status,
277
+ request_id=None,
278
+ body=None,
279
+ ) from e
280
+ except Exception as e:
281
+ raise APIConnectionError() from e
230
282
 
231
283
 
232
284
  class SynthesizeStream(tts.SynthesizeStream):
@@ -234,10 +286,11 @@ class SynthesizeStream(tts.SynthesizeStream):
234
286
 
235
287
  def __init__(
236
288
  self,
289
+ tts: TTS,
237
290
  session: aiohttp.ClientSession,
238
291
  opts: _TTSOptions,
239
292
  ):
240
- super().__init__()
293
+ super().__init__(tts)
241
294
  self._opts, self._session = opts, session
242
295
  self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
243
296
 
@@ -360,6 +413,26 @@ class SynthesizeStream(tts.SynthesizeStream):
360
413
 
361
414
  async def recv_task():
362
415
  nonlocal eos_sent
416
+ audio_bstream = utils.audio.AudioByteStream(
417
+ sample_rate=self._opts.sample_rate,
418
+ num_channels=1,
419
+ )
420
+
421
+ last_frame: rtc.AudioFrame | None = None
422
+
423
+ def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
424
+ nonlocal last_frame
425
+ if last_frame is not None:
426
+ self._event_ch.send_nowait(
427
+ tts.SynthesizedAudio(
428
+ request_id=request_id,
429
+ segment_id=segment_id,
430
+ frame=last_frame,
431
+ is_final=is_final,
432
+ )
433
+ )
434
+
435
+ last_frame = None
363
436
 
364
437
  while True:
365
438
  msg = await ws_conn.receive()
@@ -378,11 +451,33 @@ class SynthesizeStream(tts.SynthesizeStream):
378
451
  logger.warning("unexpected 11labs message type %s", msg.type)
379
452
  continue
380
453
 
381
- self._process_stream_event(
382
- data=json.loads(msg.data),
383
- request_id=request_id,
384
- segment_id=segment_id,
385
- )
454
+ data = json.loads(msg.data)
455
+ encoding = _encoding_from_format(self._opts.encoding)
456
+ if data.get("audio"):
457
+ b64data = base64.b64decode(data["audio"])
458
+ if encoding == "mp3":
459
+ for frame in self._mp3_decoder.decode_chunk(b64data):
460
+ for frame in audio_bstream.write(frame.data.tobytes()):
461
+ _send_last_frame(segment_id=segment_id, is_final=False)
462
+ last_frame = frame
463
+
464
+ else:
465
+ for frame in audio_bstream.write(b64data):
466
+ _send_last_frame(segment_id=segment_id, is_final=False)
467
+ last_frame = frame
468
+
469
+ elif data.get("isFinal"):
470
+ for frame in audio_bstream.flush():
471
+ _send_last_frame(segment_id=segment_id, is_final=False)
472
+ last_frame = frame
473
+
474
+ _send_last_frame(segment_id=segment_id, is_final=True)
475
+
476
+ pass
477
+ elif data.get("error"):
478
+ logger.error("11labs reported an error: %s", data["error"])
479
+ else:
480
+ logger.error("unexpected 11labs message %s", data)
386
481
 
387
482
  tasks = [
388
483
  asyncio.create_task(send_task()),
@@ -394,40 +489,6 @@ class SynthesizeStream(tts.SynthesizeStream):
394
489
  finally:
395
490
  await utils.aio.gracefully_cancel(*tasks)
396
491
 
397
- def _process_stream_event(
398
- self, *, data: dict, request_id: str, segment_id: str
399
- ) -> None:
400
- encoding = _encoding_from_format(self._opts.encoding)
401
- if data.get("audio"):
402
- b64data = base64.b64decode(data["audio"])
403
- if encoding == "mp3":
404
- for frame in self._mp3_decoder.decode_chunk(b64data):
405
- self._event_ch.send_nowait(
406
- tts.SynthesizedAudio(
407
- request_id=request_id,
408
- segment_id=segment_id,
409
- frame=frame,
410
- )
411
- )
412
- else:
413
- chunk_frame = rtc.AudioFrame(
414
- data=b64data,
415
- sample_rate=self._opts.sample_rate,
416
- num_channels=1,
417
- samples_per_channel=len(b64data) // 2,
418
- )
419
- self._event_ch.send_nowait(
420
- tts.SynthesizedAudio(
421
- request_id=request_id,
422
- segment_id=segment_id,
423
- frame=chunk_frame,
424
- )
425
- )
426
- elif data.get("error"):
427
- logger.error("11labs reported an error: %s", data["error"])
428
- elif not data.get("isFinal"):
429
- logger.error("unexpected 11labs message %s", data)
430
-
431
492
 
432
493
  def _dict_to_voices_list(data: dict[str, Any]):
433
494
  voices: List[Voice] = []
@@ -450,7 +511,7 @@ def _strip_nones(data: dict[str, Any]):
450
511
  def _synthesize_url(opts: _TTSOptions) -> str:
451
512
  base_url = opts.base_url
452
513
  voice_id = opts.voice.id
453
- model_id = opts.model_id
514
+ model_id = opts.model
454
515
  output_format = opts.encoding
455
516
  latency = opts.streaming_latency
456
517
  return (
@@ -462,12 +523,16 @@ def _synthesize_url(opts: _TTSOptions) -> str:
462
523
  def _stream_url(opts: _TTSOptions) -> str:
463
524
  base_url = opts.base_url
464
525
  voice_id = opts.voice.id
465
- model_id = opts.model_id
526
+ model_id = opts.model
466
527
  output_format = opts.encoding
467
528
  latency = opts.streaming_latency
468
529
  enable_ssml = str(opts.enable_ssml_parsing).lower()
469
- return (
530
+ language = opts.language
531
+ url = (
470
532
  f"{base_url}/text-to-speech/{voice_id}/stream-input?"
471
533
  f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}&"
472
534
  f"enable_ssml_parsing={enable_ssml}"
473
535
  )
536
+ if language is not None:
537
+ url += f"&language_code={language}"
538
+ return url
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.7.5"
15
+ __version__ = "0.7.7"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.7.5
3
+ Version: 0.7.7
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit-agents[codecs] >=0.8.0.dev0
22
+ Requires-Dist: livekit-agents[codecs] >=0.11
23
23
 
24
24
  # LiveKit Plugins Elevenlabs
25
25
 
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=ddBUlDT4707f64WDJASR0B60X0yQ-LRHK1ZpTuBJXK8,387
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=GgpXXBumLW2r1vKGZ_k-k8rYCQJVahioPMr2aJeSWwk,18760
6
+ livekit/plugins/elevenlabs/version.py,sha256=78n--2R9Gwuh35Oy92hkYHXCMK_Er2s6VCfDuPQa2Ic,600
7
+ livekit_plugins_elevenlabs-0.7.7.dist-info/METADATA,sha256=nTXxc7ODYH7VljmXYPAeNUjMRTE20XB7fBl0micpQQ4,1305
8
+ livekit_plugins_elevenlabs-0.7.7.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
9
+ livekit_plugins_elevenlabs-0.7.7.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.7.7.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.1.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=cYRVIPXkRvB3-jK9bKZ9rYiMBACytWlCSq6yoZXaSgA,1080
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=ddBUlDT4707f64WDJASR0B60X0yQ-LRHK1ZpTuBJXK8,387
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=L9d4KppfqP9tP-PvaE3YKbezovhSboejmIk97xOmdEA,15868
6
- livekit/plugins/elevenlabs/version.py,sha256=4VoyPg1xoLZO0SP38sbtfe-ePEx82VqZVWRBBUr1wgA,600
7
- livekit_plugins_elevenlabs-0.7.5.dist-info/METADATA,sha256=KMqAU3UsRzO4wFl-Y8GfT5-Bb7s_bnm8JmuETbQ2cJo,1311
8
- livekit_plugins_elevenlabs-0.7.5.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
9
- livekit_plugins_elevenlabs-0.7.5.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.7.5.dist-info/RECORD,,