livekit-plugins-elevenlabs 0.8.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,7 +13,7 @@
13
13
  # limitations under the License.
14
14
 
15
15
  from .models import TTSEncoding, TTSModels
16
- from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
16
+ from .tts import DEFAULT_VOICE_ID, TTS, Voice, VoiceSettings
17
17
  from .version import __version__
18
18
 
19
19
  __all__ = [
@@ -22,7 +22,7 @@ __all__ = [
22
22
  "VoiceSettings",
23
23
  "TTSEncoding",
24
24
  "TTSModels",
25
- "DEFAULT_VOICE",
25
+ "DEFAULT_VOICE_ID",
26
26
  "__version__",
27
27
  ]
28
28
 
@@ -10,4 +10,12 @@ TTSModels = Literal[
10
10
  "eleven_flash_v2",
11
11
  ]
12
12
 
13
- TTSEncoding = Literal["mp3_44100",]
13
+ TTSEncoding = Literal[
14
+ "mp3_22050_32",
15
+ "mp3_44100",
16
+ "mp3_44100_32",
17
+ "mp3_44100_64",
18
+ "mp3_44100_96",
19
+ "mp3_44100_128",
20
+ "mp3_44100_192",
21
+ ]
@@ -21,9 +21,10 @@ import json
21
21
  import os
22
22
  import weakref
23
23
  from dataclasses import dataclass
24
- from typing import Any, List, Optional
24
+ from typing import Any
25
25
 
26
26
  import aiohttp
27
+
27
28
  from livekit.agents import (
28
29
  APIConnectionError,
29
30
  APIConnectOptions,
@@ -33,11 +34,19 @@ from livekit.agents import (
33
34
  tts,
34
35
  utils,
35
36
  )
37
+ from livekit.agents.types import (
38
+ DEFAULT_API_CONNECT_OPTIONS,
39
+ NOT_GIVEN,
40
+ NotGivenOr,
41
+ )
42
+ from livekit.agents.utils import is_given
36
43
 
37
44
  from .log import logger
38
45
  from .models import TTSEncoding, TTSModels
39
46
 
40
- _DefaultEncoding: TTSEncoding = "mp3_44100"
47
+ # by default, use 22.05kHz sample rate at 32kbps
48
+ # in our testing, reduce TTFB by about ~110ms
49
+ _DefaultEncoding: TTSEncoding = "mp3_22050_32"
41
50
 
42
51
 
43
52
  def _sample_rate_from_format(output_format: TTSEncoding) -> int:
@@ -49,9 +58,9 @@ def _sample_rate_from_format(output_format: TTSEncoding) -> int:
49
58
  class VoiceSettings:
50
59
  stability: float # [0.0 - 1.0]
51
60
  similarity_boost: float # [0.0 - 1.0]
52
- style: float | None = None # [0.0 - 1.0]
53
- speed: float | None = 1.0 # [0.8 - 1.2]
54
- use_speaker_boost: bool | None = False
61
+ style: NotGivenOr[float] = NOT_GIVEN # [0.0 - 1.0]
62
+ speed: NotGivenOr[float] = NOT_GIVEN # [0.8 - 1.2]
63
+ use_speaker_boost: NotGivenOr[bool] = NOT_GIVEN
55
64
 
56
65
 
57
66
  @dataclass
@@ -59,22 +68,9 @@ class Voice:
59
68
  id: str
60
69
  name: str
61
70
  category: str
62
- settings: VoiceSettings | None = None
63
-
64
-
65
- DEFAULT_VOICE = Voice(
66
- id="EXAVITQu4vr4xnSDxMaL",
67
- name="Bella",
68
- category="premade",
69
- settings=VoiceSettings(
70
- stability=0.71,
71
- speed=1.0,
72
- similarity_boost=0.5,
73
- style=0.0,
74
- use_speaker_boost=True,
75
- ),
76
- )
77
71
 
72
+
73
+ DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
78
74
  API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
79
75
  AUTHORIZATION_HEADER = "xi-api-key"
80
76
  WS_INACTIVITY_TIMEOUT = 300
@@ -83,13 +79,14 @@ WS_INACTIVITY_TIMEOUT = 300
83
79
  @dataclass
84
80
  class _TTSOptions:
85
81
  api_key: str
86
- voice: Voice
82
+ voice_id: str
83
+ voice_settings: NotGivenOr[VoiceSettings]
87
84
  model: TTSModels | str
88
- language: str | None
85
+ language: NotGivenOr[str]
89
86
  base_url: str
90
87
  encoding: TTSEncoding
91
88
  sample_rate: int
92
- streaming_latency: int
89
+ streaming_latency: NotGivenOr[int]
93
90
  word_tokenizer: tokenize.WordTokenizer
94
91
  chunk_length_schedule: list[int]
95
92
  enable_ssml_parsing: bool
@@ -100,68 +97,70 @@ class TTS(tts.TTS):
100
97
  def __init__(
101
98
  self,
102
99
  *,
103
- voice: Voice = DEFAULT_VOICE,
100
+ voice_id: str = DEFAULT_VOICE_ID,
101
+ voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN,
104
102
  model: TTSModels | str = "eleven_flash_v2_5",
105
- api_key: str | None = None,
106
- base_url: str | None = None,
107
- streaming_latency: int = 0,
103
+ encoding: NotGivenOr[TTSEncoding] = NOT_GIVEN,
104
+ api_key: NotGivenOr[str] = NOT_GIVEN,
105
+ base_url: NotGivenOr[str] = NOT_GIVEN,
106
+ streaming_latency: NotGivenOr[int] = NOT_GIVEN,
108
107
  inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
109
- word_tokenizer: Optional[tokenize.WordTokenizer] = None,
108
+ word_tokenizer: NotGivenOr[tokenize.WordTokenizer] = NOT_GIVEN,
110
109
  enable_ssml_parsing: bool = False,
111
- chunk_length_schedule: list[int] = [80, 120, 200, 260], # range is [50, 500]
110
+ chunk_length_schedule: NotGivenOr[list[int]] = NOT_GIVEN, # range is [50, 500]
112
111
  http_session: aiohttp.ClientSession | None = None,
113
- # deprecated
114
- model_id: TTSModels | str | None = None,
115
- language: str | None = None,
112
+ language: NotGivenOr[str] = NOT_GIVEN,
116
113
  ) -> None:
117
114
  """
118
115
  Create a new instance of ElevenLabs TTS.
119
116
 
120
117
  Args:
121
- voice (Voice): Voice configuration. Defaults to `DEFAULT_VOICE`.
118
+ voice_id (str): Voice ID. Defaults to `DEFAULT_VOICE_ID`.
119
+ voice_settings (NotGivenOr[VoiceSettings]): Voice settings.
122
120
  model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
123
- api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
124
- base_url (str | None): Custom base URL for the API. Optional.
125
- streaming_latency (int): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
121
+ api_key (NotGivenOr[str]): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
122
+ base_url (NotGivenOr[str]): Custom base URL for the API. Optional.
123
+ streaming_latency (NotGivenOr[int]): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
126
124
  inactivity_timeout (int): Inactivity timeout in seconds for the websocket connection. Defaults to 300.
127
- word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
125
+ word_tokenizer (NotGivenOr[tokenize.WordTokenizer]): Tokenizer for processing text. Defaults to basic WordTokenizer.
128
126
  enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
129
- chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
127
+ chunk_length_schedule (NotGivenOr[list[int]]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
130
128
  http_session (aiohttp.ClientSession | None): Custom HTTP session for API requests. Optional.
131
- language (str | None): Language code for the TTS model, as of 10/24/24 only valid for "eleven_turbo_v2_5". Optional.
132
- """
129
+ language (NotGivenOr[str]): Language code for the TTS model, as of 10/24/24 only valid for "eleven_turbo_v2_5".
130
+ """ # noqa: E501
131
+
132
+ if not is_given(chunk_length_schedule):
133
+ chunk_length_schedule = [80, 120, 200, 260]
134
+
135
+ if not is_given(encoding):
136
+ encoding = _DefaultEncoding
133
137
 
134
138
  super().__init__(
135
139
  capabilities=tts.TTSCapabilities(
136
140
  streaming=True,
137
141
  ),
138
- sample_rate=_sample_rate_from_format(_DefaultEncoding),
142
+ sample_rate=_sample_rate_from_format(encoding),
139
143
  num_channels=1,
140
144
  )
141
145
 
142
- if model_id is not None:
143
- logger.warning(
144
- "model_id is deprecated and will be removed in 1.5.0, use model instead",
145
- )
146
- model = model_id
147
-
148
- api_key = api_key or os.environ.get("ELEVEN_API_KEY")
149
- if not api_key:
146
+ elevenlabs_api_key = api_key if is_given(api_key) else os.environ.get("ELEVEN_API_KEY")
147
+ if not elevenlabs_api_key:
150
148
  raise ValueError(
151
- "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
149
+ "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable" # noqa: E501
152
150
  )
153
151
 
154
- if word_tokenizer is None:
152
+ if not is_given(word_tokenizer):
155
153
  word_tokenizer = tokenize.basic.WordTokenizer(
156
154
  ignore_punctuation=False # punctuation can help for intonation
157
155
  )
158
156
 
159
157
  self._opts = _TTSOptions(
160
- voice=voice,
158
+ voice_id=voice_id,
159
+ voice_settings=voice_settings,
161
160
  model=model,
162
- api_key=api_key,
163
- base_url=base_url or API_BASE_URL_V1,
164
- encoding=_DefaultEncoding,
161
+ api_key=elevenlabs_api_key,
162
+ base_url=base_url if is_given(base_url) else API_BASE_URL_V1,
163
+ encoding=encoding,
165
164
  sample_rate=self.sample_rate,
166
165
  streaming_latency=streaming_latency,
167
166
  word_tokenizer=word_tokenizer,
@@ -171,37 +170,15 @@ class TTS(tts.TTS):
171
170
  inactivity_timeout=inactivity_timeout,
172
171
  )
173
172
  self._session = http_session
174
- self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
175
- connect_cb=self._connect_ws,
176
- close_cb=self._close_ws,
177
- max_session_duration=inactivity_timeout,
178
- mark_refreshed_on_get=True,
179
- )
180
173
  self._streams = weakref.WeakSet[SynthesizeStream]()
181
174
 
182
- async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
183
- session = self._ensure_session()
184
- return await asyncio.wait_for(
185
- session.ws_connect(
186
- _stream_url(self._opts),
187
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
188
- ),
189
- self._conn_options.timeout,
190
- )
191
-
192
- async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
193
- await ws.close()
194
-
195
175
  def _ensure_session(self) -> aiohttp.ClientSession:
196
176
  if not self._session:
197
177
  self._session = utils.http_context.http_session()
198
178
 
199
179
  return self._session
200
180
 
201
- def prewarm(self) -> None:
202
- self._pool.prewarm()
203
-
204
- async def list_voices(self) -> List[Voice]:
181
+ async def list_voices(self) -> list[Voice]:
205
182
  async with self._ensure_session().get(
206
183
  f"{self._opts.base_url}/voices",
207
184
  headers={AUTHORIZATION_HEADER: self._opts.api_key},
@@ -211,26 +188,33 @@ class TTS(tts.TTS):
211
188
  def update_options(
212
189
  self,
213
190
  *,
214
- voice: Voice = DEFAULT_VOICE,
215
- model: TTSModels | str = "eleven_turbo_v2_5",
216
- language: str | None = None,
191
+ voice_id: NotGivenOr[str] = NOT_GIVEN,
192
+ voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN,
193
+ model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
194
+ language: NotGivenOr[str] = NOT_GIVEN,
217
195
  ) -> None:
218
196
  """
219
197
  Args:
220
- voice (Voice): Voice configuration. Defaults to `DEFAULT_VOICE`.
221
- model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
222
- language (str | None): Language code for the TTS model. Optional.
198
+ voice_id (NotGivenOr[str]): Voice ID.
199
+ voice_settings (NotGivenOr[VoiceSettings]): Voice settings.
200
+ model (NotGivenOr[TTSModels | str]): TTS model to use.
201
+ language (NotGivenOr[str]): Language code for the TTS model.
223
202
  """
224
- self._opts.model = model or self._opts.model
225
- self._opts.voice = voice or self._opts.voice
226
- self._opts.language = language or self._opts.language
203
+ if is_given(model):
204
+ self._opts.model = model
205
+ if is_given(voice_id):
206
+ self._opts.voice_id = voice_id
207
+ if is_given(voice_settings):
208
+ self._opts.voice_settings = voice_settings
209
+ if is_given(language):
210
+ self._opts.language = language
227
211
 
228
212
  def synthesize(
229
213
  self,
230
214
  text: str,
231
215
  *,
232
- conn_options: Optional[APIConnectOptions] = None,
233
- ) -> "ChunkedStream":
216
+ conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
217
+ ) -> ChunkedStream:
234
218
  return ChunkedStream(
235
219
  tts=self,
236
220
  input_text=text,
@@ -240,9 +224,14 @@ class TTS(tts.TTS):
240
224
  )
241
225
 
242
226
  def stream(
243
- self, *, conn_options: Optional[APIConnectOptions] = None
244
- ) -> "SynthesizeStream":
245
- stream = SynthesizeStream(tts=self, pool=self._pool, opts=self._opts)
227
+ self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
228
+ ) -> SynthesizeStream:
229
+ stream = SynthesizeStream(
230
+ tts=self,
231
+ conn_options=conn_options,
232
+ opts=self._opts,
233
+ session=self._ensure_session(),
234
+ )
246
235
  self._streams.add(stream)
247
236
  return stream
248
237
 
@@ -250,7 +239,6 @@ class TTS(tts.TTS):
250
239
  for stream in list(self._streams):
251
240
  await stream.aclose()
252
241
  self._streams.clear()
253
- await self._pool.aclose()
254
242
  await super().aclose()
255
243
 
256
244
 
@@ -263,7 +251,7 @@ class ChunkedStream(tts.ChunkedStream):
263
251
  tts: TTS,
264
252
  input_text: str,
265
253
  opts: _TTSOptions,
266
- conn_options: Optional[APIConnectOptions] = None,
254
+ conn_options: APIConnectOptions,
267
255
  session: aiohttp.ClientSession,
268
256
  ) -> None:
269
257
  super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
@@ -272,8 +260,8 @@ class ChunkedStream(tts.ChunkedStream):
272
260
  async def _run(self) -> None:
273
261
  request_id = utils.shortuuid()
274
262
  voice_settings = (
275
- _strip_nones(dataclasses.asdict(self._opts.voice.settings))
276
- if self._opts.voice.settings
263
+ _strip_nones(dataclasses.asdict(self._opts.voice_settings))
264
+ if is_given(self._opts.voice_settings)
277
265
  else None
278
266
  )
279
267
  data = {
@@ -338,11 +326,12 @@ class SynthesizeStream(tts.SynthesizeStream):
338
326
  self,
339
327
  *,
340
328
  tts: TTS,
341
- pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
329
+ session: aiohttp.ClientSession,
342
330
  opts: _TTSOptions,
331
+ conn_options: APIConnectOptions,
343
332
  ):
344
- super().__init__(tts=tts)
345
- self._opts, self._pool = opts, pool
333
+ super().__init__(tts=tts, conn_options=conn_options)
334
+ self._opts, self._session = opts, session
346
335
 
347
336
  async def _run(self) -> None:
348
337
  request_id = utils.shortuuid()
@@ -397,177 +386,177 @@ class SynthesizeStream(tts.SynthesizeStream):
397
386
  word_stream: tokenize.WordStream,
398
387
  request_id: str,
399
388
  ) -> None:
400
- async with self._pool.connection() as ws_conn:
401
- segment_id = utils.shortuuid()
402
- expected_text = "" # accumulate all tokens sent
403
-
404
- decoder = utils.codecs.AudioStreamDecoder(
405
- sample_rate=self._opts.sample_rate,
406
- num_channels=1,
407
- )
389
+ ws_conn = await self._session.ws_connect(
390
+ _stream_url(self._opts),
391
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
392
+ )
408
393
 
409
- # 11labs protocol expects the first message to be an "init msg"
410
- init_pkt = dict(
411
- text=" ",
412
- voice_settings=_strip_nones(
413
- dataclasses.asdict(self._opts.voice.settings)
414
- )
415
- if self._opts.voice.settings
416
- else None,
417
- generation_config=dict(
418
- chunk_length_schedule=self._opts.chunk_length_schedule
419
- ),
420
- )
421
- await ws_conn.send_str(json.dumps(init_pkt))
422
-
423
- @utils.log_exceptions(logger=logger)
424
- async def send_task():
425
- nonlocal expected_text
426
- xml_content = []
427
- async for data in word_stream:
428
- text = data.token
429
- expected_text += text
430
- # send the xml phoneme in one go
431
- if (
432
- self._opts.enable_ssml_parsing
433
- and data.token.startswith("<phoneme")
434
- or xml_content
435
- ):
436
- xml_content.append(text)
437
- if text.find("</phoneme>") > -1:
438
- text = self._opts.word_tokenizer.format_words(xml_content)
439
- xml_content = []
440
- else:
441
- continue
442
-
443
- data_pkt = dict(text=f"{text} ") # must always end with a space
444
- self._mark_started()
445
- await ws_conn.send_str(json.dumps(data_pkt))
446
- if xml_content:
447
- logger.warning("11labs stream ended with incomplete xml content")
448
- await ws_conn.send_str(json.dumps({"flush": True}))
449
-
450
- # consumes from decoder and generates events
451
- @utils.log_exceptions(logger=logger)
452
- async def generate_task():
453
- emitter = tts.SynthesizedAudioEmitter(
454
- event_ch=self._event_ch,
455
- request_id=request_id,
456
- segment_id=segment_id,
457
- )
458
- async for frame in decoder:
459
- emitter.push(frame)
460
- emitter.flush()
394
+ segment_id = utils.shortuuid()
395
+ decoder = utils.codecs.AudioStreamDecoder(
396
+ sample_rate=self._opts.sample_rate,
397
+ num_channels=1,
398
+ )
461
399
 
462
- # receives from ws and decodes audio
463
- @utils.log_exceptions(logger=logger)
464
- async def recv_task():
465
- nonlocal expected_text
466
- received_text = ""
467
-
468
- while True:
469
- msg = await ws_conn.receive()
470
- if msg.type in (
471
- aiohttp.WSMsgType.CLOSED,
472
- aiohttp.WSMsgType.CLOSE,
473
- aiohttp.WSMsgType.CLOSING,
474
- ):
475
- raise APIStatusError(
476
- "11labs connection closed unexpectedly, not all tokens have been consumed",
477
- request_id=request_id,
478
- )
400
+ # 11labs protocol expects the first message to be an "init msg"
401
+ init_pkt = {
402
+ "text": " ",
403
+ "voice_settings": _strip_nones(dataclasses.asdict(self._opts.voice_settings))
404
+ if is_given(self._opts.voice_settings)
405
+ else None,
406
+ "generation_config": {"chunk_length_schedule": self._opts.chunk_length_schedule},
407
+ }
408
+ await ws_conn.send_str(json.dumps(init_pkt))
409
+ eos_sent = False
479
410
 
480
- if msg.type != aiohttp.WSMsgType.TEXT:
481
- logger.warning("unexpected 11labs message type %s", msg.type)
411
+ @utils.log_exceptions(logger=logger)
412
+ async def send_task():
413
+ nonlocal eos_sent
414
+ xml_content = []
415
+ async for data in word_stream:
416
+ text = data.token
417
+ # send the xml phoneme in one go
418
+ if (
419
+ self._opts.enable_ssml_parsing
420
+ and data.token.startswith("<phoneme")
421
+ or xml_content
422
+ ):
423
+ xml_content.append(text)
424
+ if data.token.find("</phoneme>") > -1:
425
+ text = self._opts.word_tokenizer.format_words(xml_content)
426
+ xml_content = []
427
+ else:
482
428
  continue
483
429
 
484
- data = json.loads(msg.data)
485
- if data.get("audio"):
486
- b64data = base64.b64decode(data["audio"])
487
- decoder.push(b64data)
488
-
489
- if alignment := data.get("normalizedAlignment"):
490
- received_text += "".join(
491
- alignment.get("chars", [])
492
- ).replace(" ", "")
493
- if received_text == expected_text:
494
- decoder.end_input()
495
- break
496
- elif data.get("error"):
497
- raise APIStatusError(
498
- message=data["error"],
499
- status_code=500,
500
- request_id=request_id,
501
- body=None,
502
- )
503
- else:
430
+ data_pkt = {"text": f"{text} "} # must always end with a space
431
+ self._mark_started()
432
+ await ws_conn.send_str(json.dumps(data_pkt))
433
+ if xml_content:
434
+ logger.warning("11labs stream ended with incomplete xml content")
435
+
436
+ # no more token, mark eos
437
+ eos_pkt = {"text": ""}
438
+ await ws_conn.send_str(json.dumps(eos_pkt))
439
+ eos_sent = True
440
+
441
+ # consumes from decoder and generates events
442
+ @utils.log_exceptions(logger=logger)
443
+ async def generate_task():
444
+ emitter = tts.SynthesizedAudioEmitter(
445
+ event_ch=self._event_ch,
446
+ request_id=request_id,
447
+ segment_id=segment_id,
448
+ )
449
+ async for frame in decoder:
450
+ emitter.push(frame)
451
+ emitter.flush()
452
+
453
+ # receives from ws and decodes audio
454
+ @utils.log_exceptions(logger=logger)
455
+ async def recv_task():
456
+ nonlocal eos_sent
457
+
458
+ while True:
459
+ msg = await ws_conn.receive()
460
+ if msg.type in (
461
+ aiohttp.WSMsgType.CLOSED,
462
+ aiohttp.WSMsgType.CLOSE,
463
+ aiohttp.WSMsgType.CLOSING,
464
+ ):
465
+ if not eos_sent:
504
466
  raise APIStatusError(
505
- message=f"unexpected 11labs message {data}",
506
- status_code=500,
467
+ "11labs connection closed unexpectedly, not all tokens have been consumed", # noqa: E501
507
468
  request_id=request_id,
508
- body=None,
509
469
  )
470
+ return
510
471
 
511
- tasks = [
512
- asyncio.create_task(send_task()),
513
- asyncio.create_task(recv_task()),
514
- asyncio.create_task(generate_task()),
515
- ]
516
- try:
517
- await asyncio.gather(*tasks)
518
- except asyncio.TimeoutError as e:
519
- raise APITimeoutError() from e
520
- except aiohttp.ClientResponseError as e:
521
- raise APIStatusError(
522
- message=e.message,
523
- status_code=e.status,
524
- request_id=request_id,
525
- body=None,
526
- ) from e
527
- except APIStatusError:
528
- raise
529
- except Exception as e:
530
- raise APIConnectionError() from e
531
- finally:
532
- await utils.aio.gracefully_cancel(*tasks)
533
- await decoder.aclose()
472
+ if msg.type != aiohttp.WSMsgType.TEXT:
473
+ logger.warning("unexpected 11labs message type %s", msg.type)
474
+ continue
475
+
476
+ data = json.loads(msg.data)
477
+ if data.get("audio"):
478
+ b64data = base64.b64decode(data["audio"])
479
+ decoder.push(b64data)
480
+
481
+ elif data.get("isFinal"):
482
+ decoder.end_input()
483
+ break
484
+ elif data.get("error"):
485
+ raise APIStatusError(
486
+ message=data["error"],
487
+ status_code=500,
488
+ request_id=request_id,
489
+ body=None,
490
+ )
491
+ else:
492
+ raise APIStatusError(
493
+ message=f"unexpected 11labs message {data}",
494
+ status_code=500,
495
+ request_id=request_id,
496
+ body=None,
497
+ )
498
+
499
+ tasks = [
500
+ asyncio.create_task(send_task()),
501
+ asyncio.create_task(recv_task()),
502
+ asyncio.create_task(generate_task()),
503
+ ]
504
+ try:
505
+ await asyncio.gather(*tasks)
506
+ except asyncio.TimeoutError as e:
507
+ raise APITimeoutError() from e
508
+ except aiohttp.ClientResponseError as e:
509
+ raise APIStatusError(
510
+ message=e.message,
511
+ status_code=e.status,
512
+ request_id=request_id,
513
+ body=None,
514
+ ) from e
515
+ except APIStatusError:
516
+ raise
517
+ except Exception as e:
518
+ raise APIConnectionError() from e
519
+ finally:
520
+ await utils.aio.gracefully_cancel(*tasks)
521
+ await decoder.aclose()
522
+ if ws_conn is not None:
523
+ await ws_conn.close()
534
524
 
535
525
 
536
526
  def _dict_to_voices_list(data: dict[str, Any]):
537
- voices: List[Voice] = []
527
+ voices: list[Voice] = []
538
528
  for voice in data["voices"]:
539
529
  voices.append(
540
530
  Voice(
541
531
  id=voice["voice_id"],
542
532
  name=voice["name"],
543
533
  category=voice["category"],
544
- settings=None,
545
534
  )
546
535
  )
547
536
  return voices
548
537
 
549
538
 
550
539
  def _strip_nones(data: dict[str, Any]):
551
- return {k: v for k, v in data.items() if v is not None}
540
+ return {k: v for k, v in data.items() if is_given(v) and v is not None}
552
541
 
553
542
 
554
543
  def _synthesize_url(opts: _TTSOptions) -> str:
555
544
  base_url = opts.base_url
556
- voice_id = opts.voice.id
545
+ voice_id = opts.voice_id
557
546
  model_id = opts.model
558
547
  output_format = opts.encoding
559
548
  url = (
560
549
  f"{base_url}/text-to-speech/{voice_id}/stream?"
561
550
  f"model_id={model_id}&output_format={output_format}"
562
551
  )
563
- if opts.streaming_latency:
552
+ if is_given(opts.streaming_latency):
564
553
  url += f"&optimize_streaming_latency={opts.streaming_latency}"
565
554
  return url
566
555
 
567
556
 
568
557
  def _stream_url(opts: _TTSOptions) -> str:
569
558
  base_url = opts.base_url
570
- voice_id = opts.voice.id
559
+ voice_id = opts.voice_id
571
560
  model_id = opts.model
572
561
  output_format = opts.encoding
573
562
  enable_ssml = str(opts.enable_ssml_parsing).lower()
@@ -578,8 +567,8 @@ def _stream_url(opts: _TTSOptions) -> str:
578
567
  f"model_id={model_id}&output_format={output_format}&"
579
568
  f"enable_ssml_parsing={enable_ssml}&inactivity_timeout={inactivity_timeout}"
580
569
  )
581
- if language is not None:
570
+ if is_given(language):
582
571
  url += f"&language_code={language}"
583
- if opts.streaming_latency:
572
+ if is_given(opts.streaming_latency):
584
573
  url += f"&optimize_streaming_latency={opts.streaming_latency}"
585
574
  return url
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.8.0"
15
+ __version__ = "1.0.0"
@@ -1,35 +1,25 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.8.0
3
+ Version: 1.0.0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
- Home-page: https://github.com/livekit/agents
6
- License: Apache-2.0
7
5
  Project-URL: Documentation, https://docs.livekit.io
8
6
  Project-URL: Website, https://livekit.io/
9
7
  Project-URL: Source, https://github.com/livekit/agents
10
- Keywords: webrtc,realtime,audio,video,livekit,elevenlabs
8
+ Author-email: LiveKit <hello@livekit.io>
9
+ License-Expression: Apache-2.0
10
+ Keywords: audio,elevenlabs,livekit,realtime,video,webrtc
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: License :: OSI Approved :: Apache Software License
13
- Classifier: Topic :: Multimedia :: Sound/Audio
14
- Classifier: Topic :: Multimedia :: Video
15
- Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
13
  Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3 :: Only
17
15
  Classifier: Programming Language :: Python :: 3.9
18
16
  Classifier: Programming Language :: Python :: 3.10
19
- Classifier: Programming Language :: Python :: 3 :: Only
17
+ Classifier: Topic :: Multimedia :: Sound/Audio
18
+ Classifier: Topic :: Multimedia :: Video
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: >=3.9.0
21
+ Requires-Dist: livekit-agents[codecs]>=1.0.0
21
22
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit-agents[codecs]<1.0.0,>=0.12.16
23
- Dynamic: classifier
24
- Dynamic: description
25
- Dynamic: description-content-type
26
- Dynamic: home-page
27
- Dynamic: keywords
28
- Dynamic: license
29
- Dynamic: project-url
30
- Dynamic: requires-dist
31
- Dynamic: requires-python
32
- Dynamic: summary
33
23
 
34
24
  # LiveKit Plugins Elevenlabs
35
25
 
@@ -0,0 +1,9 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=Va24UYTuuosmRuTcuzd_DIHYQOgV-wSYKJIXmOSB2Go,1255
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=p_wHEz15bdsNEqwzN831ysm70PNWQ-xeN__BKvGPZxA,401
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=gs9p4TwBAYX3vlsNn2XQ-oyPNUGcuvgix8K7vChRMmc,19985
6
+ livekit/plugins/elevenlabs/version.py,sha256=nW89L_U9N4ukT3wAO3BeTqOaa87zLUOsEFz8TkiKIP8,600
7
+ livekit_plugins_elevenlabs-1.0.0.dist-info/METADATA,sha256=EL7wso-EPaWpWwQ5OtxwDaIueFvHrSBEy7PPCigZ8SI,1312
8
+ livekit_plugins_elevenlabs-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
9
+ livekit_plugins_elevenlabs-1.0.0.dist-info/RECORD,,
@@ -1,5 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.2)
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
-
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
6
- livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
7
- livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
8
- livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
9
- livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,
@@ -1 +0,0 @@
1
- livekit