livekit-plugins-elevenlabs 0.4.dev1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -12,10 +12,19 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from .models import TTSEncoding, TTSModels
15
16
  from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
16
17
  from .version import __version__
17
18
 
18
- __all__ = ["TTS", "Voice", "VoiceSettings", "DEFAULT_VOICE", "__version__"]
19
+ __all__ = [
20
+ "TTS",
21
+ "Voice",
22
+ "VoiceSettings",
23
+ "TTSEncoding",
24
+ "TTSModels",
25
+ "DEFAULT_VOICE",
26
+ "__version__",
27
+ ]
19
28
 
20
29
  from livekit.agents import Plugin
21
30
 
@@ -6,3 +6,15 @@ TTSModels = Literal[
6
6
  "eleven_multilingual_v2",
7
7
  "eleven_turbo_v2",
8
8
  ]
9
+
10
+ TTSEncoding = Literal[
11
+ "mp3_22050_32",
12
+ "mp3_44100_32",
13
+ "mp3_44100_64",
14
+ "mp3_44100_96",
15
+ "mp3_44100_128",
16
+ "mp3_44100_192",
17
+ "pcm_16000",
18
+ "pcm_22050",
19
+ "pcm_44100",
20
+ ]
@@ -12,6 +12,8 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from __future__ import annotations
16
+
15
17
  import asyncio
16
18
  import base64
17
19
  import contextlib
@@ -19,14 +21,36 @@ import dataclasses
19
21
  import json
20
22
  import os
21
23
  from dataclasses import dataclass
22
- from typing import AsyncIterable, List
24
+ from typing import List, Literal, Optional
23
25
 
24
26
  import aiohttp
25
27
  from livekit import rtc
26
- from livekit.agents import aio, tts
28
+ from livekit.agents import aio, codecs, tokenize, tts, utils
27
29
 
28
30
  from .log import logger
29
- from .models import TTSModels
31
+ from .models import (
32
+ TTSEncoding,
33
+ TTSModels,
34
+ )
35
+
36
+ _Encoding = Literal[
37
+ "mp3",
38
+ "pcm",
39
+ ]
40
+
41
+
42
+ def _sample_rate_from_format(output_format: TTSEncoding) -> int:
43
+ split = output_format.split("_") # e.g: mp3_22050_32
44
+ return int(split[1])
45
+
46
+
47
+ def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
48
+ if output_format.startswith("mp3"):
49
+ return "mp3"
50
+ elif output_format.startswith("pcm"):
51
+ return "pcm"
52
+
53
+ raise ValueError(f"Unknown format: {output_format}")
30
54
 
31
55
 
32
56
  @dataclass
@@ -59,13 +83,16 @@ AUTHORIZATION_HEADER = "xi-api-key"
59
83
 
60
84
 
61
85
  @dataclass
62
- class TTSOptions:
86
+ class _TTSOptions:
63
87
  api_key: str
64
88
  voice: Voice
65
89
  model_id: TTSModels
66
90
  base_url: str
91
+ encoding: TTSEncoding
67
92
  sample_rate: int
68
- latency: int
93
+ streaming_latency: int
94
+ word_tokenizer: tokenize.WordTokenizer
95
+ chunk_length_schedule: list[int]
69
96
 
70
97
 
71
98
  class TTS(tts.TTS):
@@ -76,140 +103,211 @@ class TTS(tts.TTS):
76
103
  model_id: TTSModels = "eleven_turbo_v2",
77
104
  api_key: str | None = None,
78
105
  base_url: str | None = None,
79
- sample_rate: int = 24000,
80
- latency: int = 3,
106
+ encoding: TTSEncoding = "mp3_22050_32",
107
+ streaming_latency: int = 3,
108
+ word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
109
+ ignore_punctuation=False # punctuation can help for intonation
110
+ ),
111
+ # default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
112
+ # (range is 50-500)
113
+ chunk_length_schedule: list[int] = [80, 120, 200, 260],
114
+ http_session: aiohttp.ClientSession | None = None,
81
115
  ) -> None:
82
116
  super().__init__(
83
- streaming_supported=True, sample_rate=sample_rate, num_channels=1
117
+ streaming_supported=True,
118
+ sample_rate=_sample_rate_from_format(encoding),
119
+ num_channels=1,
84
120
  )
85
121
  api_key = api_key or os.environ.get("ELEVEN_API_KEY")
86
122
  if not api_key:
87
123
  raise ValueError("ELEVEN_API_KEY must be set")
88
124
 
89
- self._session = aiohttp.ClientSession()
90
- self._opts = TTSOptions(
125
+ self._opts = _TTSOptions(
91
126
  voice=voice,
92
127
  model_id=model_id,
93
128
  api_key=api_key,
94
129
  base_url=base_url or API_BASE_URL_V1,
95
- sample_rate=sample_rate,
96
- latency=latency,
130
+ encoding=encoding,
131
+ sample_rate=self.sample_rate,
132
+ streaming_latency=streaming_latency,
133
+ word_tokenizer=word_tokenizer,
134
+ chunk_length_schedule=chunk_length_schedule,
97
135
  )
136
+ self._session = http_session
137
+
138
+ def _ensure_session(self) -> aiohttp.ClientSession:
139
+ if not self._session:
140
+ self._session = utils.http_session()
141
+
142
+ return self._session
98
143
 
99
144
async def list_voices(self) -> List[Voice]:
    """Request ``{base_url}/voices`` and convert the JSON reply into voices."""
    voices_url = f"{self._opts.base_url}/voices"
    auth_headers = {AUTHORIZATION_HEADER: self._opts.api_key}

    async with self._ensure_session().get(voices_url, headers=auth_headers) as resp:
        payload = await resp.json()

    return _dict_to_voices_list(payload)
106
150
 
107
151
def synthesize(
    self,
    text: str,
) -> "ChunkedStream":
    """Return a ChunkedStream for ``text`` using this instance's options."""
    http = self._ensure_session()
    return ChunkedStream(text=text, opts=self._opts, session=http)
113
156
 
114
- async def generator():
115
- try:
116
- async with self._session.post(
117
- url,
118
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
119
- json=dict(
120
- text=text,
121
- model_id=self._opts.model_id,
122
- voice_settings=dataclasses.asdict(voice.settings)
123
- if voice.settings
124
- else None,
125
- ),
126
- ) as resp:
127
- data = await resp.read()
128
- yield tts.SynthesizedAudio(
129
- text=text,
157
def stream(
    self,
) -> "SynthesizeStream":
    """Return a SynthesizeStream bound to this instance's session and options."""
    http = self._ensure_session()
    return SynthesizeStream(session=http, opts=self._opts)
161
+
162
+
163
class ChunkedStream(tts.ChunkedStream):
    """Synthesize using the chunked api endpoint

    The request is started lazily by the first ``__anext__`` call. Audio is
    delivered as ``tts.SynthesizedAudio`` items through an internal queue, and a
    ``None`` item marks the end of the stream.
    """

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        self._opts = opts
        self._text = text
        self._session = session
        self._task: asyncio.Task | None = None
        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()

    def _synthesize_url(self) -> str:
        """Build the ``/stream`` endpoint URL for the configured voice and model."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        # PCM is always requested here (at the sample rate of the configured
        # encoding) because _run() slices the body directly into 16-bit frames
        sample_rate = _sample_rate_from_format(self._opts.encoding)
        latency = self._opts.streaming_latency
        return (
            f"{base_url}/text-to-speech/{voice_id}/stream?"
            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
        )

    def _to_audio(self, pcm: bytearray) -> tts.SynthesizedAudio:
        """Wrap 16-bit mono PCM bytes into a SynthesizedAudio for the request text."""
        return tts.SynthesizedAudio(
            text=self._text,
            data=rtc.AudioFrame(
                data=pcm,
                sample_rate=self._opts.sample_rate,
                num_channels=1,
                samples_per_channel=len(pcm) // 2,
            ),
        )

    async def _main_task(self):
        """Run the request, log any failure, and always queue the end marker."""
        try:
            await self._run()
        except Exception:
            logger.exception("11labs main task failed in chunked stream")
        finally:
            self._queue.put_nowait(None)

    async def _run(self) -> None:
        """POST the text and queue the response body as 10ms audio frames."""
        async with self._session.post(
            self._synthesize_url(),
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
            json=dict(
                text=self._text,
                model_id=self._opts.model_id,
                voice_settings=(
                    dataclasses.asdict(self._opts.voice.settings)
                    if self._opts.voice.settings
                    else None
                ),
            ),
        ) as resp:
            # fail on HTTP errors: otherwise the error body would be chunked
            # below and emitted as if it were PCM audio. The exception is logged
            # by _main_task(), which then ends the stream.
            resp.raise_for_status()

            # avoid very small frames. chunk by 10ms 16bits
            bytes_per_frame = (self._opts.sample_rate // 100) * 2
            buf = bytearray()
            async for data, _ in resp.content.iter_chunks():
                buf.extend(data)

                while len(buf) >= bytes_per_frame:
                    frame_data = buf[:bytes_per_frame]  # copy of the frame
                    del buf[:bytes_per_frame]  # drop it in place, no re-copy
                    self._queue.put_nowait(self._to_audio(frame_data))

            # send any remaining data
            if len(buf) > 0:
                self._queue.put_nowait(self._to_audio(buf))

    async def __anext__(self) -> tts.SynthesizedAudio:
        if not self._task:
            # first iteration: start the request
            self._task = asyncio.create_task(self._main_task())

        frame = await self._queue.get()
        if frame is None:
            raise StopAsyncIteration

        return frame

    async def aclose(self) -> None:
        """Cancel the request task, if it was started, and wait for it to end."""
        if not self._task:
            return

        self._task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await self._task
146
262
 
147
263
 
148
264
  class SynthesizeStream(tts.SynthesizeStream):
149
- _STREAM_EOS = ""
265
+ """Streamed API using websockets"""
266
+
267
+ @dataclass
268
+ class _SegmentConnection:
269
+ audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
270
+ task: asyncio.Task
150
271
 
151
272
def __init__(
    self,
    session: aiohttp.ClientSession,
    opts: _TTSOptions,
    max_retry_per_segment: int = 3,
):
    """Set up the stream state and start the background ``_run`` task.

    Args:
        session: HTTP session used to open the websocket connections.
        opts: synthesis options; ``opts.word_tokenizer`` provides the word stream.
        max_retry_per_segment: connection attempts allowed for each segment.
    """
    self._opts = opts
    self._session = session
    self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
    self._closed = False
    self._word_stream = opts.word_tokenizer.stream()
    # started last: _run() reads _event_queue and _word_stream, and no task is
    # left behind if creating the word stream raises
    self._main_task = asyncio.create_task(self._run(max_retry_per_segment))
166
284
 
167
285
  def _stream_url(self) -> str:
168
286
  base_url = self._opts.base_url
169
287
  voice_id = self._opts.voice.id
170
288
  model_id = self._opts.model_id
171
- sample_rate = self._opts.sample_rate
172
- latency = self._opts.latency
173
- return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
289
+ output_format = self._opts.encoding
290
+ latency = self._opts.streaming_latency
291
+ url = (
292
+ f"{base_url}/text-to-speech/{voice_id}/stream-input?"
293
+ f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
294
+ )
295
+
296
+ return url
174
297
 
175
298
def push_text(self, token: str | None) -> None:
    """Feed text to the word stream; ``None`` marks the end of the segment.

    Raises:
        ValueError: if the stream has already been closed.
    """
    if self._closed:
        raise ValueError("cannot push to a closed stream")

    if token is not None:
        self._word_stream.push_text(token)
    else:
        self._word_stream.mark_segment_end()
208
307
 
209
308
  async def aclose(self, *, wait: bool = True) -> None:
210
- self._flush_if_needed()
211
- self._queue.put_nowait(None)
212
309
  self._closed = True
310
+ await self._word_stream.aclose()
213
311
 
214
312
  if not wait:
215
313
  self._main_task.cancel()
@@ -217,158 +315,196 @@ class SynthesizeStream(tts.SynthesizeStream):
217
315
  with contextlib.suppress(asyncio.CancelledError):
218
316
  await self._main_task
219
317
 
220
- def _flush_if_needed(self) -> None:
221
- seg = self._text.strip()
222
- if len(seg) > 0:
223
- self._queue.put_nowait(seg + " ")
224
-
225
- self._text = ""
226
- self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
318
async def _run(self, max_retry_per_segment: int) -> None:
    """Drive the stream with one ``_run_ws`` task per word-stream segment.

    ``_read_tokens`` starts a websocket task for every segment reported by the
    word stream and feeds it that segment's tokens. ``_forward_events`` drains
    the segments' audio channels, in the order the segments were started, into
    the event queue.

    A final ``None`` is always queued, including when this task is cancelled, so
    that readers of the event queue are released. If the run ends abnormally,
    segment tasks that are still running are cancelled.
    """
    conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
    segment_tasks: list[asyncio.Task] = []

    async def _forward_events() -> None:
        """forward events from the ws connections to the event queue.
        This is used to keep the right order."""
        while True:
            c = await conns_q.get()
            if c is None:
                break  # no more segment, stream closed

            self._event_queue.put_nowait(
                tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
            )

            async for frame in c.audio_rx:
                self._event_queue.put_nowait(
                    tts.SynthesisEvent(
                        type=tts.SynthesisEventType.AUDIO, audio=frame
                    )
                )

            self._event_queue.put_nowait(
                tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
            )

    async def _read_tokens() -> None:
        """read tokens from the word stream and create connections for each segment,
        (this also allows concurrent connections to 11labs)"""

        token_tx: aio.ChanSender[str] | None = None
        async for ev in self._word_stream:
            if ev.type == tokenize.TokenEventType.STARTED:
                token_tx, token_rx = aio.channel()
                audio_tx: aio.ChanSender[tts.SynthesizedAudio]
                audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
                audio_tx, audio_rx = aio.channel()
                task = asyncio.create_task(
                    self._run_ws(max_retry_per_segment, audio_tx, token_rx)
                )
                segment_tasks.append(task)
                conns_q.put_nowait(
                    SynthesizeStream._SegmentConnection(audio_rx, task)
                )
            elif ev.type == tokenize.TokenEventType.TOKEN:
                assert token_tx is not None
                token_tx.send_nowait(ev.token)
            elif ev.type == tokenize.TokenEventType.FINISHED:
                assert token_tx is not None
                token_tx.close()
                token_tx = None

        conns_q.put_nowait(None)

    completed = False
    try:
        await asyncio.gather(_forward_events(), _read_tokens())
        completed = True
    except Exception:
        logger.exception("11labs task failed")
    finally:
        if not completed:
            # cancelled or failed: don't leave websocket tasks running
            for task in segment_tasks:
                if not task.done():
                    task.cancel()

        # in `finally` because CancelledError is not an Exception: without it
        # a cancelled stream would never deliver the end marker
        self._event_queue.put_nowait(None)
290
377
 
291
378
async def _run_ws(
    self,
    max_retry: int,
    audio_tx: aio.ChanSender[tts.SynthesizedAudio],
    token_rx: aio.ChanReceiver[str],
) -> None:
    """Synthesize one segment over its own 11labs websocket connection.

    Connects and sends the init packet, making up to ``max_retry`` attempts.
    It then forwards every token from ``token_rx`` to 11labs while decoding the
    received audio into ``audio_tx``.

    ``audio_tx`` is closed on every exit path, including a failed connection,
    so the reader of the audio channel is never left waiting. The websocket is
    closed when the segment ends.

    Args:
        max_retry: maximum number of connection attempts.
        audio_tx: channel that receives the synthesized audio of this segment.
        token_rx: channel that yields the tokens of this segment.
    """
    ws: aiohttp.ClientWebSocketResponse | None = None
    try:
        # try to connect to 11labs
        for try_i in range(max_retry):
            conn: aiohttp.ClientWebSocketResponse | None = None
            try:
                conn = await self._session.ws_connect(
                    self._stream_url(),
                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
                )

                voice_settings = None
                if self._opts.voice.settings is not None:
                    voice_settings = dataclasses.asdict(self._opts.voice.settings)

                init_pkt = dict(
                    text=" ",
                    try_trigger_generation=True,
                    voice_settings=voice_settings,
                    generation_config=dict(
                        chunk_length_schedule=self._opts.chunk_length_schedule,
                    ),
                )
                await conn.send_str(json.dumps(init_pkt))
            except Exception:
                is_last_try = try_i + 1 == max_retry
                retry_delay = min(try_i * 5, 5)  # max 5s
                if is_last_try:
                    logger.exception(
                        f"failed to connect to 11labs after {max_retry} retries"
                    )
                else:
                    logger.warning(
                        f"failed to connect to 11labs, retrying in {retry_delay}s"
                    )

                if conn is not None:
                    # connected but the init packet failed: don't leak the socket
                    with contextlib.suppress(Exception):
                        await conn.close()

                if is_last_try:
                    return

                await asyncio.sleep(retry_delay)
            else:
                # connected and initialized: stop here, otherwise the loop would
                # open one more websocket per remaining attempt
                ws = conn
                break

        if ws is None:
            # only reachable when no attempt was made (max_retry <= 0)
            logger.error(
                "no connection attempt was made to 11labs (max_retry=%s)", max_retry
            )
            return

        all_tokens_consumed = False

        async def send_task():
            nonlocal all_tokens_consumed

            async for token in token_rx:
                if token == "":
                    continue  # empty token is closing the stream in 11labs protocol

                # try_trigger_generation=True is a bad practice, we expose
                # chunk_length_schedule instead
                data_pkt = dict(
                    text=f"{token} ",  # must always end with a space
                    try_trigger_generation=False,
                )
                await ws.send_str(json.dumps(data_pkt))

            # no more token, mark eos
            flush_pkt = dict(
                text="",
            )
            await ws.send_str(json.dumps(flush_pkt))

            all_tokens_consumed = True

        async def recv_task():
            encoding = _encoding_from_format(self._opts.encoding)
            mp3_decoder = codecs.Mp3StreamDecoder()
            while True:
                msg = await ws.receive()
                if msg.type in (
                    aiohttp.WSMsgType.CLOSED,
                    aiohttp.WSMsgType.CLOSE,
                    aiohttp.WSMsgType.CLOSING,
                ):
                    if all_tokens_consumed:
                        return  # close is expected

                    raise Exception(
                        "11labs connection closed unexpectedly, not all tokens have been consumed"
                    )

                if msg.type != aiohttp.WSMsgType.TEXT:
                    # audio arrives base64-encoded inside TEXT (JSON) messages,
                    # so any other message type is unexpected
                    logger.warning("unexpected 11labs message type %s", msg.type)
                    continue

                data: dict = json.loads(msg.data)
                audio = data.get("audio")

                if data.get("error"):
                    logger.error("11labs error %s", data)
                    return
                elif audio is not None:
                    if audio == "":
                        # 11labs sometimes sends empty audio, ignore
                        continue

                    b64data = base64.b64decode(audio)
                    frame: rtc.AudioFrame
                    if encoding == "mp3":
                        frames = mp3_decoder.decode_chunk(b64data)
                        frame = utils.merge_frames(frames)
                    else:
                        frame = rtc.AudioFrame(
                            data=b64data,
                            sample_rate=self._opts.sample_rate,
                            num_channels=1,
                            samples_per_channel=len(b64data) // 2,
                        )

                    text = ""
                    if data.get("alignment"):
                        text = "".join(data["alignment"].get("chars", ""))

                    audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
                    continue
                elif data.get("isFinal"):
                    return  # last message

                logger.error("unexpected 11labs message %s", data)

        try:
            await asyncio.gather(send_task(), recv_task())
        except Exception:
            logger.exception("11labs ws connection failed")
    finally:
        # close the audio channel first so its reader is released right away
        audio_tx.close()
        if ws is not None:
            with contextlib.suppress(Exception):
                await ws.close()
372
508
 
373
509
  async def __anext__(self) -> tts.SynthesisEvent:
374
510
  evt = await self._event_queue.get()
@@ -378,7 +514,7 @@ class SynthesizeStream(tts.SynthesizeStream):
378
514
  return evt
379
515
 
380
516
 
381
- def dict_to_voices_list(data: dict) -> List[Voice]:
517
+ def _dict_to_voices_list(data: dict) -> List[Voice]:
382
518
  voices = []
383
519
  for voice in data["voices"]:
384
520
  voices.append(
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.4.dev1"
15
+ __version__ = "0.5.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.4.dev1
3
+ Version: 0.5.0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
22
  Requires-Dist: livekit ~=0.11
23
- Requires-Dist: livekit-agents ~=0.6.dev1
23
+ Requires-Dist: livekit-agents[codecs] ~=0.7.0
24
24
  Requires-Dist: aiohttp >=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=GTcyQwBVVPzCYLgsnw9q5oFOq9cV3hIKndDaBPSFMr4,17738
6
+ livekit/plugins/elevenlabs/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
7
+ livekit_plugins_elevenlabs-0.5.0.dist-info/METADATA,sha256=nmaTaWHwzuzT9nBjaLsJlzTAanMsxl7lv8wH5Sq7boI,1367
8
+ livekit_plugins_elevenlabs-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
+ livekit_plugins_elevenlabs-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.5.0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=5PO_KjUzIMnHPD_iRyotLqR7qMIjpJYqR52K8wdnzts,12396
6
- livekit/plugins/elevenlabs/version.py,sha256=aVtU3btnJkphqNZWTe3vkERuaW8-zZIFltfU6GWbC40,603
7
- livekit_plugins_elevenlabs-0.4.dev1.dist-info/METADATA,sha256=Cvmffq7O7TUMYfo-fxnpZIjTqbpxrA3hg0-gdmdlwkc,1365
8
- livekit_plugins_elevenlabs-0.4.dev1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
- livekit_plugins_elevenlabs-0.4.dev1.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.4.dev1.dist-info/RECORD,,