livekit-plugins-elevenlabs 0.4.dev2__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19)
  1. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/PKG-INFO +2 -2
  2. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit/plugins/elevenlabs/__init__.py +10 -1
  3. livekit_plugins_elevenlabs-0.5.0/livekit/plugins/elevenlabs/models.py +20 -0
  4. livekit_plugins_elevenlabs-0.5.0/livekit/plugins/elevenlabs/tts.py +528 -0
  5. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit/plugins/elevenlabs/version.py +1 -1
  6. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit_plugins_elevenlabs.egg-info/PKG-INFO +2 -2
  7. livekit_plugins_elevenlabs-0.5.0/livekit_plugins_elevenlabs.egg-info/requires.txt +3 -0
  8. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/setup.py +1 -1
  9. livekit_plugins_elevenlabs-0.4.dev2/livekit/plugins/elevenlabs/models.py +0 -8
  10. livekit_plugins_elevenlabs-0.4.dev2/livekit/plugins/elevenlabs/tts.py +0 -392
  11. livekit_plugins_elevenlabs-0.4.dev2/livekit_plugins_elevenlabs.egg-info/requires.txt +0 -3
  12. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/README.md +0 -0
  13. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit/plugins/elevenlabs/log.py +0 -0
  14. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit/plugins/elevenlabs/py.typed +0 -0
  15. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit_plugins_elevenlabs.egg-info/SOURCES.txt +0 -0
  16. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit_plugins_elevenlabs.egg-info/dependency_links.txt +0 -0
  17. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/livekit_plugins_elevenlabs.egg-info/top_level.txt +0 -0
  18. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/pyproject.toml +0 -0
  19. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.4.dev2
3
+ Version: 0.5.0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
22
  Requires-Dist: livekit~=0.11
23
- Requires-Dist: livekit-agents~=0.6.dev1
23
+ Requires-Dist: livekit-agents[codecs]~=0.7.0
24
24
  Requires-Dist: aiohttp>=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -12,10 +12,19 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from .models import TTSEncoding, TTSModels
15
16
  from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
16
17
  from .version import __version__
17
18
 
18
- __all__ = ["TTS", "Voice", "VoiceSettings", "DEFAULT_VOICE", "__version__"]
19
+ __all__ = [
20
+ "TTS",
21
+ "Voice",
22
+ "VoiceSettings",
23
+ "TTSEncoding",
24
+ "TTSModels",
25
+ "DEFAULT_VOICE",
26
+ "__version__",
27
+ ]
19
28
 
20
29
  from livekit.agents import Plugin
21
30
 
@@ -0,0 +1,20 @@
1
+ from typing import Literal
2
+
3
# Model identifiers accepted wherever this package takes a ``model_id``.
TTSModels = Literal[
    "eleven_monolingual_v1",
    "eleven_multilingual_v1",
    "eleven_multilingual_v2",
    "eleven_turbo_v2",
]
9
+
10
# Output format names. Fields are underscore-separated: the first is the codec
# ("mp3" or "pcm") and the second is the sample rate in Hz.
# NOTE(review): the third field on the mp3 entries looks like a bitrate in
# kbps -- confirm against the ElevenLabs output_format documentation.
TTSEncoding = Literal[
    "mp3_22050_32",
    "mp3_44100_32",
    "mp3_44100_64",
    "mp3_44100_96",
    "mp3_44100_128",
    "mp3_44100_192",
    "pcm_16000",
    "pcm_22050",
    "pcm_44100",
]
@@ -0,0 +1,528 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import contextlib
20
+ import dataclasses
21
+ import json
22
+ import os
23
+ from dataclasses import dataclass
24
+ from typing import List, Literal, Optional
25
+
26
+ import aiohttp
27
+ from livekit import rtc
28
+ from livekit.agents import aio, codecs, tokenize, tts, utils
29
+
30
+ from .log import logger
31
+ from .models import (
32
+ TTSEncoding,
33
+ TTSModels,
34
+ )
35
+
36
# Codec family of an output format name (its leading field).
_Encoding = Literal[
    "mp3",
    "pcm",
]
40
+
41
+
42
+ def _sample_rate_from_format(output_format: TTSEncoding) -> int:
43
+ split = output_format.split("_") # e.g: mp3_22050_32
44
+ return int(split[1])
45
+
46
+
47
+ def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
48
+ if output_format.startswith("mp3"):
49
+ return "mp3"
50
+ elif output_format.startswith("pcm"):
51
+ return "pcm"
52
+
53
+ raise ValueError(f"Unknown format: {output_format}")
54
+
55
+
56
# Per-voice tuning values. When a Voice carries settings they are serialised
# with dataclasses.asdict() and sent as ``voice_settings`` in both the HTTP
# synthesis request and the websocket init packet.
@dataclass
class VoiceSettings:
    stability: float  # [0.0 - 1.0]
    similarity_boost: float  # [0.0 - 1.0]
    style: float | None = None  # [0.0 - 1.0]
    use_speaker_boost: bool | None = False
62
+
63
+
64
# A voice as used by this plugin. ``id`` is interpolated into the
# text-to-speech URLs; ``name`` and ``category`` are filled from the /voices
# listing and are not sent back by the code in this file.
@dataclass
class Voice:
    id: str
    name: str
    category: str
    # None means no settings object is serialised for this voice.
    settings: VoiceSettings | None = None
70
+
71
+
72
# Voice used when the TTS constructor is not given one.
DEFAULT_VOICE = Voice(
    id="EXAVITQu4vr4xnSDxMaL",
    name="Bella",
    category="premade",
    settings=VoiceSettings(
        stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
    ),
)

# Base URL used when the caller does not supply ``base_url``.
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
# Name of the HTTP header that carries the API key on every request.
AUTHORIZATION_HEADER = "xi-api-key"
83
+
84
+
85
# Resolved configuration built once by TTS.__init__ and shared with every
# ChunkedStream / SynthesizeStream that the TTS instance creates.
@dataclass
class _TTSOptions:
    api_key: str  # sent in the AUTHORIZATION_HEADER header
    voice: Voice
    model_id: TTSModels
    base_url: str
    encoding: TTSEncoding  # output format requested on the websocket endpoint
    sample_rate: int  # derived from ``encoding`` by the TTS constructor
    streaming_latency: int  # sent as ``optimize_streaming_latency``
    word_tokenizer: tokenize.WordTokenizer
    chunk_length_schedule: list[int]  # sent in the websocket init packet
96
+
97
+
98
class TTS(tts.TTS):
    """ElevenLabs text-to-speech engine.

    Offers one-shot synthesis over HTTP (``synthesize``) and incremental
    synthesis over websockets (``stream``). Both share the options resolved
    by the constructor.
    """

    def __init__(
        self,
        *,
        voice: Voice = DEFAULT_VOICE,
        model_id: TTSModels = "eleven_turbo_v2",
        api_key: str | None = None,
        base_url: str | None = None,
        encoding: TTSEncoding = "mp3_22050_32",
        streaming_latency: int = 3,
        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
            ignore_punctuation=False  # punctuation can help for intonation
        ),
        # default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
        # (range is 50-500)
        chunk_length_schedule: list[int] = [80, 120, 200, 260],
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Resolve and store the synthesis options.

        Args:
            voice: voice whose id is placed in the request URLs.
            model_id: ElevenLabs model identifier.
            api_key: API key; falls back to the ``ELEVEN_API_KEY``
                environment variable.
            base_url: API root; falls back to ``API_BASE_URL_V1``.
            encoding: output format name; its sample rate becomes the
                sample rate reported by this TTS.
            streaming_latency: sent as ``optimize_streaming_latency``.
            word_tokenizer: tokenizer used to split pushed text into words
                for the websocket stream.
            chunk_length_schedule: sent in the websocket init packet.
            http_session: session to reuse; when omitted one is obtained
                from ``utils.http_session()`` on first use.

        Raises:
            ValueError: if no API key is given and ``ELEVEN_API_KEY`` is unset.
        """
        super().__init__(
            streaming_supported=True,
            sample_rate=_sample_rate_from_format(encoding),
            num_channels=1,
        )
        api_key = api_key or os.environ.get("ELEVEN_API_KEY")
        if not api_key:
            raise ValueError("ELEVEN_API_KEY must be set")

        self._opts = _TTSOptions(
            voice=voice,
            model_id=model_id,
            api_key=api_key,
            base_url=base_url or API_BASE_URL_V1,
            encoding=encoding,
            sample_rate=self.sample_rate,
            streaming_latency=streaming_latency,
            word_tokenizer=word_tokenizer,
            # Copy: the default is a single list object shared by every call,
            # and a caller-supplied list may be mutated later by its owner.
            chunk_length_schedule=list(chunk_length_schedule),
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the HTTP session, obtaining the shared one on first use."""
        if not self._session:
            self._session = utils.http_session()

        return self._session

    async def list_voices(self) -> List[Voice]:
        """Fetch the voices available to the configured API key.

        Raises:
            aiohttp.ClientResponseError: if the API answers with an HTTP
                error status (e.g. an invalid key).
        """
        async with self._ensure_session().get(
            f"{self._opts.base_url}/voices",
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
        ) as resp:
            # An error body has no "voices" key; fail with the HTTP status
            # instead of an unrelated KeyError while parsing.
            resp.raise_for_status()
            return _dict_to_voices_list(await resp.json())

    def synthesize(
        self,
        text: str,
    ) -> "ChunkedStream":
        """Return a stream that synthesizes ``text`` in a single HTTP request."""
        return ChunkedStream(text, self._opts, self._ensure_session())

    def stream(
        self,
    ) -> "SynthesizeStream":
        """Return a websocket-backed stream that accepts text incrementally."""
        return SynthesizeStream(self._ensure_session(), self._opts)
161
+
162
+
163
class ChunkedStream(tts.ChunkedStream):
    """Synthesize using the chunked api endpoint

    The request always asks for raw PCM at the configured sample rate,
    whatever codec ``encoding`` names, because the response body is cut
    into 16-bit frames without any decoding step.
    """

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        self._opts = opts
        self._text = text
        self._session = session
        # Created lazily by the first __anext__ call.
        self._task: asyncio.Task | None = None
        # ``None`` is the end-of-stream sentinel.
        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()

    def _synthesize_url(self) -> str:
        """Build the streaming text-to-speech URL for the configured voice."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        sample_rate = _sample_rate_from_format(self._opts.encoding)
        latency = self._opts.streaming_latency
        url = (
            f"{base_url}/text-to-speech/{voice_id}/stream?"
            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
        )
        return url

    async def _main_task(self):
        """Run the request; always terminate the queue, even on failure."""
        try:
            await self._run()
        except Exception:
            logger.exception("11labs main task failed in chunked stream")
        finally:
            self._queue.put_nowait(None)

    def _push_frame(self, pcm: bytearray) -> None:
        """Queue ``pcm`` (16-bit mono samples) as one synthesized audio item."""
        self._queue.put_nowait(
            tts.SynthesizedAudio(
                text=self._text,
                data=rtc.AudioFrame(
                    data=pcm,
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    samples_per_channel=len(pcm) // 2,
                ),
            )
        )

    async def _run(self) -> None:
        """POST the text and queue the PCM response as 10 ms frames.

        Raises:
            aiohttp.ClientResponseError: if the API answers with an HTTP
                error status; without this check the error body would be
                framed and emitted as if it were audio.
        """
        async with self._session.post(
            self._synthesize_url(),
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
            json=dict(
                text=self._text,
                model_id=self._opts.model_id,
                voice_settings=(
                    dataclasses.asdict(self._opts.voice.settings)
                    if self._opts.voice.settings
                    else None
                ),
            ),
        ) as resp:
            resp.raise_for_status()

            # avoid very small frames. chunk by 10ms 16bits
            bytes_per_frame = (self._opts.sample_rate // 100) * 2
            buf = bytearray()
            async for data, _ in resp.content.iter_chunks():
                buf.extend(data)

                # Walk the buffer with an offset and trim it once per chunk,
                # rather than re-copying the tail after every frame.
                offset = 0
                while len(buf) - offset >= bytes_per_frame:
                    self._push_frame(buf[offset : offset + bytes_per_frame])
                    offset += bytes_per_frame
                del buf[:offset]

            # send any remaining data
            if len(buf) > 0:
                self._push_frame(buf)

    async def __anext__(self) -> tts.SynthesizedAudio:
        """Return the next audio item, starting the request on first use."""
        if not self._task:
            self._task = asyncio.create_task(self._main_task())

        frame = await self._queue.get()
        if frame is None:
            raise StopAsyncIteration

        return frame

    async def aclose(self) -> None:
        """Cancel the request task, if one was started, and wait for it."""
        if not self._task:
            return

        self._task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await self._task
262
+
263
+
264
class SynthesizeStream(tts.SynthesizeStream):
    """Streamed API using websockets

    Pushed text is split into words by the configured tokenizer. Each
    tokenizer segment gets its own websocket connection; events from the
    connections are forwarded to the consumer in segment order.
    """

    @dataclass
    class _SegmentConnection:
        # Receiving end of the channel the segment's websocket task fills.
        audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
        # Task running _run_ws() for the segment.
        task: asyncio.Task

    def __init__(
        self,
        session: aiohttp.ClientSession,
        opts: _TTSOptions,
        max_retry_per_segment: int = 3,
    ):
        self._opts = opts
        self._session = session
        # ``None`` is the end-of-stream sentinel.
        self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
        self._closed = False
        self._word_stream = opts.word_tokenizer.stream()
        # Started last so every attribute _run() reads already exists.
        self._main_task = asyncio.create_task(self._run(max_retry_per_segment))

    def _stream_url(self) -> str:
        """Build the websocket (stream-input) URL for the configured voice."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        output_format = self._opts.encoding
        latency = self._opts.streaming_latency
        url = (
            f"{base_url}/text-to-speech/{voice_id}/stream-input?"
            f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
        )

        return url

    def push_text(self, token: str | None) -> None:
        """Feed text to the stream; ``None`` marks the end of a segment.

        Raises:
            ValueError: if the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push to a closed stream")

        if token is None:
            self._word_stream.mark_segment_end()
            return

        self._word_stream.push_text(token)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream.

        Args:
            wait: when true, wait for the main task to finish on its own;
                when false, cancel it first.
        """
        self._closed = True
        await self._word_stream.aclose()

        if not wait:
            self._main_task.cancel()

        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self, max_retry_per_segment: int) -> None:
        """Drive the stream until the word stream ends or the task is cancelled.

        Whatever the exit path, leftover tasks are reaped and the
        end-of-stream sentinel is queued so a consumer blocked in
        ``__anext__`` is released.
        """
        conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
        # Every per-segment websocket task, so none outlives this coroutine.
        segment_tasks: list[asyncio.Task] = []

        async def _forward_events() -> None:
            """forward events from the ws connections to the event queue.
            This is used to keep the right order."""
            while True:
                c = await conns_q.get()
                if c is None:
                    break  # no more segment, stream closed

                self._event_queue.put_nowait(
                    tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
                )

                async for frame in c.audio_rx:
                    self._event_queue.put_nowait(
                        tts.SynthesisEvent(
                            type=tts.SynthesisEventType.AUDIO, audio=frame
                        )
                    )

                self._event_queue.put_nowait(
                    tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
                )

        async def _read_tokens() -> None:
            """read tokens from the word stream and create connections for each segment,
            (this also allows concurrent connections to 11labs)"""

            token_tx: aio.ChanSender[str] | None = None
            async for ev in self._word_stream:
                if ev.type == tokenize.TokenEventType.STARTED:
                    token_tx, token_rx = aio.channel()
                    audio_tx: aio.ChanSender[tts.SynthesizedAudio]
                    audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
                    audio_tx, audio_rx = aio.channel()
                    task = asyncio.create_task(
                        self._run_ws(max_retry_per_segment, audio_tx, token_rx)
                    )
                    segment_tasks.append(task)
                    conns_q.put_nowait(
                        SynthesizeStream._SegmentConnection(audio_rx, task)
                    )
                elif ev.type == tokenize.TokenEventType.TOKEN:
                    assert token_tx is not None
                    token_tx.send_nowait(ev.token)
                elif ev.type == tokenize.TokenEventType.FINISHED:
                    assert token_tx is not None
                    token_tx.close()
                    token_tx = None

            # NOTE(review): if the word stream can end without a FINISHED
            # event, the open segment's sender would otherwise never see
            # end-of-input -- confirm the tokenizer contract.
            if token_tx is not None:
                token_tx.close()

            conns_q.put_nowait(None)

        workers = [
            asyncio.create_task(_forward_events()),
            asyncio.create_task(_read_tokens()),
        ]
        clean_exit = False
        try:
            await asyncio.gather(*workers)
            clean_exit = True
        except Exception:
            logger.exception("11labs task failed")
        finally:
            leftovers = workers + segment_tasks
            if not clean_exit:
                # Failure or cancellation: nothing will consume further
                # output, so stop whatever is still running.
                for task in leftovers:
                    task.cancel()
            try:
                # On a clean exit this only waits for segment tasks to finish
                # closing their sockets.
                await asyncio.gather(*leftovers, return_exceptions=True)
            finally:
                self._event_queue.put_nowait(None)

    async def _connect_ws(
        self, max_retry: int
    ) -> aiohttp.ClientWebSocketResponse | None:
        """Open a websocket and send the init packet, retrying on failure.

        Returns:
            The initialised connection, or ``None`` once ``max_retry``
            attempts have failed (the failure is logged).
        """
        for try_i in range(max_retry):
            ws_conn: aiohttp.ClientWebSocketResponse | None = None
            try:
                ws_conn = await self._session.ws_connect(
                    self._stream_url(),
                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
                )

                voice_settings = None
                if self._opts.voice.settings is not None:
                    voice_settings = dataclasses.asdict(self._opts.voice.settings)

                init_pkt = dict(
                    text=" ",
                    try_trigger_generation=True,
                    voice_settings=voice_settings,
                    generation_config=dict(
                        chunk_length_schedule=self._opts.chunk_length_schedule,
                    ),
                )
                await ws_conn.send_str(json.dumps(init_pkt))
                # Success: stop retrying. Without this the loop would open
                # another socket on every remaining iteration.
                return ws_conn
            except Exception:
                if ws_conn is not None:
                    # Connected but the init packet failed: don't leak it.
                    with contextlib.suppress(Exception):
                        await ws_conn.close()

                if try_i + 1 == max_retry:
                    logger.exception(
                        f"failed to connect to 11labs after {max_retry} retries"
                    )
                    return None

                retry_delay = min(try_i * 5, 5)  # max 5s
                logger.warning(
                    f"failed to connect to 11labs, retrying in {retry_delay}s"
                )
                await asyncio.sleep(retry_delay)

        return None

    async def _run_ws(
        self,
        max_retry: int,
        audio_tx: aio.ChanSender[tts.SynthesizedAudio],
        token_rx: aio.ChanReceiver[str],
    ) -> None:
        """Synthesize one segment over its own websocket connection.

        Tokens read from ``token_rx`` are sent to 11labs and decoded audio is
        written to ``audio_tx``. ``audio_tx`` is closed on every exit path,
        including a failed connect, because the forwarder iterates the
        matching receiver until it is closed.
        """
        ws_conn: aiohttp.ClientWebSocketResponse | None = None
        try:
            # try to connect to 11labs
            ws_conn = await self._connect_ws(max_retry)
            if ws_conn is None:
                return

            all_tokens_consumed = False

            async def send_task():
                nonlocal all_tokens_consumed

                async for token in token_rx:
                    if token == "":
                        continue  # empty token is closing the stream in 11labs protocol

                    # try_trigger_generation=True is a bad practice, we expose
                    # chunk_length_schedule instead
                    data_pkt = dict(
                        text=f"{token} ",  # must always end with a space
                        try_trigger_generation=False,
                    )
                    await ws_conn.send_str(json.dumps(data_pkt))

                # no more token, mark eos
                flush_pkt = dict(
                    text="",
                )
                await ws_conn.send_str(json.dumps(flush_pkt))

                all_tokens_consumed = True

            async def recv_task():
                encoding = _encoding_from_format(self._opts.encoding)
                mp3_decoder = codecs.Mp3StreamDecoder()
                while True:
                    msg = await ws_conn.receive()
                    if msg.type in (
                        aiohttp.WSMsgType.CLOSED,
                        aiohttp.WSMsgType.CLOSE,
                        aiohttp.WSMsgType.CLOSING,
                    ):
                        if all_tokens_consumed:
                            return  # close is expected

                        raise Exception(
                            "11labs connection closed unexpectedly, not all tokens have been consumed"
                        )

                    if msg.type != aiohttp.WSMsgType.TEXT:
                        # audio frames are serialized in base64..
                        logger.warning("unexpected 11labs message type %s", msg.type)
                        continue

                    data: dict = json.loads(msg.data)
                    audio = data.get("audio")

                    if data.get("error"):
                        logger.error("11labs error %s", data)
                        return
                    elif audio is not None:
                        if audio == "":
                            # 11labs sometimes sends empty audio, ignore
                            continue

                        b64data = base64.b64decode(audio)
                        frame: rtc.AudioFrame
                        if encoding == "mp3":
                            frames = mp3_decoder.decode_chunk(b64data)
                            frame = utils.merge_frames(frames)
                        else:
                            frame = rtc.AudioFrame(
                                data=b64data,
                                sample_rate=self._opts.sample_rate,
                                num_channels=1,
                                samples_per_channel=len(b64data) // 2,
                            )

                        text = ""
                        if data.get("alignment"):
                            text = "".join(data["alignment"].get("chars", ""))

                        audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
                        continue
                    elif data.get("isFinal"):
                        return  # last message

                    logger.error("unexpected 11labs message %s", data)

            io_tasks = [
                asyncio.create_task(send_task()),
                asyncio.create_task(recv_task()),
            ]
            try:
                await asyncio.gather(*io_tasks)
            except Exception:
                logger.exception("11labs ws connection failed")
            finally:
                # If one side failed (or we were cancelled) the other must not
                # keep running against a dead connection. Cancelling a
                # finished task is a no-op.
                for io_task in io_tasks:
                    io_task.cancel()
                await asyncio.gather(*io_tasks, return_exceptions=True)
        finally:
            # Close the channel first so the forwarder can emit FINISHED
            # without waiting for the websocket close handshake.
            audio_tx.close()
            if ws_conn is not None:
                with contextlib.suppress(Exception):
                    await ws_conn.close()

    async def __anext__(self) -> tts.SynthesisEvent:
        """Return the next synthesis event; stop at the ``None`` sentinel."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
515
+
516
+
517
def _dict_to_voices_list(data: dict) -> List[Voice]:
    """Convert a decoded ``/voices`` response into ``Voice`` objects.

    Reads ``voice_id``, ``name`` and ``category`` from each entry under the
    ``"voices"`` key. ``settings`` is always left as ``None``.
    """
    return [
        Voice(
            id=entry["voice_id"],
            name=entry["name"],
            category=entry["category"],
            settings=None,
        )
        for entry in data["voices"]
    ]
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.4.dev2"
15
+ __version__ = "0.5.0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.4.dev2
3
+ Version: 0.5.0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
22
  Requires-Dist: livekit~=0.11
23
- Requires-Dist: livekit-agents~=0.6.dev1
23
+ Requires-Dist: livekit-agents[codecs]~=0.7.0
24
24
  Requires-Dist: aiohttp>=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -0,0 +1,3 @@
1
+ livekit~=0.11
2
+ livekit-agents[codecs]~=0.7.0
3
+ aiohttp>=3.8.5
@@ -51,7 +51,7 @@ setuptools.setup(
51
51
  python_requires=">=3.9.0",
52
52
  install_requires=[
53
53
  "livekit ~= 0.11",
54
- "livekit-agents~=0.6.dev1",
54
+ "livekit-agents[codecs]~=0.7.0",
55
55
  "aiohttp >= 3.8.5",
56
56
  ],
57
57
  package_data={
@@ -1,8 +0,0 @@
1
- from typing import Literal
2
-
3
- TTSModels = Literal[
4
- "eleven_monolingual_v1",
5
- "eleven_multilingual_v1",
6
- "eleven_multilingual_v2",
7
- "eleven_turbo_v2",
8
- ]
@@ -1,392 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import asyncio
16
- import base64
17
- import contextlib
18
- import dataclasses
19
- import json
20
- import os
21
- from dataclasses import dataclass
22
- from typing import AsyncIterable, List
23
-
24
- import aiohttp
25
- from livekit import rtc
26
- from livekit.agents import aio, tts
27
-
28
- from .log import logger
29
- from .models import TTSModels
30
-
31
-
32
- @dataclass
33
- class VoiceSettings:
34
- stability: float # [0.0 - 1.0]
35
- similarity_boost: float # [0.0 - 1.0]
36
- style: float | None = None # [0.0 - 1.0]
37
- use_speaker_boost: bool | None = False
38
-
39
-
40
- @dataclass
41
- class Voice:
42
- id: str
43
- name: str
44
- category: str
45
- settings: VoiceSettings | None = None
46
-
47
-
48
- DEFAULT_VOICE = Voice(
49
- id="EXAVITQu4vr4xnSDxMaL",
50
- name="Bella",
51
- category="premade",
52
- settings=VoiceSettings(
53
- stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
54
- ),
55
- )
56
-
57
- API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
58
- AUTHORIZATION_HEADER = "xi-api-key"
59
-
60
-
61
- @dataclass
62
- class TTSOptions:
63
- api_key: str
64
- voice: Voice
65
- model_id: TTSModels
66
- base_url: str
67
- sample_rate: int
68
- latency: int
69
-
70
-
71
- class TTS(tts.TTS):
72
- def __init__(
73
- self,
74
- *,
75
- voice: Voice = DEFAULT_VOICE,
76
- model_id: TTSModels = "eleven_turbo_v2",
77
- api_key: str | None = None,
78
- base_url: str | None = None,
79
- sample_rate: int = 24000,
80
- latency: int = 3,
81
- ) -> None:
82
- super().__init__(
83
- streaming_supported=True, sample_rate=sample_rate, num_channels=1
84
- )
85
- api_key = api_key or os.environ.get("ELEVEN_API_KEY")
86
- if not api_key:
87
- raise ValueError("ELEVEN_API_KEY must be set")
88
-
89
- self._session = aiohttp.ClientSession()
90
- self._opts = TTSOptions(
91
- voice=voice,
92
- model_id=model_id,
93
- api_key=api_key,
94
- base_url=base_url or API_BASE_URL_V1,
95
- sample_rate=sample_rate,
96
- latency=latency,
97
- )
98
-
99
- async def list_voices(self) -> List[Voice]:
100
- async with self._session.get(
101
- f"{self._opts.base_url}/voices",
102
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
103
- ) as resp:
104
- data = await resp.json()
105
- return dict_to_voices_list(data)
106
-
107
- def synthesize(
108
- self,
109
- text: str,
110
- ) -> AsyncIterable[tts.SynthesizedAudio]:
111
- voice = self._opts.voice
112
- url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
113
-
114
- async def generator():
115
- try:
116
- async with self._session.post(
117
- url,
118
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
119
- json=dict(
120
- text=text,
121
- model_id=self._opts.model_id,
122
- voice_settings=dataclasses.asdict(voice.settings)
123
- if voice.settings
124
- else None,
125
- ),
126
- ) as resp:
127
- data = await resp.read()
128
- yield tts.SynthesizedAudio(
129
- text=text,
130
- data=rtc.AudioFrame(
131
- data=data,
132
- sample_rate=self._opts.sample_rate,
133
- num_channels=1,
134
- samples_per_channel=len(data) // 2, # 16-bit
135
- ),
136
- )
137
- except Exception as e:
138
- logger.error(f"failed to synthesize: {e}")
139
-
140
- return generator()
141
-
142
- def stream(
143
- self,
144
- ) -> "SynthesizeStream":
145
- return SynthesizeStream(self._session, self._opts)
146
-
147
-
148
- class SynthesizeStream(tts.SynthesizeStream):
149
- _STREAM_EOS = ""
150
-
151
- def __init__(
152
- self,
153
- session: aiohttp.ClientSession,
154
- opts: TTSOptions,
155
- max_retry: int = 32,
156
- ):
157
- self._opts = opts
158
- self._session = session
159
-
160
- self._queue = asyncio.Queue[str | None]()
161
- self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
162
- self._closed = False
163
- self._text = ""
164
-
165
- self._main_task = asyncio.create_task(self._run(max_retry))
166
-
167
- def _stream_url(self) -> str:
168
- base_url = self._opts.base_url
169
- voice_id = self._opts.voice.id
170
- model_id = self._opts.model_id
171
- sample_rate = self._opts.sample_rate
172
- latency = self._opts.latency
173
- return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
174
-
175
- def push_text(self, token: str | None) -> None:
176
- if self._closed:
177
- raise ValueError("cannot push to a closed stream")
178
-
179
- if token is None:
180
- self._flush_if_needed()
181
- return
182
-
183
- if len(token) == 0:
184
- # 11labs marks the EOS with an empty string, avoid users from pushing empty strings
185
- return
186
-
187
- # TODO: Naive word boundary detection may not be good enough for all languages
188
- # fmt: off
189
- splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
190
- # fmt: on
191
-
192
- self._text += token
193
-
194
- while True:
195
- last_split = -1
196
- for i, c in enumerate(self._text):
197
- if c in splitters:
198
- last_split = i
199
- break
200
-
201
- if last_split == -1:
202
- break
203
-
204
- seg = self._text[: last_split + 1]
205
- seg = seg.strip() + " " # 11labs expects a space at the end
206
- self._queue.put_nowait(seg)
207
- self._text = self._text[last_split + 1 :]
208
-
209
- async def aclose(self, *, wait: bool = True) -> None:
210
- self._flush_if_needed()
211
- self._queue.put_nowait(None)
212
- self._closed = True
213
-
214
- if not wait:
215
- self._main_task.cancel()
216
-
217
- with contextlib.suppress(asyncio.CancelledError):
218
- await self._main_task
219
-
220
- def _flush_if_needed(self) -> None:
221
- seg = self._text.strip()
222
- if len(seg) > 0:
223
- self._queue.put_nowait(seg + " ")
224
-
225
- self._text = ""
226
- self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
227
-
228
- async def _run(self, max_retry: int) -> None:
229
- retry_count = 0
230
- ws: aiohttp.ClientWebSocketResponse | None = None
231
- ws_task: asyncio.Task | None = None
232
- data_tx: aio.ChanSender[str] | None = None
233
-
234
- try:
235
- while True:
236
- ws_connected = ws is not None and not ws.closed
237
- try:
238
- data = await self._queue.get()
239
-
240
- if data is None:
241
- if ws_task is not None:
242
- await ws_task
243
- break
244
-
245
- if not ws_connected:
246
- if data == SynthesizeStream._STREAM_EOS:
247
- continue
248
-
249
- with contextlib.suppress(asyncio.CancelledError):
250
- if ws_task is not None:
251
- await ws_task
252
-
253
- ws = await self._session.ws_connect(
254
- self._stream_url(),
255
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
256
- )
257
- data_tx, data_rx = aio.channel()
258
- ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
259
-
260
- assert data_tx is not None
261
- assert ws_task is not None
262
- assert ws is not None
263
-
264
- data_tx.send_nowait(data)
265
-
266
- except Exception:
267
- if retry_count >= max_retry:
268
- logger.exception(
269
- f"failed to connect to 11labs after {max_retry} retries"
270
- )
271
- break
272
-
273
- retry_delay = min(retry_count * 5, 5) # max 5s
274
- retry_count += 1
275
-
276
- logger.warning(
277
- f"failed to connect to 11labs, retrying in {retry_delay}s"
278
- )
279
- await asyncio.sleep(retry_delay)
280
-
281
- except Exception:
282
- logger.exception("11labs task failed")
283
- finally:
284
- with contextlib.suppress(asyncio.CancelledError):
285
- if ws_task is not None:
286
- ws_task.cancel()
287
- await ws_task
288
-
289
- self._event_queue.put_nowait(None)
290
-
291
- async def _run_ws(
292
- self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
293
- ) -> None:
294
- closing_ws = False
295
-
296
- self._event_queue.put_nowait(
297
- tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
298
- )
299
-
300
- async def send_task():
301
- nonlocal closing_ws
302
-
303
- # 11labs stream must be initialized with a space
304
- voice = self._opts.voice
305
- voice_settings = (
306
- dataclasses.asdict(voice.settings) if voice.settings else None
307
- )
308
- init_pkt = dict(
309
- text=" ",
310
- voice_settings=voice_settings,
311
- )
312
- await ws.send_str(json.dumps(init_pkt))
313
-
314
- while True:
315
- data = await data_rx.recv()
316
- data_pkt = dict(
317
- text=data,
318
- try_trigger_generation=True,
319
- )
320
- if data == SynthesizeStream._STREAM_EOS:
321
- closing_ws = True
322
-
323
- await ws.send_str(json.dumps(data_pkt))
324
-
325
- if closing_ws:
326
- return
327
-
328
- async def recv_task():
329
- nonlocal closing_ws
330
- while True:
331
- msg = await ws.receive()
332
- if msg.type in (
333
- aiohttp.WSMsgType.CLOSED,
334
- aiohttp.WSMsgType.CLOSE,
335
- aiohttp.WSMsgType.CLOSING,
336
- ):
337
- if closing_ws: # close is expected
338
- return
339
-
340
- raise Exception("11labs connection closed unexpectedly")
341
-
342
- if msg.type != aiohttp.WSMsgType.TEXT:
343
- logger.warning("unexpected 11labs message type %s", msg.type)
344
- continue
345
-
346
- data: dict = json.loads(msg.data)
347
- if data.get("audio"):
348
- b64data = base64.b64decode(data["audio"])
349
- frame = rtc.AudioFrame(
350
- data=b64data,
351
- sample_rate=self._opts.sample_rate,
352
- num_channels=1,
353
- samples_per_channel=len(b64data) // 2,
354
- )
355
- self._event_queue.put_nowait(
356
- tts.SynthesisEvent(
357
- type=tts.SynthesisEventType.AUDIO,
358
- audio=tts.SynthesizedAudio(text="", data=frame),
359
- )
360
- )
361
- elif data.get("isFinal"):
362
- return
363
-
364
- try:
365
- await asyncio.gather(send_task(), recv_task())
366
- except Exception:
367
- logger.exception("11labs connection failed")
368
- finally:
369
- self._event_queue.put_nowait(
370
- tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
371
- )
372
-
373
- async def __anext__(self) -> tts.SynthesisEvent:
374
- evt = await self._event_queue.get()
375
- if evt is None:
376
- raise StopAsyncIteration
377
-
378
- return evt
379
-
380
-
381
- def dict_to_voices_list(data: dict) -> List[Voice]:
382
- voices = []
383
- for voice in data["voices"]:
384
- voices.append(
385
- Voice(
386
- id=voice["voice_id"],
387
- name=voice["name"],
388
- category=voice["category"],
389
- settings=None,
390
- )
391
- )
392
- return voices
@@ -1,3 +0,0 @@
1
- livekit~=0.11
2
- livekit-agents~=0.6.dev1
3
- aiohttp>=3.8.5