livekit-plugins-elevenlabs 0.4.dev2__tar.gz → 0.5.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (17)
  1. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/PKG-INFO +2 -2
  2. livekit_plugins_elevenlabs-0.5.dev0/livekit/plugins/elevenlabs/tts.py +487 -0
  3. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/version.py +1 -1
  4. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/PKG-INFO +2 -2
  5. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/requires.txt +1 -1
  6. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/setup.py +1 -1
  7. livekit_plugins_elevenlabs-0.4.dev2/livekit/plugins/elevenlabs/tts.py +0 -392
  8. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/README.md +0 -0
  9. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/__init__.py +0 -0
  10. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/log.py +0 -0
  11. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/models.py +0 -0
  12. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit/plugins/elevenlabs/py.typed +0 -0
  13. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/SOURCES.txt +0 -0
  14. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/dependency_links.txt +0 -0
  15. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/livekit_plugins_elevenlabs.egg-info/top_level.txt +0 -0
  16. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/pyproject.toml +0 -0
  17. {livekit_plugins_elevenlabs-0.4.dev2 → livekit_plugins_elevenlabs-0.5.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.4.dev2
3
+ Version: 0.5.dev0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
22
  Requires-Dist: livekit~=0.11
23
- Requires-Dist: livekit-agents~=0.6.dev1
23
+ Requires-Dist: livekit-agents~=0.7.dev0
24
24
  Requires-Dist: aiohttp>=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -0,0 +1,487 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import base64
19
+ import contextlib
20
+ import dataclasses
21
+ import json
22
+ import os
23
+ from dataclasses import dataclass
24
+ from typing import List, Optional
25
+
26
+ import aiohttp
27
+ from livekit import rtc
28
+ from livekit.agents import aio, tokenize, tts, utils
29
+
30
+ from .log import logger
31
+ from .models import TTSModels
32
+
33
+
34
@dataclass
class VoiceSettings:
    """Per-voice synthesis parameters.

    Instances are converted with ``dataclasses.asdict`` and sent to the API
    as the ``voice_settings`` payload field.
    """

    # expected range: [0.0 - 1.0]
    stability: float
    # expected range: [0.0 - 1.0]
    similarity_boost: float
    # expected range: [0.0 - 1.0]; left unset (None) by default
    style: float | None = None
    use_speaker_boost: bool | None = False
40
+
41
+
42
@dataclass
class Voice:
    """Description of one ElevenLabs voice."""

    # identifier interpolated into the text-to-speech endpoint URLs
    id: str
    name: str
    category: str
    # optional synthesis parameters; None means no voice_settings are sent
    settings: VoiceSettings | None = None
48
+
49
+
50
# Voice used by TTS() when the caller does not provide one.
DEFAULT_VOICE = Voice(
    id="EXAVITQu4vr4xnSDxMaL",
    name="Bella",
    category="premade",
    settings=VoiceSettings(
        stability=0.71,
        similarity_boost=0.5,
        style=0.0,
        use_speaker_boost=True,
    ),
)

# Base URL used for every request when no explicit base_url is configured.
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
# Name of the request header that carries the API key.
AUTHORIZATION_HEADER = "xi-api-key"
61
+
62
+
63
@dataclass
class _TTSOptions:
    """Resolved configuration shared by TTS and the streams it creates."""

    # sent in the AUTHORIZATION_HEADER header of every request
    api_key: str
    voice: Voice
    model_id: TTSModels
    base_url: str
    # also selects the requested output format (pcm_<sample_rate>)
    sample_rate: int
    # forwarded as the optimize_streaming_latency query parameter
    streaming_latency: int
    # splits pushed text into the tokens sent over the websocket
    word_tokenizer: tokenize.WordTokenizer
    # forwarded as generation_config.chunk_length_schedule
    chunk_length_schedule: list[int]
73
+
74
+
75
class TTS(tts.TTS):
    """ElevenLabs text-to-speech entry point.

    Holds the resolved options and (lazily) an HTTP session, and hands out
    ``ChunkedStream`` (HTTP) and ``SynthesizeStream`` (websocket) objects.
    """

    def __init__(
        self,
        *,
        voice: Voice = DEFAULT_VOICE,
        model_id: TTSModels = "eleven_turbo_v2",
        api_key: str | None = None,
        base_url: str | None = None,
        sample_rate: int = 24000,
        streaming_latency: int = 3,
        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
            ignore_punctuation=False  # punctuation can help for intonation
        ),
        # default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
        # (range is 50-500)
        chunk_length_schedule: list[int] = [80, 120, 200, 260],
        http_session: aiohttp.ClientSession | None = None,
    ) -> None:
        """Create a TTS instance.

        Args:
            voice: voice to synthesize with.
            model_id: ElevenLabs model identifier.
            api_key: API key; falls back to the ``ELEVEN_API_KEY``
                environment variable when omitted.
            base_url: API root; defaults to ``API_BASE_URL_V1``.
            sample_rate: PCM sample rate requested from the API.
            streaming_latency: value of ``optimize_streaming_latency``.
            word_tokenizer: tokenizer used by the websocket stream.
            chunk_length_schedule: generation chunk schedule for the
                websocket stream.
            http_session: session to use; one is obtained on first use
                when omitted.

        Raises:
            ValueError: if no API key is given and ``ELEVEN_API_KEY`` is unset.
        """
        super().__init__(
            streaming_supported=True, sample_rate=sample_rate, num_channels=1
        )
        api_key = api_key or os.environ.get("ELEVEN_API_KEY")
        if not api_key:
            raise ValueError("ELEVEN_API_KEY must be set")

        self._opts = _TTSOptions(
            voice=voice,
            model_id=model_id,
            api_key=api_key,
            base_url=base_url or API_BASE_URL_V1,
            sample_rate=sample_rate,
            streaming_latency=streaming_latency,
            word_tokenizer=word_tokenizer,
            # copy: the default list object is shared by every call, so storing
            # it by reference would let one instance's mutation leak into others
            chunk_length_schedule=list(chunk_length_schedule),
        )
        self._session = http_session

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the configured session, obtaining one on first use."""
        if not self._session:
            # NOTE(review): presumably a session managed by livekit.agents - confirm
            self._session = utils.http_session()

        return self._session

    async def list_voices(self) -> List[Voice]:
        """Fetch the voices available to this API key.

        Raises:
            aiohttp.ClientResponseError: if the API answers with an HTTP
                error status.
        """
        async with self._ensure_session().get(
            f"{self._opts.base_url}/voices",
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
        ) as resp:
            # fail with the HTTP error instead of a KeyError on the error payload
            resp.raise_for_status()
            return _dict_to_voices_list(await resp.json())

    def synthesize(
        self,
        text: str,
    ) -> "ChunkedStream":
        """Return a stream that synthesizes ``text`` with one HTTP request."""
        return ChunkedStream(text, self._opts, self._ensure_session())

    def stream(
        self,
    ) -> "SynthesizeStream":
        """Return a websocket-backed stream that accepts text incrementally."""
        return SynthesizeStream(self._ensure_session(), self._opts)
135
+
136
+
137
class ChunkedStream(tts.ChunkedStream):
    """Synthesize using the chunked api endpoint"""

    def __init__(
        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
    ) -> None:
        self._opts = opts
        self._text = text
        self._session = session
        # created lazily by the first __anext__ call
        self._task: asyncio.Task | None = None
        # None is the end-of-stream sentinel
        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()

    def _synthesize_url(self) -> str:
        """Build the streaming text-to-speech URL for the configured voice."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        sample_rate = self._opts.sample_rate
        latency = self._opts.streaming_latency
        url = (
            f"{base_url}/text-to-speech/{voice_id}/stream?"
            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
        )
        return url

    async def _main_task(self):
        """Run the request; always terminate the queue, even on failure."""
        try:
            await self._run()
        except Exception:
            logger.exception("11labs main task failed in chunked stream")
        finally:
            self._queue.put_nowait(None)

    def _emit_frame(self, pcm: bytearray) -> None:
        """Queue ``pcm`` (16-bit mono samples) as one synthesized audio frame."""
        self._queue.put_nowait(
            tts.SynthesizedAudio(
                text=self._text,
                data=rtc.AudioFrame(
                    data=pcm,
                    sample_rate=self._opts.sample_rate,
                    num_channels=1,
                    samples_per_channel=len(pcm) // 2,
                ),
            )
        )

    async def _run(self) -> None:
        """POST the text and re-chunk the streamed PCM into 10ms frames.

        Raises:
            aiohttp.ClientResponseError: if the API answers with an HTTP
                error status (logged by ``_main_task``).
        """
        async with self._session.post(
            self._synthesize_url(),
            headers={AUTHORIZATION_HEADER: self._opts.api_key},
            json=dict(
                text=self._text,
                model_id=self._opts.model_id,
                voice_settings=(
                    dataclasses.asdict(self._opts.voice.settings)
                    if self._opts.voice.settings
                    else None
                ),
            ),
        ) as resp:
            # without this an error body (e.g. a JSON 401 payload) would be
            # chunked and emitted as if it were PCM audio
            resp.raise_for_status()

            # avoid very small frames. chunk by 10ms 16bits
            bytes_per_frame = (self._opts.sample_rate // 100) * 2
            buf = bytearray()
            async for data, _ in resp.content.iter_chunks():
                buf.extend(data)

                while len(buf) >= bytes_per_frame:
                    self._emit_frame(buf[:bytes_per_frame])
                    # trim in place instead of rebinding to a fresh copy
                    del buf[:bytes_per_frame]

            # send any remaining data
            if buf:
                self._emit_frame(buf)

    async def __anext__(self) -> tts.SynthesizedAudio:
        """Return the next audio frame, starting the request on first use."""
        if not self._task:
            self._task = asyncio.create_task(self._main_task())

        frame = await self._queue.get()
        if frame is None:
            raise StopAsyncIteration

        return frame

    async def aclose(self) -> None:
        """Cancel the request task, if one was started, and wait for it."""
        if not self._task:
            return

        self._task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await self._task
236
+
237
+
238
class SynthesizeStream(tts.SynthesizeStream):
    """Streamed API using websockets

    Pushed text is tokenized into segments; each segment gets its own
    websocket connection, and events are forwarded in segment order.
    """

    @dataclass
    class _SegmentConnection:
        # receives the audio produced by the segment's websocket
        audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
        # the _run_ws task serving the segment
        task: asyncio.Task

    def __init__(
        self,
        session: aiohttp.ClientSession,
        opts: _TTSOptions,
        max_retry_per_segment: int = 3,
    ):
        self._opts = opts
        self._session = session
        # None is the end-of-stream sentinel
        self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
        self._closed = False
        self._word_stream = opts.word_tokenizer.stream()
        # created last so every attribute used by _run() already exists
        self._main_task = asyncio.create_task(self._run(max_retry_per_segment))

    def _stream_url(self) -> str:
        """Build the websocket (stream-input) URL for the configured voice."""
        base_url = self._opts.base_url
        voice_id = self._opts.voice.id
        model_id = self._opts.model_id
        sample_rate = self._opts.sample_rate
        latency = self._opts.streaming_latency
        url = (
            f"{base_url}/text-to-speech/{voice_id}/stream-input?"
            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
        )

        return url

    def push_text(self, token: str | None) -> None:
        """Feed text to the tokenizer; ``None`` marks the end of a segment.

        Raises:
            ValueError: if the stream has already been closed.
        """
        if self._closed:
            raise ValueError("cannot push to a closed stream")

        if token is None:
            self._word_stream.mark_segment_end()
            return

        self._word_stream.push_text(token)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream; with ``wait=False`` the main task is cancelled."""
        self._closed = True
        await self._word_stream.aclose()

        if not wait:
            self._main_task.cancel()

        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self, max_retry_per_segment: int) -> None:
        """Drive the stream: open one connection per segment, forward events."""
        conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
        segment_tasks: list[asyncio.Task] = []

        async def _forward_events() -> None:
            """forward events from the ws connections to the event queue.
            This is used to keep the right order."""
            while True:
                c = await conns_q.get()
                if c is None:
                    break  # no more segment, stream closed

                self._event_queue.put_nowait(
                    tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
                )

                async for frame in c.audio_rx:
                    self._event_queue.put_nowait(
                        tts.SynthesisEvent(
                            type=tts.SynthesisEventType.AUDIO, audio=frame
                        )
                    )

                self._event_queue.put_nowait(
                    tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
                )

        async def _read_tokens() -> None:
            """read tokens from the word stream and create connections for each segment,
            (this also allows concurrent connections to 11labs)"""

            token_tx: aio.ChanSender[str] | None = None
            async for ev in self._word_stream:
                if ev.type == tokenize.TokenEventType.STARTED:
                    token_tx, token_rx = aio.channel()
                    audio_tx: aio.ChanSender[tts.SynthesizedAudio]
                    audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
                    audio_tx, audio_rx = aio.channel()
                    task = asyncio.create_task(
                        self._run_ws(max_retry_per_segment, audio_tx, token_rx)
                    )
                    segment_tasks.append(task)
                    conns_q.put_nowait(
                        SynthesizeStream._SegmentConnection(audio_rx, task)
                    )
                elif ev.type == tokenize.TokenEventType.TOKEN:
                    assert token_tx is not None
                    token_tx.send_nowait(ev.token)
                elif ev.type == tokenize.TokenEventType.FINISHED:
                    assert token_tx is not None
                    token_tx.close()
                    token_tx = None

            conns_q.put_nowait(None)

        forward = asyncio.create_task(_forward_events())
        reader = asyncio.create_task(_read_tokens())
        completed = False
        try:
            await asyncio.gather(forward, reader)
            completed = True
        except Exception:
            logger.exception("11labs task failed")
        finally:
            if not completed:
                # failure or cancellation (aclose(wait=False)): stop everything
                # still running; cancel() is a no-op on finished tasks
                for pending in (forward, reader, *segment_tasks):
                    pending.cancel()

            # in `finally` so consumers are released on cancellation too
            self._event_queue.put_nowait(None)

    async def _connect_ws(
        self, max_retry: int
    ) -> aiohttp.ClientWebSocketResponse | None:
        """Open and initialize a websocket, retrying up to ``max_retry`` times.

        Returns the ready connection, or None when every attempt failed
        (the failure has then already been logged).
        """
        for try_i in range(max_retry):
            ws_conn: aiohttp.ClientWebSocketResponse | None = None
            try:
                ws_conn = await self._session.ws_connect(
                    self._stream_url(),
                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
                )

                voice_settings = None
                if self._opts.voice.settings is not None:
                    voice_settings = dataclasses.asdict(self._opts.voice.settings)

                init_pkt = dict(
                    text=" ",
                    try_trigger_generation=True,
                    voice_settings=voice_settings,
                    generation_config=dict(
                        chunk_length_schedule=self._opts.chunk_length_schedule,
                    ),
                )
                await ws_conn.send_str(json.dumps(init_pkt))
                # success: stop retrying (looping on would open extra sockets)
                return ws_conn
            except Exception:
                if ws_conn is not None:
                    # connected but the init packet failed: don't leak the socket
                    with contextlib.suppress(Exception):
                        await ws_conn.close()

                if try_i + 1 == max_retry:
                    logger.exception(
                        f"failed to connect to 11labs after {max_retry} retries"
                    )
                    return None

                retry_delay = min(try_i * 5, 5)  # max 5s
                logger.warning(
                    f"failed to connect to 11labs, retrying in {retry_delay}s"
                )
                await asyncio.sleep(retry_delay)

        return None

    async def _run_ws(
        self,
        max_retry: int,
        audio_tx: aio.ChanSender[tts.SynthesizedAudio],
        token_rx: aio.ChanReceiver[str],
    ) -> None:
        """Serve one segment: send its tokens and publish the received audio.

        ``audio_tx`` is always closed on exit, including when the connection
        could not be established, so the event forwarder never waits forever.
        """
        ws_conn: aiohttp.ClientWebSocketResponse | None = None
        io_tasks: list[asyncio.Task] = []
        try:
            # try to connect to 11labs
            ws_conn = await self._connect_ws(max_retry)
            if ws_conn is None:
                return

            all_tokens_consumed = False

            async def send_task():
                nonlocal all_tokens_consumed
                async for token in token_rx:
                    if token == "":
                        continue  # empty token is closing the stream in 11labs protocol

                    # try_trigger_generation=True is a bad practice, we expose
                    # chunk_length_schedule instead
                    data_pkt = dict(
                        text=f"{token} ",  # must always end with a space
                        try_trigger_generation=False,
                    )
                    await ws_conn.send_str(json.dumps(data_pkt))

                # no more token, mark eos
                flush_pkt = dict(
                    text="",
                )
                await ws_conn.send_str(json.dumps(flush_pkt))
                all_tokens_consumed = True

            async def recv_task():
                while True:
                    msg = await ws_conn.receive()
                    if msg.type in (
                        aiohttp.WSMsgType.CLOSED,
                        aiohttp.WSMsgType.CLOSE,
                        aiohttp.WSMsgType.CLOSING,
                    ):
                        if all_tokens_consumed:
                            return  # close is expected

                        raise Exception(
                            "11labs connection closed unexpectedly, not all tokens have been consumed"
                        )

                    if msg.type != aiohttp.WSMsgType.TEXT:
                        # audio frames are serialized in base64..
                        logger.warning("unexpected 11labs message type %s", msg.type)
                        continue

                    data: dict = json.loads(msg.data)
                    if data.get("audio"):
                        b64data = base64.b64decode(data["audio"])

                        frame = rtc.AudioFrame(
                            data=b64data,
                            sample_rate=self._opts.sample_rate,
                            num_channels=1,
                            samples_per_channel=len(b64data) // 2,
                        )

                        text = ""
                        if data.get("alignment"):
                            # "chars" is a list of characters; join it so that
                            # SynthesizedAudio.text is a str
                            text = "".join(data["alignment"].get("chars") or [])

                        audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
                        continue
                    elif data.get("isFinal"):
                        return  # last message

                    logger.error("unexpected 11labs message %s", data)

            io_tasks = [
                asyncio.create_task(send_task()),
                asyncio.create_task(recv_task()),
            ]
            await asyncio.gather(*io_tasks)
        except Exception:
            logger.exception("11labs ws connection failed")
        finally:
            # if one side failed the other may still be blocked; stop it
            for io_task in io_tasks:
                io_task.cancel()

            # close the channel first so the forwarder is released even if
            # closing the socket is slow or gets interrupted
            audio_tx.close()
            if ws_conn is not None:
                with contextlib.suppress(Exception):
                    await ws_conn.close()

    async def __anext__(self) -> tts.SynthesisEvent:
        """Return the next synthesis event; stop once the sentinel arrives."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
474
+
475
+
476
def _dict_to_voices_list(data: dict) -> List[Voice]:
    """Convert a voices-listing payload into ``Voice`` objects.

    Reads ``data["voices"]`` and, for each entry, its ``voice_id``, ``name``
    and ``category`` keys; ``settings`` is always left as None.
    """
    return [
        Voice(
            id=entry["voice_id"],
            name=entry["name"],
            category=entry["category"],
            settings=None,
        )
        for entry in data["voices"]
    ]
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.4.dev2"
15
+ __version__ = "0.5.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.4.dev2
3
+ Version: 0.5.dev0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
22
  Requires-Dist: livekit~=0.11
23
- Requires-Dist: livekit-agents~=0.6.dev1
23
+ Requires-Dist: livekit-agents~=0.7.dev0
24
24
  Requires-Dist: aiohttp>=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -1,3 +1,3 @@
1
1
  livekit~=0.11
2
- livekit-agents~=0.6.dev1
2
+ livekit-agents~=0.7.dev0
3
3
  aiohttp>=3.8.5
@@ -51,7 +51,7 @@ setuptools.setup(
51
51
  python_requires=">=3.9.0",
52
52
  install_requires=[
53
53
  "livekit ~= 0.11",
54
- "livekit-agents~=0.6.dev1",
54
+ "livekit-agents~=0.7.dev0",
55
55
  "aiohttp >= 3.8.5",
56
56
  ],
57
57
  package_data={
@@ -1,392 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import asyncio
16
- import base64
17
- import contextlib
18
- import dataclasses
19
- import json
20
- import os
21
- from dataclasses import dataclass
22
- from typing import AsyncIterable, List
23
-
24
- import aiohttp
25
- from livekit import rtc
26
- from livekit.agents import aio, tts
27
-
28
- from .log import logger
29
- from .models import TTSModels
30
-
31
-
32
- @dataclass
33
- class VoiceSettings:
34
- stability: float # [0.0 - 1.0]
35
- similarity_boost: float # [0.0 - 1.0]
36
- style: float | None = None # [0.0 - 1.0]
37
- use_speaker_boost: bool | None = False
38
-
39
-
40
- @dataclass
41
- class Voice:
42
- id: str
43
- name: str
44
- category: str
45
- settings: VoiceSettings | None = None
46
-
47
-
48
- DEFAULT_VOICE = Voice(
49
- id="EXAVITQu4vr4xnSDxMaL",
50
- name="Bella",
51
- category="premade",
52
- settings=VoiceSettings(
53
- stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
54
- ),
55
- )
56
-
57
- API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
58
- AUTHORIZATION_HEADER = "xi-api-key"
59
-
60
-
61
- @dataclass
62
- class TTSOptions:
63
- api_key: str
64
- voice: Voice
65
- model_id: TTSModels
66
- base_url: str
67
- sample_rate: int
68
- latency: int
69
-
70
-
71
- class TTS(tts.TTS):
72
- def __init__(
73
- self,
74
- *,
75
- voice: Voice = DEFAULT_VOICE,
76
- model_id: TTSModels = "eleven_turbo_v2",
77
- api_key: str | None = None,
78
- base_url: str | None = None,
79
- sample_rate: int = 24000,
80
- latency: int = 3,
81
- ) -> None:
82
- super().__init__(
83
- streaming_supported=True, sample_rate=sample_rate, num_channels=1
84
- )
85
- api_key = api_key or os.environ.get("ELEVEN_API_KEY")
86
- if not api_key:
87
- raise ValueError("ELEVEN_API_KEY must be set")
88
-
89
- self._session = aiohttp.ClientSession()
90
- self._opts = TTSOptions(
91
- voice=voice,
92
- model_id=model_id,
93
- api_key=api_key,
94
- base_url=base_url or API_BASE_URL_V1,
95
- sample_rate=sample_rate,
96
- latency=latency,
97
- )
98
-
99
- async def list_voices(self) -> List[Voice]:
100
- async with self._session.get(
101
- f"{self._opts.base_url}/voices",
102
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
103
- ) as resp:
104
- data = await resp.json()
105
- return dict_to_voices_list(data)
106
-
107
- def synthesize(
108
- self,
109
- text: str,
110
- ) -> AsyncIterable[tts.SynthesizedAudio]:
111
- voice = self._opts.voice
112
- url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
113
-
114
- async def generator():
115
- try:
116
- async with self._session.post(
117
- url,
118
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
119
- json=dict(
120
- text=text,
121
- model_id=self._opts.model_id,
122
- voice_settings=dataclasses.asdict(voice.settings)
123
- if voice.settings
124
- else None,
125
- ),
126
- ) as resp:
127
- data = await resp.read()
128
- yield tts.SynthesizedAudio(
129
- text=text,
130
- data=rtc.AudioFrame(
131
- data=data,
132
- sample_rate=self._opts.sample_rate,
133
- num_channels=1,
134
- samples_per_channel=len(data) // 2, # 16-bit
135
- ),
136
- )
137
- except Exception as e:
138
- logger.error(f"failed to synthesize: {e}")
139
-
140
- return generator()
141
-
142
- def stream(
143
- self,
144
- ) -> "SynthesizeStream":
145
- return SynthesizeStream(self._session, self._opts)
146
-
147
-
148
- class SynthesizeStream(tts.SynthesizeStream):
149
- _STREAM_EOS = ""
150
-
151
- def __init__(
152
- self,
153
- session: aiohttp.ClientSession,
154
- opts: TTSOptions,
155
- max_retry: int = 32,
156
- ):
157
- self._opts = opts
158
- self._session = session
159
-
160
- self._queue = asyncio.Queue[str | None]()
161
- self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
162
- self._closed = False
163
- self._text = ""
164
-
165
- self._main_task = asyncio.create_task(self._run(max_retry))
166
-
167
- def _stream_url(self) -> str:
168
- base_url = self._opts.base_url
169
- voice_id = self._opts.voice.id
170
- model_id = self._opts.model_id
171
- sample_rate = self._opts.sample_rate
172
- latency = self._opts.latency
173
- return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
174
-
175
- def push_text(self, token: str | None) -> None:
176
- if self._closed:
177
- raise ValueError("cannot push to a closed stream")
178
-
179
- if token is None:
180
- self._flush_if_needed()
181
- return
182
-
183
- if len(token) == 0:
184
- # 11labs marks the EOS with an empty string, avoid users from pushing empty strings
185
- return
186
-
187
- # TODO: Naive word boundary detection may not be good enough for all languages
188
- # fmt: off
189
- splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
190
- # fmt: on
191
-
192
- self._text += token
193
-
194
- while True:
195
- last_split = -1
196
- for i, c in enumerate(self._text):
197
- if c in splitters:
198
- last_split = i
199
- break
200
-
201
- if last_split == -1:
202
- break
203
-
204
- seg = self._text[: last_split + 1]
205
- seg = seg.strip() + " " # 11labs expects a space at the end
206
- self._queue.put_nowait(seg)
207
- self._text = self._text[last_split + 1 :]
208
-
209
- async def aclose(self, *, wait: bool = True) -> None:
210
- self._flush_if_needed()
211
- self._queue.put_nowait(None)
212
- self._closed = True
213
-
214
- if not wait:
215
- self._main_task.cancel()
216
-
217
- with contextlib.suppress(asyncio.CancelledError):
218
- await self._main_task
219
-
220
- def _flush_if_needed(self) -> None:
221
- seg = self._text.strip()
222
- if len(seg) > 0:
223
- self._queue.put_nowait(seg + " ")
224
-
225
- self._text = ""
226
- self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
227
-
228
- async def _run(self, max_retry: int) -> None:
229
- retry_count = 0
230
- ws: aiohttp.ClientWebSocketResponse | None = None
231
- ws_task: asyncio.Task | None = None
232
- data_tx: aio.ChanSender[str] | None = None
233
-
234
- try:
235
- while True:
236
- ws_connected = ws is not None and not ws.closed
237
- try:
238
- data = await self._queue.get()
239
-
240
- if data is None:
241
- if ws_task is not None:
242
- await ws_task
243
- break
244
-
245
- if not ws_connected:
246
- if data == SynthesizeStream._STREAM_EOS:
247
- continue
248
-
249
- with contextlib.suppress(asyncio.CancelledError):
250
- if ws_task is not None:
251
- await ws_task
252
-
253
- ws = await self._session.ws_connect(
254
- self._stream_url(),
255
- headers={AUTHORIZATION_HEADER: self._opts.api_key},
256
- )
257
- data_tx, data_rx = aio.channel()
258
- ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
259
-
260
- assert data_tx is not None
261
- assert ws_task is not None
262
- assert ws is not None
263
-
264
- data_tx.send_nowait(data)
265
-
266
- except Exception:
267
- if retry_count >= max_retry:
268
- logger.exception(
269
- f"failed to connect to 11labs after {max_retry} retries"
270
- )
271
- break
272
-
273
- retry_delay = min(retry_count * 5, 5) # max 5s
274
- retry_count += 1
275
-
276
- logger.warning(
277
- f"failed to connect to 11labs, retrying in {retry_delay}s"
278
- )
279
- await asyncio.sleep(retry_delay)
280
-
281
- except Exception:
282
- logger.exception("11labs task failed")
283
- finally:
284
- with contextlib.suppress(asyncio.CancelledError):
285
- if ws_task is not None:
286
- ws_task.cancel()
287
- await ws_task
288
-
289
- self._event_queue.put_nowait(None)
290
-
291
- async def _run_ws(
292
- self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
293
- ) -> None:
294
- closing_ws = False
295
-
296
- self._event_queue.put_nowait(
297
- tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
298
- )
299
-
300
- async def send_task():
301
- nonlocal closing_ws
302
-
303
- # 11labs stream must be initialized with a space
304
- voice = self._opts.voice
305
- voice_settings = (
306
- dataclasses.asdict(voice.settings) if voice.settings else None
307
- )
308
- init_pkt = dict(
309
- text=" ",
310
- voice_settings=voice_settings,
311
- )
312
- await ws.send_str(json.dumps(init_pkt))
313
-
314
- while True:
315
- data = await data_rx.recv()
316
- data_pkt = dict(
317
- text=data,
318
- try_trigger_generation=True,
319
- )
320
- if data == SynthesizeStream._STREAM_EOS:
321
- closing_ws = True
322
-
323
- await ws.send_str(json.dumps(data_pkt))
324
-
325
- if closing_ws:
326
- return
327
-
328
- async def recv_task():
329
- nonlocal closing_ws
330
- while True:
331
- msg = await ws.receive()
332
- if msg.type in (
333
- aiohttp.WSMsgType.CLOSED,
334
- aiohttp.WSMsgType.CLOSE,
335
- aiohttp.WSMsgType.CLOSING,
336
- ):
337
- if closing_ws: # close is expected
338
- return
339
-
340
- raise Exception("11labs connection closed unexpectedly")
341
-
342
- if msg.type != aiohttp.WSMsgType.TEXT:
343
- logger.warning("unexpected 11labs message type %s", msg.type)
344
- continue
345
-
346
- data: dict = json.loads(msg.data)
347
- if data.get("audio"):
348
- b64data = base64.b64decode(data["audio"])
349
- frame = rtc.AudioFrame(
350
- data=b64data,
351
- sample_rate=self._opts.sample_rate,
352
- num_channels=1,
353
- samples_per_channel=len(b64data) // 2,
354
- )
355
- self._event_queue.put_nowait(
356
- tts.SynthesisEvent(
357
- type=tts.SynthesisEventType.AUDIO,
358
- audio=tts.SynthesizedAudio(text="", data=frame),
359
- )
360
- )
361
- elif data.get("isFinal"):
362
- return
363
-
364
- try:
365
- await asyncio.gather(send_task(), recv_task())
366
- except Exception:
367
- logger.exception("11labs connection failed")
368
- finally:
369
- self._event_queue.put_nowait(
370
- tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
371
- )
372
-
373
- async def __anext__(self) -> tts.SynthesisEvent:
374
- evt = await self._event_queue.get()
375
- if evt is None:
376
- raise StopAsyncIteration
377
-
378
- return evt
379
-
380
-
381
- def dict_to_voices_list(data: dict) -> List[Voice]:
382
- voices = []
383
- for voice in data["voices"]:
384
- voices.append(
385
- Voice(
386
- id=voice["voice_id"],
387
- name=voice["name"],
388
- category=voice["category"],
389
- settings=None,
390
- )
391
- )
392
- return voices