livekit-plugins-elevenlabs 0.6.dev0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/tts.py +167 -297
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.6.dev0.dist-info → livekit_plugins_elevenlabs-0.7.0.dist-info}/METADATA +2 -4
- livekit_plugins_elevenlabs-0.7.0.dist-info/RECORD +10 -0
- {livekit_plugins_elevenlabs-0.6.dev0.dist-info → livekit_plugins_elevenlabs-0.7.0.dist-info}/WHEEL +1 -1
- livekit_plugins_elevenlabs-0.6.dev0.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.6.dev0.dist-info → livekit_plugins_elevenlabs-0.7.0.dist-info}/top_level.txt +0 -0
@@ -16,27 +16,20 @@ from __future__ import annotations
|
|
16
16
|
|
17
17
|
import asyncio
|
18
18
|
import base64
|
19
|
-
import contextlib
|
20
19
|
import dataclasses
|
21
20
|
import json
|
22
21
|
import os
|
23
22
|
from dataclasses import dataclass
|
24
|
-
from typing import List, Literal
|
23
|
+
from typing import Any, List, Literal
|
25
24
|
|
26
25
|
import aiohttp
|
27
26
|
from livekit import rtc
|
28
|
-
from livekit.agents import
|
27
|
+
from livekit.agents import tokenize, tts, utils
|
29
28
|
|
30
29
|
from .log import logger
|
31
|
-
from .models import (
|
32
|
-
TTSEncoding,
|
33
|
-
TTSModels,
|
34
|
-
)
|
30
|
+
from .models import TTSEncoding, TTSModels
|
35
31
|
|
36
|
-
_Encoding = Literal[
|
37
|
-
"mp3",
|
38
|
-
"pcm",
|
39
|
-
]
|
32
|
+
_Encoding = Literal["mp3", "pcm"]
|
40
33
|
|
41
34
|
|
42
35
|
def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
@@ -114,7 +107,9 @@ class TTS(tts.TTS):
|
|
114
107
|
http_session: aiohttp.ClientSession | None = None,
|
115
108
|
) -> None:
|
116
109
|
super().__init__(
|
117
|
-
|
110
|
+
capabilities=tts.TTSCapabilities(
|
111
|
+
streaming=True,
|
112
|
+
),
|
118
113
|
sample_rate=_sample_rate_from_format(encoding),
|
119
114
|
num_channels=1,
|
120
115
|
)
|
@@ -137,7 +132,7 @@ class TTS(tts.TTS):
|
|
137
132
|
|
138
133
|
def _ensure_session(self) -> aiohttp.ClientSession:
|
139
134
|
if not self._session:
|
140
|
-
self._session = utils.http_session()
|
135
|
+
self._session = utils.http_context.http_session()
|
141
136
|
|
142
137
|
return self._session
|
143
138
|
|
@@ -148,15 +143,10 @@ class TTS(tts.TTS):
|
|
148
143
|
) as resp:
|
149
144
|
return _dict_to_voices_list(await resp.json())
|
150
145
|
|
151
|
-
def synthesize(
|
152
|
-
self,
|
153
|
-
text: str,
|
154
|
-
) -> "ChunkedStream":
|
146
|
+
def synthesize(self, text: str) -> "ChunkedStream":
|
155
147
|
return ChunkedStream(text, self._opts, self._ensure_session())
|
156
148
|
|
157
|
-
def stream(
|
158
|
-
self,
|
159
|
-
) -> "SynthesizeStream":
|
149
|
+
def stream(self) -> "SynthesizeStream":
|
160
150
|
return SynthesizeStream(self._ensure_session(), self._opts)
|
161
151
|
|
162
152
|
|
@@ -166,285 +156,150 @@ class ChunkedStream(tts.ChunkedStream):
|
|
166
156
|
def __init__(
|
167
157
|
self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
|
168
158
|
) -> None:
|
169
|
-
|
170
|
-
self._text = text
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
base_url = self._opts.base_url
|
177
|
-
voice_id = self._opts.voice.id
|
178
|
-
model_id = self._opts.model_id
|
179
|
-
sample_rate = _sample_rate_from_format(self._opts.encoding)
|
180
|
-
latency = self._opts.streaming_latency
|
181
|
-
url = (
|
182
|
-
f"{base_url}/text-to-speech/{voice_id}/stream?"
|
183
|
-
f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
|
159
|
+
super().__init__()
|
160
|
+
self._text, self._opts, self._session = text, opts, session
|
161
|
+
|
162
|
+
@utils.log_exceptions(logger=logger)
|
163
|
+
async def _main_task(self) -> None:
|
164
|
+
bstream = utils.audio.AudioByteStream(
|
165
|
+
sample_rate=self._opts.sample_rate, num_channels=1
|
184
166
|
)
|
185
|
-
|
167
|
+
request_id = utils.shortuuid()
|
168
|
+
segment_id = utils.shortuuid()
|
186
169
|
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
self.
|
170
|
+
voice_settings = (
|
171
|
+
dataclasses.asdict(self._opts.voice.settings)
|
172
|
+
if self._opts.voice.settings
|
173
|
+
else None
|
174
|
+
)
|
175
|
+
data = {
|
176
|
+
"text": self._text,
|
177
|
+
"model_id": self._opts.model_id,
|
178
|
+
"voice_settings": voice_settings,
|
179
|
+
}
|
194
180
|
|
195
|
-
async def _run(self) -> None:
|
196
181
|
async with self._session.post(
|
197
|
-
self.
|
182
|
+
_synthesize_url(self._opts),
|
198
183
|
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
199
|
-
json=dict(
|
200
|
-
text=self._text,
|
201
|
-
model_id=self._opts.model_id,
|
202
|
-
voice_settings=(
|
203
|
-
dataclasses.asdict(self._opts.voice.settings)
|
204
|
-
if self._opts.voice.settings
|
205
|
-
else None
|
206
|
-
),
|
207
|
-
),
|
184
|
+
json=data,
|
208
185
|
) as resp:
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
async for data, _ in resp.content.iter_chunks():
|
213
|
-
buf.extend(data)
|
214
|
-
|
215
|
-
while len(buf) >= bytes_per_frame:
|
216
|
-
frame_data = buf[:bytes_per_frame]
|
217
|
-
buf = buf[bytes_per_frame:]
|
218
|
-
|
219
|
-
self._queue.put_nowait(
|
186
|
+
async for bytes_data, _ in resp.content.iter_chunks():
|
187
|
+
for frame in bstream.write(bytes_data):
|
188
|
+
self._event_ch.send_nowait(
|
220
189
|
tts.SynthesizedAudio(
|
221
|
-
|
222
|
-
data=rtc.AudioFrame(
|
223
|
-
data=frame_data,
|
224
|
-
sample_rate=self._opts.sample_rate,
|
225
|
-
num_channels=1,
|
226
|
-
samples_per_channel=len(frame_data) // 2,
|
227
|
-
),
|
190
|
+
request_id=request_id, segment_id=segment_id, frame=frame
|
228
191
|
)
|
229
192
|
)
|
230
193
|
|
231
|
-
|
232
|
-
|
233
|
-
self._queue.put_nowait(
|
194
|
+
for frame in bstream.flush():
|
195
|
+
self._event_ch.send_nowait(
|
234
196
|
tts.SynthesizedAudio(
|
235
|
-
|
236
|
-
data=rtc.AudioFrame(
|
237
|
-
data=buf,
|
238
|
-
sample_rate=self._opts.sample_rate,
|
239
|
-
num_channels=1,
|
240
|
-
samples_per_channel=len(buf) // 2,
|
241
|
-
),
|
197
|
+
request_id=request_id, segment_id=segment_id, frame=frame
|
242
198
|
)
|
243
199
|
)
|
244
200
|
|
245
|
-
async def __anext__(self) -> tts.SynthesizedAudio:
|
246
|
-
if not self._task:
|
247
|
-
self._task = asyncio.create_task(self._main_task())
|
248
|
-
|
249
|
-
frame = await self._queue.get()
|
250
|
-
if frame is None:
|
251
|
-
raise StopAsyncIteration
|
252
|
-
|
253
|
-
return frame
|
254
|
-
|
255
|
-
async def aclose(self) -> None:
|
256
|
-
if not self._task:
|
257
|
-
return
|
258
|
-
|
259
|
-
self._task.cancel()
|
260
|
-
with contextlib.suppress(asyncio.CancelledError):
|
261
|
-
await self._task
|
262
|
-
|
263
201
|
|
264
202
|
class SynthesizeStream(tts.SynthesizeStream):
|
265
203
|
"""Streamed API using websockets"""
|
266
204
|
|
267
|
-
@dataclass
|
268
|
-
class _SegmentConnection:
|
269
|
-
audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
|
270
|
-
task: asyncio.Task
|
271
|
-
|
272
205
|
def __init__(
|
273
206
|
self,
|
274
207
|
session: aiohttp.ClientSession,
|
275
208
|
opts: _TTSOptions,
|
276
|
-
max_retry_per_segment: int = 3,
|
277
209
|
):
|
210
|
+
super().__init__()
|
278
211
|
self._opts = opts
|
279
212
|
self._session = session
|
280
|
-
self.
|
281
|
-
self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
|
282
|
-
self._closed = False
|
283
|
-
self._word_stream = opts.word_tokenizer.stream()
|
284
|
-
|
285
|
-
def _stream_url(self) -> str:
|
286
|
-
base_url = self._opts.base_url
|
287
|
-
voice_id = self._opts.voice.id
|
288
|
-
model_id = self._opts.model_id
|
289
|
-
output_format = self._opts.encoding
|
290
|
-
latency = self._opts.streaming_latency
|
291
|
-
url = (
|
292
|
-
f"{base_url}/text-to-speech/{voice_id}/stream-input?"
|
293
|
-
f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
|
294
|
-
)
|
213
|
+
self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
|
295
214
|
|
296
|
-
|
215
|
+
@utils.log_exceptions(logger=logger)
|
216
|
+
async def _main_task(self) -> None:
|
217
|
+
self._segments_ch = utils.aio.Chan[tokenize.WordStream]()
|
297
218
|
|
298
|
-
|
299
|
-
|
300
|
-
|
219
|
+
@utils.log_exceptions(logger=logger)
|
220
|
+
async def _tokenize_input():
|
221
|
+
"""tokenize text from the input_ch to words"""
|
222
|
+
word_stream = None
|
223
|
+
async for input in self._input_ch:
|
224
|
+
if isinstance(input, str):
|
225
|
+
if not word_stream:
|
226
|
+
word_stream = self._opts.word_tokenizer.stream()
|
227
|
+
self._segments_ch.send_nowait(word_stream)
|
301
228
|
|
302
|
-
|
303
|
-
|
304
|
-
|
229
|
+
word_stream.push_text(input)
|
230
|
+
elif isinstance(input, self._FlushSentinel):
|
231
|
+
word_stream.end_input()
|
232
|
+
word_stream = None
|
305
233
|
|
306
|
-
|
234
|
+
self._segments_ch.close()
|
307
235
|
|
308
|
-
|
309
|
-
|
310
|
-
|
236
|
+
async def _run():
|
237
|
+
async for word_stream in self._segments_ch:
|
238
|
+
await self._run_ws(word_stream)
|
311
239
|
|
312
|
-
|
313
|
-
self._main_task.cancel()
|
314
|
-
|
315
|
-
with contextlib.suppress(asyncio.CancelledError):
|
316
|
-
await self._main_task
|
317
|
-
|
318
|
-
async def _run(self, max_retry_per_segment: int) -> None:
|
319
|
-
conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
|
320
|
-
|
321
|
-
async def _forward_events() -> None:
|
322
|
-
"""forward events from the ws connections to the event queue.
|
323
|
-
This is used to keep the right order."""
|
324
|
-
while True:
|
325
|
-
c = await conns_q.get()
|
326
|
-
if c is None:
|
327
|
-
break # no more segment, stream closed
|
328
|
-
|
329
|
-
self._event_queue.put_nowait(
|
330
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
|
331
|
-
)
|
332
|
-
|
333
|
-
async for frame in c.audio_rx:
|
334
|
-
self._event_queue.put_nowait(
|
335
|
-
tts.SynthesisEvent(
|
336
|
-
type=tts.SynthesisEventType.AUDIO, audio=frame
|
337
|
-
)
|
338
|
-
)
|
339
|
-
|
340
|
-
self._event_queue.put_nowait(
|
341
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
|
342
|
-
)
|
343
|
-
|
344
|
-
async def _read_tokens() -> None:
|
345
|
-
"""read tokens from the word stream and create connections for each segment,
|
346
|
-
(this also allows concurrent connections to 11labs)"""
|
347
|
-
|
348
|
-
cur_segment: SynthesizeStream._SegmentConnection | None = None
|
349
|
-
token_tx: aio.ChanSender[str] | None = None
|
350
|
-
async for ev in self._word_stream:
|
351
|
-
if ev.type == tokenize.TokenEventType.STARTED:
|
352
|
-
token_tx, token_rx = aio.channel()
|
353
|
-
audio_tx: aio.ChanSender[tts.SynthesizedAudio]
|
354
|
-
audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
|
355
|
-
audio_tx, audio_rx = aio.channel()
|
356
|
-
task = asyncio.create_task(
|
357
|
-
self._run_ws(max_retry_per_segment, audio_tx, token_rx)
|
358
|
-
)
|
359
|
-
cur_segment = SynthesizeStream._SegmentConnection(audio_rx, task)
|
360
|
-
conns_q.put_nowait(cur_segment)
|
361
|
-
elif ev.type == tokenize.TokenEventType.TOKEN:
|
362
|
-
assert token_tx is not None
|
363
|
-
token_tx.send_nowait(ev.token)
|
364
|
-
elif ev.type == tokenize.TokenEventType.FINISHED:
|
365
|
-
assert token_tx is not None
|
366
|
-
token_tx.close()
|
367
|
-
cur_segment = token_tx = None
|
368
|
-
|
369
|
-
conns_q.put_nowait(None)
|
370
|
-
|
371
|
-
try:
|
372
|
-
await asyncio.gather(_forward_events(), _read_tokens())
|
373
|
-
except Exception:
|
374
|
-
logger.exception("11labs task failed")
|
375
|
-
|
376
|
-
self._event_queue.put_nowait(None)
|
240
|
+
await asyncio.gather(_tokenize_input(), _run(), return_exceptions=True)
|
377
241
|
|
378
242
|
async def _run_ws(
|
379
243
|
self,
|
380
|
-
|
381
|
-
|
382
|
-
token_rx: aio.ChanReceiver[str],
|
244
|
+
word_stream: tokenize.WordStream,
|
245
|
+
max_retry: int = 1,
|
383
246
|
) -> None:
|
384
|
-
|
247
|
+
request_id = utils.shortuuid()
|
248
|
+
segment_id = utils.shortuuid()
|
249
|
+
|
385
250
|
ws_conn: aiohttp.ClientWebSocketResponse | None = None
|
386
251
|
for try_i in range(max_retry):
|
252
|
+
retry_delay = 5
|
387
253
|
try:
|
254
|
+
if try_i > 0:
|
255
|
+
await asyncio.sleep(retry_delay)
|
256
|
+
|
388
257
|
ws_conn = await self._session.ws_connect(
|
389
|
-
self.
|
258
|
+
_stream_url(self._opts),
|
390
259
|
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
391
260
|
)
|
392
|
-
|
393
|
-
|
394
|
-
if self._opts.voice.settings is not None:
|
395
|
-
voice_settings = dataclasses.asdict(self._opts.voice.settings)
|
396
|
-
|
397
|
-
init_pkt = dict(
|
398
|
-
text=" ",
|
399
|
-
try_trigger_generation=True,
|
400
|
-
voice_settings=voice_settings,
|
401
|
-
generation_config=dict(
|
402
|
-
chunk_length_schedule=self._opts.chunk_length_schedule,
|
403
|
-
),
|
404
|
-
)
|
405
|
-
await ws_conn.send_str(json.dumps(init_pkt))
|
406
|
-
except Exception:
|
407
|
-
if try_i + 1 == max_retry:
|
408
|
-
logger.exception(
|
409
|
-
f"failed to connect to 11labs after {max_retry} retries"
|
410
|
-
)
|
411
|
-
return
|
412
|
-
|
413
|
-
retry_delay = min(try_i * 5, 5) # max 5s
|
261
|
+
break
|
262
|
+
except Exception as e:
|
414
263
|
logger.warning(
|
415
|
-
f"failed to connect to 11labs, retrying in {retry_delay}s"
|
264
|
+
f"failed to connect to 11labs, retrying in {retry_delay}s",
|
265
|
+
exc_info=e,
|
416
266
|
)
|
417
|
-
await asyncio.sleep(retry_delay)
|
418
|
-
|
419
|
-
assert ws_conn is not None
|
420
267
|
|
421
|
-
|
268
|
+
if ws_conn is None:
|
269
|
+
raise Exception(f"failed to connect to 11labs after {max_retry} retries")
|
270
|
+
|
271
|
+
init_pkt = dict(
|
272
|
+
text=" ",
|
273
|
+
try_trigger_generation=True,
|
274
|
+
voice_settings=dataclasses.asdict(self._opts.voice.settings)
|
275
|
+
if self._opts.voice.settings
|
276
|
+
else None,
|
277
|
+
generation_config=dict(
|
278
|
+
chunk_length_schedule=self._opts.chunk_length_schedule
|
279
|
+
),
|
280
|
+
)
|
281
|
+
await ws_conn.send_str(json.dumps(init_pkt))
|
282
|
+
eos_sent = False
|
422
283
|
|
423
284
|
async def send_task():
|
424
|
-
|
425
|
-
if token == "":
|
426
|
-
continue # empty token is closing the stream in 11labs protocol
|
285
|
+
nonlocal eos_sent
|
427
286
|
|
287
|
+
async for data in word_stream:
|
428
288
|
# try_trigger_generation=True is a bad practice, we expose
|
429
289
|
# chunk_length_schedule instead
|
430
290
|
data_pkt = dict(
|
431
|
-
text=f"{token} ", # must always end with a space
|
291
|
+
text=f"{data.token} ", # must always end with a space
|
432
292
|
try_trigger_generation=False,
|
433
293
|
)
|
294
|
+
print(data_pkt)
|
434
295
|
await ws_conn.send_str(json.dumps(data_pkt))
|
435
296
|
|
436
297
|
# no more token, mark eos
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
await ws_conn.send_str(json.dumps(flush_pkt))
|
441
|
-
|
442
|
-
nonlocal all_tokens_consumed
|
443
|
-
all_tokens_consumed = True
|
298
|
+
eos_pkt = dict(text="")
|
299
|
+
await ws_conn.send_str(json.dumps(eos_pkt))
|
300
|
+
eos_sent = True
|
444
301
|
|
445
302
|
async def recv_task():
|
446
|
-
encoding = _encoding_from_format(self._opts.encoding)
|
447
|
-
mp3_decoder = codecs.Mp3StreamDecoder()
|
448
303
|
while True:
|
449
304
|
msg = await ws_conn.receive()
|
450
305
|
if msg.type in (
|
@@ -452,70 +307,61 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
452
307
|
aiohttp.WSMsgType.CLOSE,
|
453
308
|
aiohttp.WSMsgType.CLOSING,
|
454
309
|
):
|
455
|
-
if
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
)
|
310
|
+
if not eos_sent:
|
311
|
+
raise Exception(
|
312
|
+
"11labs connection closed unexpectedly, not all tokens have been consumed"
|
313
|
+
)
|
314
|
+
return
|
461
315
|
|
462
316
|
if msg.type != aiohttp.WSMsgType.TEXT:
|
463
|
-
# audio frames are serialized in base64..
|
464
317
|
logger.warning("unexpected 11labs message type %s", msg.type)
|
465
318
|
continue
|
466
319
|
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
return
|
473
|
-
elif audio is not None:
|
474
|
-
if audio == "":
|
475
|
-
# 11labs sometimes sends empty audio, ignore
|
476
|
-
continue
|
477
|
-
|
478
|
-
b64data = base64.b64decode(audio)
|
479
|
-
frame: rtc.AudioFrame
|
480
|
-
if encoding == "mp3":
|
481
|
-
frames = mp3_decoder.decode_chunk(b64data)
|
482
|
-
frame = utils.merge_frames(frames)
|
483
|
-
else:
|
484
|
-
frame = rtc.AudioFrame(
|
485
|
-
data=b64data,
|
486
|
-
sample_rate=self._opts.sample_rate,
|
487
|
-
num_channels=1,
|
488
|
-
samples_per_channel=len(b64data) // 2,
|
489
|
-
)
|
490
|
-
|
491
|
-
text = ""
|
492
|
-
if data.get("alignment"):
|
493
|
-
text = "".join(data["alignment"].get("chars", ""))
|
494
|
-
|
495
|
-
audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
|
496
|
-
continue
|
497
|
-
elif data.get("isFinal"):
|
498
|
-
return # last message
|
499
|
-
|
500
|
-
logger.error("unexpected 11labs message %s", data)
|
501
|
-
|
502
|
-
try:
|
503
|
-
await asyncio.gather(send_task(), recv_task())
|
504
|
-
except Exception:
|
505
|
-
logger.exception("11labs ws connection failed")
|
506
|
-
finally:
|
507
|
-
audio_tx.close()
|
320
|
+
self._process_stream_event(
|
321
|
+
data=json.loads(msg.data),
|
322
|
+
request_id=request_id,
|
323
|
+
segment_id=segment_id,
|
324
|
+
)
|
508
325
|
|
509
|
-
|
510
|
-
evt = await self._event_queue.get()
|
511
|
-
if evt is None:
|
512
|
-
raise StopAsyncIteration
|
326
|
+
await asyncio.gather(send_task(), recv_task())
|
513
327
|
|
514
|
-
|
328
|
+
def _process_stream_event(
|
329
|
+
self, *, data: dict, request_id: str, segment_id: str
|
330
|
+
) -> None:
|
331
|
+
encoding = _encoding_from_format(self._opts.encoding)
|
332
|
+
if data.get("audio"):
|
333
|
+
b64data = base64.b64decode(data["audio"])
|
334
|
+
if encoding == "mp3":
|
335
|
+
for frame in self._mp3_decoder.decode_chunk(b64data):
|
336
|
+
self._event_ch.send_nowait(
|
337
|
+
tts.SynthesizedAudio(
|
338
|
+
request_id=request_id,
|
339
|
+
segment_id=segment_id,
|
340
|
+
frame=frame,
|
341
|
+
)
|
342
|
+
)
|
343
|
+
else:
|
344
|
+
chunk_frame = rtc.AudioFrame(
|
345
|
+
data=b64data,
|
346
|
+
sample_rate=self._opts.sample_rate,
|
347
|
+
num_channels=1,
|
348
|
+
samples_per_channel=len(b64data) // 2,
|
349
|
+
)
|
350
|
+
self._event_ch.send_nowait(
|
351
|
+
tts.SynthesizedAudio(
|
352
|
+
request_id=request_id,
|
353
|
+
segment_id=segment_id,
|
354
|
+
frame=chunk_frame,
|
355
|
+
)
|
356
|
+
)
|
357
|
+
elif data.get("error"):
|
358
|
+
logger.error("11labs reported an error: %s", data["error"])
|
359
|
+
elif not data.get("isFinal"):
|
360
|
+
logger.error("unexpected 11labs message %s", data)
|
515
361
|
|
516
362
|
|
517
|
-
def _dict_to_voices_list(data: dict) -> List[Voice]:
|
518
|
-
voices = []
|
363
|
+
def _dict_to_voices_list(data: dict[str, Any]):
|
364
|
+
voices: List[Voice] = []
|
519
365
|
for voice in data["voices"]:
|
520
366
|
voices.append(
|
521
367
|
Voice(
|
@@ -526,3 +372,27 @@ def _dict_to_voices_list(data: dict) -> List[Voice]:
|
|
526
372
|
)
|
527
373
|
)
|
528
374
|
return voices
|
375
|
+
|
376
|
+
|
377
|
+
def _synthesize_url(opts: _TTSOptions) -> str:
|
378
|
+
base_url = opts.base_url
|
379
|
+
voice_id = opts.voice.id
|
380
|
+
model_id = opts.model_id
|
381
|
+
sample_rate = _sample_rate_from_format(opts.encoding)
|
382
|
+
latency = opts.streaming_latency
|
383
|
+
return (
|
384
|
+
f"{base_url}/text-to-speech/{voice_id}/stream?"
|
385
|
+
f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
|
386
|
+
)
|
387
|
+
|
388
|
+
|
389
|
+
def _stream_url(opts: _TTSOptions) -> str:
|
390
|
+
base_url = opts.base_url
|
391
|
+
voice_id = opts.voice.id
|
392
|
+
model_id = opts.model_id
|
393
|
+
output_format = opts.encoding
|
394
|
+
latency = opts.streaming_latency
|
395
|
+
return (
|
396
|
+
f"{base_url}/text-to-speech/{voice_id}/stream-input?"
|
397
|
+
f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
|
398
|
+
)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.6.dev0
|
3
|
+
Version: 0.7.0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -19,9 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
19
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: livekit
|
23
|
-
Requires-Dist: livekit-agents[codecs] ~=0.8.dev0
|
24
|
-
Requires-Dist: aiohttp >=3.8.5
|
22
|
+
Requires-Dist: livekit-agents[codecs] >=0.8.0.dev0
|
25
23
|
|
26
24
|
# LiveKit Plugins Elevenlabs
|
27
25
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=HpaHJQysUhThDdlYDHpQxroo9L2_m6G6QBAaNXs04K4,13032
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=G63knoeV7ai0fH-1DCHqI3a7eSI4LlHqjV64n4GbCGg,600
|
7
|
+
livekit_plugins_elevenlabs-0.7.0.dist-info/METADATA,sha256=Qp7YGU3umbrkvU6mHc0-IrQQ9F_Efsl1wKzTAKTr12s,1311
|
8
|
+
livekit_plugins_elevenlabs-0.7.0.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
|
9
|
+
livekit_plugins_elevenlabs-0.7.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_elevenlabs-0.7.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=GTcyQwBVVPzCYLgsnw9q5oFOq9cV3hIKndDaBPSFMr4,17738
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=yB6WnbnD5MFhQDT5ItJ02XWVsNanlDYiOezzwv0IdcM,603
|
7
|
-
livekit_plugins_elevenlabs-0.6.dev0.dist-info/METADATA,sha256=kfWET-iNGQYX7TGoo87CiMIoMINIwE28YT4-hbp8NDY,1373
|
8
|
-
livekit_plugins_elevenlabs-0.6.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_elevenlabs-0.6.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.6.dev0.dist-info/RECORD,,
|
File without changes
|