livekit-plugins-elevenlabs 0.7.13__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/models.py +1 -11
- livekit/plugins/elevenlabs/tts.py +238 -225
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.7.13.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/METADATA +2 -2
- livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD +10 -0
- {livekit_plugins_elevenlabs-0.7.13.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/WHEEL +1 -1
- livekit_plugins_elevenlabs-0.7.13.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.7.13.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/top_level.txt +0 -0
livekit/plugins/elevenlabs/models.py
@@ -10,14 +10,4 @@ TTSModels = Literal[
     "eleven_flash_v2",
 ]
 
-TTSEncoding = Literal[
-    "mp3_22050_32",
-    "mp3_44100_32",
-    "mp3_44100_64",
-    "mp3_44100_96",
-    "mp3_44100_128",
-    "mp3_44100_192",
-    "pcm_16000",
-    "pcm_22050",
-    "pcm_44100",
-]
+TTSEncoding = Literal["mp3_44100",]
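
The encoding surface shrinks to the single `mp3_44100` output format. As a quick illustration (not part of the package itself), the sample-rate helper shown in the tts.py hunks below parses the rate straight out of that string:

```python
# Minimal sketch of the parsing rule used by _sample_rate_from_format in
# the new tts.py: the sample rate is the second "_"-separated field.
output_format = "mp3_44100"
sample_rate = int(output_format.split("_")[1])
print(sample_rate)  # 44100
```
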
livekit/plugins/elevenlabs/tts.py
@@ -19,13 +19,12 @@ import base64
 import dataclasses
 import json
 import os
+import weakref
 from dataclasses import dataclass
-from typing import Any, List
+from typing import Any, List, Optional
 
 import aiohttp
-from livekit import rtc
 from livekit.agents import (
-    DEFAULT_API_CONNECT_OPTIONS,
     APIConnectionError,
     APIConnectOptions,
     APIStatusError,
@@ -38,28 +37,20 @@ from livekit.agents import (
 from .log import logger
 from .models import TTSEncoding, TTSModels
 
-
+_DefaultEncoding: TTSEncoding = "mp3_44100"
 
 
 def _sample_rate_from_format(output_format: TTSEncoding) -> int:
-    split = output_format.split("_")  # e.g:
+    split = output_format.split("_")  # e.g: mp3_44100
     return int(split[1])
 
 
-def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
-    if output_format.startswith("mp3"):
-        return "mp3"
-    elif output_format.startswith("pcm"):
-        return "pcm"
-
-    raise ValueError(f"Unknown format: {output_format}")
-
-
 @dataclass
 class VoiceSettings:
     stability: float  # [0.0 - 1.0]
     similarity_boost: float  # [0.0 - 1.0]
     style: float | None = None  # [0.0 - 1.0]
+    speed: float | None = 1.0  # [0.8 - 1.2]
     use_speaker_boost: bool | None = False
 
 
@@ -76,12 +67,17 @@ DEFAULT_VOICE = Voice(
     name="Bella",
     category="premade",
     settings=VoiceSettings(
-        stability=0.71,
+        stability=0.71,
+        speed=1.0,
+        similarity_boost=0.5,
+        style=0.0,
+        use_speaker_boost=True,
     ),
 )
 
 API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
 AUTHORIZATION_HEADER = "xi-api-key"
+WS_INACTIVITY_TIMEOUT = 300
 
 
 @dataclass
@@ -97,6 +93,7 @@ class _TTSOptions:
     word_tokenizer: tokenize.WordTokenizer
     chunk_length_schedule: list[int]
     enable_ssml_parsing: bool
+    inactivity_timeout: int
 
 
 class TTS(tts.TTS):
@@ -107,11 +104,9 @@ class TTS(tts.TTS):
         model: TTSModels | str = "eleven_flash_v2_5",
         api_key: str | None = None,
         base_url: str | None = None,
-
-
-        word_tokenizer: tokenize.WordTokenizer =
-            ignore_punctuation=False  # punctuation can help for intonation
-        ),
+        streaming_latency: int = 0,
+        inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
+        word_tokenizer: Optional[tokenize.WordTokenizer] = None,
         enable_ssml_parsing: bool = False,
         chunk_length_schedule: list[int] = [80, 120, 200, 260],  # range is [50, 500]
         http_session: aiohttp.ClientSession | None = None,
@@ -127,8 +122,8 @@ class TTS(tts.TTS):
             model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
             api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
             base_url (str | None): Custom base URL for the API. Optional.
-
-
+            streaming_latency (int): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
+            inactivity_timeout (int): Inactivity timeout in seconds for the websocket connection. Defaults to 300.
             word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
             enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
             chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
@@ -140,7 +135,7 @@ class TTS(tts.TTS):
             capabilities=tts.TTSCapabilities(
                 streaming=True,
             ),
-            sample_rate=_sample_rate_from_format(
+            sample_rate=_sample_rate_from_format(_DefaultEncoding),
             num_channels=1,
         )
 
@@ -156,20 +151,46 @@ class TTS(tts.TTS):
                 "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
             )
 
+        if word_tokenizer is None:
+            word_tokenizer = tokenize.basic.WordTokenizer(
+                ignore_punctuation=False  # punctuation can help for intonation
+            )
+
         self._opts = _TTSOptions(
             voice=voice,
             model=model,
             api_key=api_key,
             base_url=base_url or API_BASE_URL_V1,
-            encoding=
+            encoding=_DefaultEncoding,
             sample_rate=self.sample_rate,
             streaming_latency=streaming_latency,
             word_tokenizer=word_tokenizer,
             chunk_length_schedule=chunk_length_schedule,
             enable_ssml_parsing=enable_ssml_parsing,
             language=language,
+            inactivity_timeout=inactivity_timeout,
         )
         self._session = http_session
+        self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
+            connect_cb=self._connect_ws,
+            close_cb=self._close_ws,
+            max_session_duration=inactivity_timeout,
+            mark_refreshed_on_get=True,
+        )
+        self._streams = weakref.WeakSet[SynthesizeStream]()
+
+    async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
+        session = self._ensure_session()
+        return await asyncio.wait_for(
+            session.ws_connect(
+                _stream_url(self._opts),
+                headers={AUTHORIZATION_HEADER: self._opts.api_key},
+            ),
+            self._conn_options.timeout,
+        )
+
+    async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
+        await ws.close()
 
     def _ensure_session(self) -> aiohttp.ClientSession:
         if not self._session:
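
The constructor now owns a `utils.ConnectionPool` of websockets that is recycled within `inactivity_timeout` and opened via `_connect_ws` above. A standalone sketch of that connect pattern with plain aiohttp (the function name and parameters here are illustrative, not part of the plugin):

```python
import asyncio
import aiohttp

async def connect_stream_ws(
    session: aiohttp.ClientSession, url: str, api_key: str, timeout: float
) -> aiohttp.ClientWebSocketResponse:
    # Same shape as _connect_ws above: open the stream-input websocket with
    # the xi-api-key header and bound the handshake with a timeout so a
    # stuck connection fails fast instead of blocking synthesis.
    return await asyncio.wait_for(
        session.ws_connect(url, headers={"xi-api-key": api_key}),
        timeout,
    )
```
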
@@ -177,6 +198,9 @@ class TTS(tts.TTS):
 
         return self._session
 
+    def prewarm(self) -> None:
+        self._pool.prewarm()
+
     async def list_voices(self) -> List[Voice]:
         async with self._ensure_session().get(
             f"{self._opts.base_url}/voices",
@@ -205,7 +229,7 @@ class TTS(tts.TTS):
         self,
         text: str,
         *,
-        conn_options: APIConnectOptions =
+        conn_options: Optional[APIConnectOptions] = None,
     ) -> "ChunkedStream":
         return ChunkedStream(
             tts=self,
@@ -216,14 +240,18 @@ class TTS(tts.TTS):
         )
 
     def stream(
-        self, *, conn_options: APIConnectOptions =
+        self, *, conn_options: Optional[APIConnectOptions] = None
     ) -> "SynthesizeStream":
-
-
-
-
-
-        )
+        stream = SynthesizeStream(tts=self, pool=self._pool, opts=self._opts)
+        self._streams.add(stream)
+        return stream
+
+    async def aclose(self) -> None:
+        for stream in list(self._streams):
+            await stream.aclose()
+        self._streams.clear()
+        await self._pool.aclose()
+        await super().aclose()
 
 
 class ChunkedStream(tts.ChunkedStream):
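
Taken together with `prewarm()` above, the 0.8.0 lifecycle looks roughly like the sketch below; values are placeholders, the text-push/frame-consumption loop is elided, and the sketch assumes the `TTS` class is re-exported at the package root as in other LiveKit plugins:

```python
# Rough usage sketch based only on the API visible in this diff.
from livekit.plugins import elevenlabs

tts = elevenlabs.TTS(
    model="eleven_flash_v2_5",
    inactivity_timeout=300,  # pooled websockets are recycled after this many idle seconds
)
tts.prewarm()                # open a pooled websocket before the first request
stream = tts.stream()        # SynthesizeStream backed by the shared connection pool
# ... push text into `stream` and consume synthesized frames ...
# finally: await stream.aclose() and await tts.aclose() to drain streams and the pool
```
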
@@ -235,20 +263,14 @@ class ChunkedStream(tts.ChunkedStream):
         tts: TTS,
         input_text: str,
         opts: _TTSOptions,
-        conn_options: APIConnectOptions,
+        conn_options: Optional[APIConnectOptions] = None,
         session: aiohttp.ClientSession,
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._session = opts, session
-        if _encoding_from_format(self._opts.encoding) == "mp3":
-            self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
 
     async def _run(self) -> None:
         request_id = utils.shortuuid()
-        bstream = utils.audio.AudioByteStream(
-            sample_rate=self._opts.sample_rate, num_channels=1
-        )
-
         voice_settings = (
             _strip_nones(dataclasses.asdict(self._opts.voice.settings))
             if self._opts.voice.settings
@@ -260,6 +282,12 @@ class ChunkedStream(tts.ChunkedStream):
             "voice_settings": voice_settings,
         }
 
+        decoder = utils.codecs.AudioStreamDecoder(
+            sample_rate=self._opts.sample_rate,
+            num_channels=1,
+        )
+
+        decode_task: asyncio.Task | None = None
         try:
             async with self._session.post(
                 _synthesize_url(self._opts),
@@ -271,32 +299,21 @@ class ChunkedStream(tts.ChunkedStream):
                     logger.error("11labs returned non-audio data: %s", content)
                     return
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                        tts.SynthesizedAudio(
-                            request_id=request_id,
-                            frame=frame,
-                        )
-                    )
-
-            for frame in bstream.flush():
-                self._event_ch.send_nowait(
-                    tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                )
-
+                async def _decode_loop():
+                    try:
+                        async for bytes_data, _ in resp.content.iter_chunks():
+                            decoder.push(bytes_data)
+                    finally:
+                        decoder.end_input()
+
+                decode_task = asyncio.create_task(_decode_loop())
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
         except asyncio.TimeoutError as e:
             raise APITimeoutError() from e
         except aiohttp.ClientResponseError as e:
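
The chunked path now feeds raw HTTP response chunks into an `AudioStreamDecoder` from a background task while the request coroutine iterates decoded frames. A generic sketch of that feed/consume split, assuming only the `push` / `end_input` / async-iteration interface used in the hunk above:

```python
import asyncio
import contextlib

async def feed_and_consume(chunks, decoder) -> None:
    # Generic form of the pattern above: a helper task pushes raw response
    # chunks into the decoder while the caller iterates decoded frames, and
    # end-of-input / cancellation are handled in finally blocks.
    async def _feed() -> None:
        try:
            async for chunk in chunks:
                decoder.push(chunk)
        finally:
            decoder.end_input()

    feed_task = asyncio.create_task(_feed())
    try:
        async for frame in decoder:
            ...  # hand each decoded frame to the audio emitter
    finally:
        feed_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await feed_task
```
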
@@ -308,6 +325,10 @@ class ChunkedStream(tts.ChunkedStream):
             ) from e
         except Exception as e:
             raise APIConnectionError() from e
+        finally:
+            if decode_task:
+                await utils.aio.gracefully_cancel(decode_task)
+            await decoder.aclose()
 
 
 class SynthesizeStream(tts.SynthesizeStream):
@@ -317,15 +338,14 @@ class SynthesizeStream(tts.SynthesizeStream):
         self,
         *,
         tts: TTS,
-
-        conn_options: APIConnectOptions,
+        pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
         opts: _TTSOptions,
     ):
-        super().__init__(tts=tts
-        self._opts, self.
-        self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
+        super().__init__(tts=tts)
+        self._opts, self._pool = opts, pool
 
     async def _run(self) -> None:
+        request_id = utils.shortuuid()
         self._segments_ch = utils.aio.Chan[tokenize.WordStream]()
 
         @utils.log_exceptions(logger=logger)
@@ -338,190 +358,179 @@ class SynthesizeStream(tts.SynthesizeStream):
                         # new segment (after flush for e.g)
                         word_stream = self._opts.word_tokenizer.stream()
                         self._segments_ch.send_nowait(word_stream)
-
                     word_stream.push_text(input)
                 elif isinstance(input, self._FlushSentinel):
                     if word_stream is not None:
                         word_stream.end_input()
-
                     word_stream = None
-
+            if word_stream is not None:
+                word_stream.end_input()
             self._segments_ch.close()
 
         @utils.log_exceptions(logger=logger)
-        async def
+        async def _process_segments():
             async for word_stream in self._segments_ch:
-                await self._run_ws(word_stream)
+                await self._run_ws(word_stream, request_id)
 
         tasks = [
             asyncio.create_task(_tokenize_input()),
-            asyncio.create_task(
+            asyncio.create_task(_process_segments()),
         ]
         try:
             await asyncio.gather(*tasks)
+        except asyncio.TimeoutError as e:
+            raise APITimeoutError() from e
+        except aiohttp.ClientResponseError as e:
+            raise APIStatusError(
+                message=e.message,
+                status_code=e.status,
+                request_id=request_id,
+                body=None,
+            ) from e
+        except Exception as e:
+            raise APIConnectionError() from e
         finally:
             await utils.aio.gracefully_cancel(*tasks)
 
     async def _run_ws(
         self,
         word_stream: tokenize.WordStream,
-
+        request_id: str,
     ) -> None:
-
-
-
-            try:
-                if try_i > 0:
-                    await asyncio.sleep(retry_delay)
+        async with self._pool.connection() as ws_conn:
+            segment_id = utils.shortuuid()
+            expected_text = ""  # accumulate all tokens sent
 
-
-                    _stream_url(self._opts),
-                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
-                )
-                break
-            except Exception as e:
-                logger.warning(
-                    f"failed to connect to 11labs, retrying in {retry_delay}s",
-                    exc_info=e,
-                )
-
-        if ws_conn is None:
-            raise Exception(f"failed to connect to 11labs after {max_retry} retries")
-
-        request_id = utils.shortuuid()
-        segment_id = utils.shortuuid()
-
-        # 11labs protocol expects the first message to be an "init msg"
-        init_pkt = dict(
-            text=" ",
-            try_trigger_generation=True,
-            voice_settings=_strip_nones(dataclasses.asdict(self._opts.voice.settings))
-            if self._opts.voice.settings
-            else None,
-            generation_config=dict(
-                chunk_length_schedule=self._opts.chunk_length_schedule
-            ),
-        )
-        await ws_conn.send_str(json.dumps(init_pkt))
-        eos_sent = False
-
-        async def send_task():
-            nonlocal eos_sent
-
-            xml_content = []
-            async for data in word_stream:
-                text = data.token
-
-                # send the xml phoneme in one go
-                if (
-                    self._opts.enable_ssml_parsing
-                    and data.token.startswith("<phoneme")
-                    or xml_content
-                ):
-                    xml_content.append(text)
-                    if data.token.find("</phoneme>") > -1:
-                        text = self._opts.word_tokenizer.format_words(xml_content)
-                        xml_content = []
-                    else:
-                        continue
-
-                # try_trigger_generation=True is a bad practice, we expose
-                # chunk_length_schedule instead
-                data_pkt = dict(
-                    text=f"{text} ",  # must always end with a space
-                    try_trigger_generation=False,
-                )
-                self._mark_started()
-                await ws_conn.send_str(json.dumps(data_pkt))
-
-            if xml_content:
-                logger.warning("11labs stream ended with incomplete xml content")
-
-            # no more token, mark eos
-            eos_pkt = dict(text="")
-            await ws_conn.send_str(json.dumps(eos_pkt))
-            eos_sent = True
-
-        async def recv_task():
-            nonlocal eos_sent
-            audio_bstream = utils.audio.AudioByteStream(
+            decoder = utils.codecs.AudioStreamDecoder(
                 sample_rate=self._opts.sample_rate,
                 num_channels=1,
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # 11labs protocol expects the first message to be an "init msg"
+            init_pkt = dict(
+                text=" ",
+                voice_settings=_strip_nones(
+                    dataclasses.asdict(self._opts.voice.settings)
+                )
+                if self._opts.voice.settings
+                else None,
+                generation_config=dict(
+                    chunk_length_schedule=self._opts.chunk_length_schedule
+                ),
+            )
+            await ws_conn.send_str(json.dumps(init_pkt))
+
+            @utils.log_exceptions(logger=logger)
+            async def send_task():
+                nonlocal expected_text
+                xml_content = []
+                async for data in word_stream:
+                    text = data.token
+                    expected_text += text
+                    # send the xml phoneme in one go
+                    if (
+                        self._opts.enable_ssml_parsing
+                        and data.token.startswith("<phoneme")
+                        or xml_content
+                    ):
+                        xml_content.append(text)
+                        if text.find("</phoneme>") > -1:
+                            text = self._opts.word_tokenizer.format_words(xml_content)
+                            xml_content = []
+                        else:
+                            continue
+
+                    data_pkt = dict(text=f"{text} ")  # must always end with a space
+                    self._mark_started()
+                    await ws_conn.send_str(json.dumps(data_pkt))
+                if xml_content:
+                    logger.warning("11labs stream ended with incomplete xml content")
+                await ws_conn.send_str(json.dumps({"flush": True}))
+
+            # consumes from decoder and generates events
+            @utils.log_exceptions(logger=logger)
+            async def generate_task():
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                    segment_id=segment_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+
+            # receives from ws and decodes audio
+            @utils.log_exceptions(logger=logger)
+            async def recv_task():
+                nonlocal expected_text
+                received_text = ""
+
+                while True:
+                    msg = await ws_conn.receive()
+                    if msg.type in (
+                        aiohttp.WSMsgType.CLOSED,
+                        aiohttp.WSMsgType.CLOSE,
+                        aiohttp.WSMsgType.CLOSING,
+                    ):
                         raise APIStatusError(
                             "11labs connection closed unexpectedly, not all tokens have been consumed",
                             request_id=request_id,
                         )
-                        return
 
-
-
-
-
-                data = json.loads(msg.data)
-                encoding = _encoding_from_format(self._opts.encoding)
-                if data.get("audio"):
-                    b64data = base64.b64decode(data["audio"])
-                    if encoding == "mp3":
-                        for frame in self._mp3_decoder.decode_chunk(b64data):
-                            for frame in audio_bstream.write(frame.data.tobytes()):
-                                _send_last_frame(segment_id=segment_id, is_final=False)
-                                last_frame = frame
+                    if msg.type != aiohttp.WSMsgType.TEXT:
+                        logger.warning("unexpected 11labs message type %s", msg.type)
+                        continue
 
+                    data = json.loads(msg.data)
+                    if data.get("audio"):
+                        b64data = base64.b64decode(data["audio"])
+                        decoder.push(b64data)
+
+                    if alignment := data.get("normalizedAlignment"):
+                        received_text += "".join(
+                            alignment.get("chars", [])
+                        ).replace(" ", "")
+                        if received_text == expected_text:
+                            decoder.end_input()
+                            break
+                    elif data.get("error"):
+                        raise APIStatusError(
+                            message=data["error"],
+                            status_code=500,
+                            request_id=request_id,
+                            body=None,
+                        )
                     else:
-
-
-
-
-
-
-                            _send_last_frame(segment_id=segment_id, is_final=False)
-                            last_frame = frame
-
-                        _send_last_frame(segment_id=segment_id, is_final=True)
-
-                        pass
-                elif data.get("error"):
-                    logger.error("11labs reported an error: %s", data["error"])
-                else:
-                    logger.error("unexpected 11labs message %s", data)
-
-        tasks = [
-            asyncio.create_task(send_task()),
-            asyncio.create_task(recv_task()),
-        ]
+                        raise APIStatusError(
+                            message=f"unexpected 11labs message {data}",
+                            status_code=500,
+                            request_id=request_id,
+                            body=None,
+                        )
 
-
-
-
-
-
-
+            tasks = [
+                asyncio.create_task(send_task()),
+                asyncio.create_task(recv_task()),
+                asyncio.create_task(generate_task()),
+            ]
+            try:
+                await asyncio.gather(*tasks)
+            except asyncio.TimeoutError as e:
+                raise APITimeoutError() from e
+            except aiohttp.ClientResponseError as e:
+                raise APIStatusError(
+                    message=e.message,
+                    status_code=e.status,
+                    request_id=request_id,
+                    body=None,
+                ) from e
+            except APIStatusError:
+                raise
+            except Exception as e:
+                raise APIConnectionError() from e
+            finally:
+                await utils.aio.gracefully_cancel(*tasks)
+                await decoder.aclose()
 
 
 def _dict_to_voices_list(data: dict[str, Any]):
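
The streaming path replaces the old end-of-stream sentinel with an explicit flush plus alignment tracking: an init message carries voice settings and the chunk schedule, each token is sent as a space-terminated text packet, a `{"flush": true}` packet closes the segment, and completion is detected when the accumulated `normalizedAlignment` characters match the text that was sent. A sketch of the outgoing message sequence implied by `send_task` above (payload values are illustrative; field names come from the diff):

```python
import json

messages = [
    {   # init message: blank text plus voice settings and chunk schedule
        "text": " ",
        "voice_settings": {"stability": 0.71, "similarity_boost": 0.5},
        "generation_config": {"chunk_length_schedule": [80, 120, 200, 260]},
    },
    {"text": "Hello world "},  # every text packet must end with a space
    {"flush": True},           # ask the server to synthesize whatever is buffered
]
for msg in messages:
    print(json.dumps(msg))
```
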
@@ -547,11 +556,13 @@ def _synthesize_url(opts: _TTSOptions) -> str:
     voice_id = opts.voice.id
     model_id = opts.model
     output_format = opts.encoding
-
-    return (
+    url = (
         f"{base_url}/text-to-speech/{voice_id}/stream?"
-        f"model_id={model_id}&output_format={output_format}
+        f"model_id={model_id}&output_format={output_format}"
     )
+    if opts.streaming_latency:
+        url += f"&optimize_streaming_latency={opts.streaming_latency}"
+    return url
 
 
 def _stream_url(opts: _TTSOptions) -> str:
@@ -559,14 +570,16 @@ def _stream_url(opts: _TTSOptions) -> str:
     voice_id = opts.voice.id
     model_id = opts.model
     output_format = opts.encoding
-    latency = opts.streaming_latency
     enable_ssml = str(opts.enable_ssml_parsing).lower()
     language = opts.language
+    inactivity_timeout = opts.inactivity_timeout
     url = (
         f"{base_url}/text-to-speech/{voice_id}/stream-input?"
-        f"model_id={model_id}&output_format={output_format}&
-        f"enable_ssml_parsing={enable_ssml}"
+        f"model_id={model_id}&output_format={output_format}&"
+        f"enable_ssml_parsing={enable_ssml}&inactivity_timeout={inactivity_timeout}"
     )
     if language is not None:
         url += f"&language_code={language}"
+    if opts.streaming_latency:
+        url += f"&optimize_streaming_latency={opts.streaming_latency}"
     return url
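
For reference, the query string assembled by `_stream_url` with the defaults in this diff would look roughly like the sketch below; the voice id is a placeholder and the optional parameters are only appended when set:

```python
# Rough shape of the stream-input request path for the defaults shown in
# this diff; VOICE_ID is a placeholder.
params = {
    "model_id": "eleven_flash_v2_5",
    "output_format": "mp3_44100",
    "enable_ssml_parsing": "false",
    "inactivity_timeout": "300",
}
query = "&".join(f"{k}={v}" for k, v in params.items())
path = f"/text-to-speech/VOICE_ID/stream-input?{query}"
# optional: "&language_code=<language>" and "&optimize_streaming_latency=<n>"
print(path)
```
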
{livekit_plugins_elevenlabs-0.7.13.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-elevenlabs
-Version: 0.7.13
+Version: 0.8.0
 Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit-agents[codecs]
+Requires-Dist: livekit-agents[codecs]<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD
ADDED
@@ -0,0 +1,10 @@
+livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
+livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
+livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
+livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
+livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
+livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
+livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,
livekit_plugins_elevenlabs-0.7.13.dist-info/RECORD
DELETED
@@ -1,10 +0,0 @@
-livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
-livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
-livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
-livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/elevenlabs/tts.py,sha256=dFeEtnNYR0sIDjQZARvUb6cG3VUD4gUNU3hpbFGpyNo,19744
-livekit/plugins/elevenlabs/version.py,sha256=ePihhrwb0N1YVz4mZBMdwgECen0up-RbS8yrvmQGHt4,601
-livekit_plugins_elevenlabs-0.7.13.dist-info/METADATA,sha256=en67LviRFvRsErhZ5qvb8UvbmQc2nLcYSijszgdMj1Q,1523
-livekit_plugins_elevenlabs-0.7.13.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-livekit_plugins_elevenlabs-0.7.13.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_elevenlabs-0.7.13.dist-info/RECORD,,
{livekit_plugins_elevenlabs-0.7.13.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/top_level.txt
File without changes