livekit-plugins-elevenlabs 0.8.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/__init__.py +2 -2
- livekit/plugins/elevenlabs/models.py +9 -1
- livekit/plugins/elevenlabs/tts.py +221 -232
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.8.0.dist-info → livekit_plugins_elevenlabs-1.0.0.dist-info}/METADATA +10 -20
- livekit_plugins_elevenlabs-1.0.0.dist-info/RECORD +9 -0
- {livekit_plugins_elevenlabs-0.8.0.dist-info → livekit_plugins_elevenlabs-1.0.0.dist-info}/WHEEL +1 -2
- livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD +0 -10
- livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt +0 -1
@@ -13,7 +13,7 @@
|
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
15
|
from .models import TTSEncoding, TTSModels
|
16
|
-
from .tts import
|
16
|
+
from .tts import DEFAULT_VOICE_ID, TTS, Voice, VoiceSettings
|
17
17
|
from .version import __version__
|
18
18
|
|
19
19
|
__all__ = [
|
@@ -22,7 +22,7 @@ __all__ = [
|
|
22
22
|
"VoiceSettings",
|
23
23
|
"TTSEncoding",
|
24
24
|
"TTSModels",
|
25
|
-
"
|
25
|
+
"DEFAULT_VOICE_ID",
|
26
26
|
"__version__",
|
27
27
|
]
|
28
28
|
|
@@ -10,4 +10,12 @@ TTSModels = Literal[
|
|
10
10
|
"eleven_flash_v2",
|
11
11
|
]
|
12
12
|
|
13
|
-
TTSEncoding = Literal[
|
13
|
+
TTSEncoding = Literal[
|
14
|
+
"mp3_22050_32",
|
15
|
+
"mp3_44100",
|
16
|
+
"mp3_44100_32",
|
17
|
+
"mp3_44100_64",
|
18
|
+
"mp3_44100_96",
|
19
|
+
"mp3_44100_128",
|
20
|
+
"mp3_44100_192",
|
21
|
+
]
|
@@ -21,9 +21,10 @@ import json
|
|
21
21
|
import os
|
22
22
|
import weakref
|
23
23
|
from dataclasses import dataclass
|
24
|
-
from typing import Any
|
24
|
+
from typing import Any
|
25
25
|
|
26
26
|
import aiohttp
|
27
|
+
|
27
28
|
from livekit.agents import (
|
28
29
|
APIConnectionError,
|
29
30
|
APIConnectOptions,
|
@@ -33,11 +34,19 @@ from livekit.agents import (
|
|
33
34
|
tts,
|
34
35
|
utils,
|
35
36
|
)
|
37
|
+
from livekit.agents.types import (
|
38
|
+
DEFAULT_API_CONNECT_OPTIONS,
|
39
|
+
NOT_GIVEN,
|
40
|
+
NotGivenOr,
|
41
|
+
)
|
42
|
+
from livekit.agents.utils import is_given
|
36
43
|
|
37
44
|
from .log import logger
|
38
45
|
from .models import TTSEncoding, TTSModels
|
39
46
|
|
40
|
-
|
47
|
+
# by default, use 22.05kHz sample rate at 32kbps
|
48
|
+
# in our testing, this reduces TTFB by ~110ms
|
49
|
+
_DefaultEncoding: TTSEncoding = "mp3_22050_32"
|
41
50
|
|
42
51
|
|
43
52
|
def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
@@ -49,9 +58,9 @@ def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
|
49
58
|
class VoiceSettings:
|
50
59
|
stability: float # [0.0 - 1.0]
|
51
60
|
similarity_boost: float # [0.0 - 1.0]
|
52
|
-
style: float
|
53
|
-
speed: float
|
54
|
-
use_speaker_boost: bool
|
61
|
+
style: NotGivenOr[float] = NOT_GIVEN # [0.0 - 1.0]
|
62
|
+
speed: NotGivenOr[float] = NOT_GIVEN # [0.8 - 1.2]
|
63
|
+
use_speaker_boost: NotGivenOr[bool] = NOT_GIVEN
|
55
64
|
|
56
65
|
|
57
66
|
@dataclass
|
@@ -59,22 +68,9 @@ class Voice:
|
|
59
68
|
id: str
|
60
69
|
name: str
|
61
70
|
category: str
|
62
|
-
settings: VoiceSettings | None = None
|
63
|
-
|
64
|
-
|
65
|
-
DEFAULT_VOICE = Voice(
|
66
|
-
id="EXAVITQu4vr4xnSDxMaL",
|
67
|
-
name="Bella",
|
68
|
-
category="premade",
|
69
|
-
settings=VoiceSettings(
|
70
|
-
stability=0.71,
|
71
|
-
speed=1.0,
|
72
|
-
similarity_boost=0.5,
|
73
|
-
style=0.0,
|
74
|
-
use_speaker_boost=True,
|
75
|
-
),
|
76
|
-
)
|
77
71
|
|
72
|
+
|
73
|
+
DEFAULT_VOICE_ID = "EXAVITQu4vr4xnSDxMaL"
|
78
74
|
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
|
79
75
|
AUTHORIZATION_HEADER = "xi-api-key"
|
80
76
|
WS_INACTIVITY_TIMEOUT = 300
|
@@ -83,13 +79,14 @@ WS_INACTIVITY_TIMEOUT = 300
|
|
83
79
|
@dataclass
|
84
80
|
class _TTSOptions:
|
85
81
|
api_key: str
|
86
|
-
|
82
|
+
voice_id: str
|
83
|
+
voice_settings: NotGivenOr[VoiceSettings]
|
87
84
|
model: TTSModels | str
|
88
|
-
language: str
|
85
|
+
language: NotGivenOr[str]
|
89
86
|
base_url: str
|
90
87
|
encoding: TTSEncoding
|
91
88
|
sample_rate: int
|
92
|
-
streaming_latency: int
|
89
|
+
streaming_latency: NotGivenOr[int]
|
93
90
|
word_tokenizer: tokenize.WordTokenizer
|
94
91
|
chunk_length_schedule: list[int]
|
95
92
|
enable_ssml_parsing: bool
|
@@ -100,68 +97,70 @@ class TTS(tts.TTS):
|
|
100
97
|
def __init__(
|
101
98
|
self,
|
102
99
|
*,
|
103
|
-
|
100
|
+
voice_id: str = DEFAULT_VOICE_ID,
|
101
|
+
voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN,
|
104
102
|
model: TTSModels | str = "eleven_flash_v2_5",
|
105
|
-
|
106
|
-
|
107
|
-
|
103
|
+
encoding: NotGivenOr[TTSEncoding] = NOT_GIVEN,
|
104
|
+
api_key: NotGivenOr[str] = NOT_GIVEN,
|
105
|
+
base_url: NotGivenOr[str] = NOT_GIVEN,
|
106
|
+
streaming_latency: NotGivenOr[int] = NOT_GIVEN,
|
108
107
|
inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
|
109
|
-
word_tokenizer:
|
108
|
+
word_tokenizer: NotGivenOr[tokenize.WordTokenizer] = NOT_GIVEN,
|
110
109
|
enable_ssml_parsing: bool = False,
|
111
|
-
chunk_length_schedule: list[int] =
|
110
|
+
chunk_length_schedule: NotGivenOr[list[int]] = NOT_GIVEN, # range is [50, 500]
|
112
111
|
http_session: aiohttp.ClientSession | None = None,
|
113
|
-
|
114
|
-
model_id: TTSModels | str | None = None,
|
115
|
-
language: str | None = None,
|
112
|
+
language: NotGivenOr[str] = NOT_GIVEN,
|
116
113
|
) -> None:
|
117
114
|
"""
|
118
115
|
Create a new instance of ElevenLabs TTS.
|
119
116
|
|
120
117
|
Args:
|
121
|
-
|
118
|
+
voice_id (str): Voice ID. Defaults to `DEFAULT_VOICE_ID`.
|
119
|
+
voice_settings (NotGivenOr[VoiceSettings]): Voice settings.
|
122
120
|
model (TTSModels | str): TTS model to use. Defaults to "eleven_flash_v2_5".
|
123
|
-
api_key (str
|
124
|
-
base_url (str
|
125
|
-
streaming_latency (int): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
|
121
|
+
api_key (NotGivenOr[str]): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
|
122
|
+
base_url (NotGivenOr[str]): Custom base URL for the API. Optional.
|
123
|
+
streaming_latency (NotGivenOr[int]): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
|
126
124
|
inactivity_timeout (int): Inactivity timeout in seconds for the websocket connection. Defaults to 300.
|
127
|
-
word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
|
125
|
+
word_tokenizer (NotGivenOr[tokenize.WordTokenizer]): Tokenizer for processing text. Defaults to basic WordTokenizer.
|
128
126
|
enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
|
129
|
-
chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
|
127
|
+
chunk_length_schedule (NotGivenOr[list[int]]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
|
130
128
|
http_session (aiohttp.ClientSession | None): Custom HTTP session for API requests. Optional.
|
131
|
-
language (str
|
132
|
-
"""
|
129
|
+
language (NotGivenOr[str]): Language code for the TTS model, as of 10/24/24 only valid for "eleven_turbo_v2_5".
|
130
|
+
""" # noqa: E501
|
131
|
+
|
132
|
+
if not is_given(chunk_length_schedule):
|
133
|
+
chunk_length_schedule = [80, 120, 200, 260]
|
134
|
+
|
135
|
+
if not is_given(encoding):
|
136
|
+
encoding = _DefaultEncoding
|
133
137
|
|
134
138
|
super().__init__(
|
135
139
|
capabilities=tts.TTSCapabilities(
|
136
140
|
streaming=True,
|
137
141
|
),
|
138
|
-
sample_rate=_sample_rate_from_format(
|
142
|
+
sample_rate=_sample_rate_from_format(encoding),
|
139
143
|
num_channels=1,
|
140
144
|
)
|
141
145
|
|
142
|
-
if
|
143
|
-
|
144
|
-
"model_id is deprecated and will be removed in 1.5.0, use model instead",
|
145
|
-
)
|
146
|
-
model = model_id
|
147
|
-
|
148
|
-
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
149
|
-
if not api_key:
|
146
|
+
elevenlabs_api_key = api_key if is_given(api_key) else os.environ.get("ELEVEN_API_KEY")
|
147
|
+
if not elevenlabs_api_key:
|
150
148
|
raise ValueError(
|
151
|
-
"ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
|
149
|
+
"ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable" # noqa: E501
|
152
150
|
)
|
153
151
|
|
154
|
-
if word_tokenizer
|
152
|
+
if not is_given(word_tokenizer):
|
155
153
|
word_tokenizer = tokenize.basic.WordTokenizer(
|
156
154
|
ignore_punctuation=False # punctuation can help for intonation
|
157
155
|
)
|
158
156
|
|
159
157
|
self._opts = _TTSOptions(
|
160
|
-
|
158
|
+
voice_id=voice_id,
|
159
|
+
voice_settings=voice_settings,
|
161
160
|
model=model,
|
162
|
-
api_key=
|
163
|
-
base_url=base_url
|
164
|
-
encoding=
|
161
|
+
api_key=elevenlabs_api_key,
|
162
|
+
base_url=base_url if is_given(base_url) else API_BASE_URL_V1,
|
163
|
+
encoding=encoding,
|
165
164
|
sample_rate=self.sample_rate,
|
166
165
|
streaming_latency=streaming_latency,
|
167
166
|
word_tokenizer=word_tokenizer,
|
@@ -171,37 +170,15 @@ class TTS(tts.TTS):
|
|
171
170
|
inactivity_timeout=inactivity_timeout,
|
172
171
|
)
|
173
172
|
self._session = http_session
|
174
|
-
self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
|
175
|
-
connect_cb=self._connect_ws,
|
176
|
-
close_cb=self._close_ws,
|
177
|
-
max_session_duration=inactivity_timeout,
|
178
|
-
mark_refreshed_on_get=True,
|
179
|
-
)
|
180
173
|
self._streams = weakref.WeakSet[SynthesizeStream]()
|
181
174
|
|
182
|
-
async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
|
183
|
-
session = self._ensure_session()
|
184
|
-
return await asyncio.wait_for(
|
185
|
-
session.ws_connect(
|
186
|
-
_stream_url(self._opts),
|
187
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
188
|
-
),
|
189
|
-
self._conn_options.timeout,
|
190
|
-
)
|
191
|
-
|
192
|
-
async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
|
193
|
-
await ws.close()
|
194
|
-
|
195
175
|
def _ensure_session(self) -> aiohttp.ClientSession:
|
196
176
|
if not self._session:
|
197
177
|
self._session = utils.http_context.http_session()
|
198
178
|
|
199
179
|
return self._session
|
200
180
|
|
201
|
-
def
|
202
|
-
self._pool.prewarm()
|
203
|
-
|
204
|
-
async def list_voices(self) -> List[Voice]:
|
181
|
+
async def list_voices(self) -> list[Voice]:
|
205
182
|
async with self._ensure_session().get(
|
206
183
|
f"{self._opts.base_url}/voices",
|
207
184
|
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
@@ -211,26 +188,33 @@ class TTS(tts.TTS):
|
|
211
188
|
def update_options(
|
212
189
|
self,
|
213
190
|
*,
|
214
|
-
|
215
|
-
|
216
|
-
|
191
|
+
voice_id: NotGivenOr[str] = NOT_GIVEN,
|
192
|
+
voice_settings: NotGivenOr[VoiceSettings] = NOT_GIVEN,
|
193
|
+
model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
|
194
|
+
language: NotGivenOr[str] = NOT_GIVEN,
|
217
195
|
) -> None:
|
218
196
|
"""
|
219
197
|
Args:
|
220
|
-
|
221
|
-
|
222
|
-
|
198
|
+
voice_id (NotGivenOr[str]): Voice ID.
|
199
|
+
voice_settings (NotGivenOr[VoiceSettings]): Voice settings.
|
200
|
+
model (NotGivenOr[TTSModels | str]): TTS model to use.
|
201
|
+
language (NotGivenOr[str]): Language code for the TTS model.
|
223
202
|
"""
|
224
|
-
|
225
|
-
|
226
|
-
|
203
|
+
if is_given(model):
|
204
|
+
self._opts.model = model
|
205
|
+
if is_given(voice_id):
|
206
|
+
self._opts.voice_id = voice_id
|
207
|
+
if is_given(voice_settings):
|
208
|
+
self._opts.voice_settings = voice_settings
|
209
|
+
if is_given(language):
|
210
|
+
self._opts.language = language
|
227
211
|
|
228
212
|
def synthesize(
|
229
213
|
self,
|
230
214
|
text: str,
|
231
215
|
*,
|
232
|
-
conn_options:
|
233
|
-
) ->
|
216
|
+
conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
|
217
|
+
) -> ChunkedStream:
|
234
218
|
return ChunkedStream(
|
235
219
|
tts=self,
|
236
220
|
input_text=text,
|
@@ -240,9 +224,14 @@ class TTS(tts.TTS):
|
|
240
224
|
)
|
241
225
|
|
242
226
|
def stream(
|
243
|
-
self, *, conn_options:
|
244
|
-
) ->
|
245
|
-
stream = SynthesizeStream(
|
227
|
+
self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
|
228
|
+
) -> SynthesizeStream:
|
229
|
+
stream = SynthesizeStream(
|
230
|
+
tts=self,
|
231
|
+
conn_options=conn_options,
|
232
|
+
opts=self._opts,
|
233
|
+
session=self._ensure_session(),
|
234
|
+
)
|
246
235
|
self._streams.add(stream)
|
247
236
|
return stream
|
248
237
|
|
@@ -250,7 +239,6 @@ class TTS(tts.TTS):
|
|
250
239
|
for stream in list(self._streams):
|
251
240
|
await stream.aclose()
|
252
241
|
self._streams.clear()
|
253
|
-
await self._pool.aclose()
|
254
242
|
await super().aclose()
|
255
243
|
|
256
244
|
|
@@ -263,7 +251,7 @@ class ChunkedStream(tts.ChunkedStream):
|
|
263
251
|
tts: TTS,
|
264
252
|
input_text: str,
|
265
253
|
opts: _TTSOptions,
|
266
|
-
conn_options:
|
254
|
+
conn_options: APIConnectOptions,
|
267
255
|
session: aiohttp.ClientSession,
|
268
256
|
) -> None:
|
269
257
|
super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
|
@@ -272,8 +260,8 @@ class ChunkedStream(tts.ChunkedStream):
|
|
272
260
|
async def _run(self) -> None:
|
273
261
|
request_id = utils.shortuuid()
|
274
262
|
voice_settings = (
|
275
|
-
_strip_nones(dataclasses.asdict(self._opts.
|
276
|
-
if self._opts.
|
263
|
+
_strip_nones(dataclasses.asdict(self._opts.voice_settings))
|
264
|
+
if is_given(self._opts.voice_settings)
|
277
265
|
else None
|
278
266
|
)
|
279
267
|
data = {
|
@@ -338,11 +326,12 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
338
326
|
self,
|
339
327
|
*,
|
340
328
|
tts: TTS,
|
341
|
-
|
329
|
+
session: aiohttp.ClientSession,
|
342
330
|
opts: _TTSOptions,
|
331
|
+
conn_options: APIConnectOptions,
|
343
332
|
):
|
344
|
-
super().__init__(tts=tts)
|
345
|
-
self._opts, self.
|
333
|
+
super().__init__(tts=tts, conn_options=conn_options)
|
334
|
+
self._opts, self._session = opts, session
|
346
335
|
|
347
336
|
async def _run(self) -> None:
|
348
337
|
request_id = utils.shortuuid()
|
@@ -397,177 +386,177 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
397
386
|
word_stream: tokenize.WordStream,
|
398
387
|
request_id: str,
|
399
388
|
) -> None:
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
decoder = utils.codecs.AudioStreamDecoder(
|
405
|
-
sample_rate=self._opts.sample_rate,
|
406
|
-
num_channels=1,
|
407
|
-
)
|
389
|
+
ws_conn = await self._session.ws_connect(
|
390
|
+
_stream_url(self._opts),
|
391
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
392
|
+
)
|
408
393
|
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
)
|
415
|
-
if self._opts.voice.settings
|
416
|
-
else None,
|
417
|
-
generation_config=dict(
|
418
|
-
chunk_length_schedule=self._opts.chunk_length_schedule
|
419
|
-
),
|
420
|
-
)
|
421
|
-
await ws_conn.send_str(json.dumps(init_pkt))
|
422
|
-
|
423
|
-
@utils.log_exceptions(logger=logger)
|
424
|
-
async def send_task():
|
425
|
-
nonlocal expected_text
|
426
|
-
xml_content = []
|
427
|
-
async for data in word_stream:
|
428
|
-
text = data.token
|
429
|
-
expected_text += text
|
430
|
-
# send the xml phoneme in one go
|
431
|
-
if (
|
432
|
-
self._opts.enable_ssml_parsing
|
433
|
-
and data.token.startswith("<phoneme")
|
434
|
-
or xml_content
|
435
|
-
):
|
436
|
-
xml_content.append(text)
|
437
|
-
if text.find("</phoneme>") > -1:
|
438
|
-
text = self._opts.word_tokenizer.format_words(xml_content)
|
439
|
-
xml_content = []
|
440
|
-
else:
|
441
|
-
continue
|
442
|
-
|
443
|
-
data_pkt = dict(text=f"{text} ") # must always end with a space
|
444
|
-
self._mark_started()
|
445
|
-
await ws_conn.send_str(json.dumps(data_pkt))
|
446
|
-
if xml_content:
|
447
|
-
logger.warning("11labs stream ended with incomplete xml content")
|
448
|
-
await ws_conn.send_str(json.dumps({"flush": True}))
|
449
|
-
|
450
|
-
# consumes from decoder and generates events
|
451
|
-
@utils.log_exceptions(logger=logger)
|
452
|
-
async def generate_task():
|
453
|
-
emitter = tts.SynthesizedAudioEmitter(
|
454
|
-
event_ch=self._event_ch,
|
455
|
-
request_id=request_id,
|
456
|
-
segment_id=segment_id,
|
457
|
-
)
|
458
|
-
async for frame in decoder:
|
459
|
-
emitter.push(frame)
|
460
|
-
emitter.flush()
|
394
|
+
segment_id = utils.shortuuid()
|
395
|
+
decoder = utils.codecs.AudioStreamDecoder(
|
396
|
+
sample_rate=self._opts.sample_rate,
|
397
|
+
num_channels=1,
|
398
|
+
)
|
461
399
|
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
aiohttp.WSMsgType.CLOSE,
|
473
|
-
aiohttp.WSMsgType.CLOSING,
|
474
|
-
):
|
475
|
-
raise APIStatusError(
|
476
|
-
"11labs connection closed unexpectedly, not all tokens have been consumed",
|
477
|
-
request_id=request_id,
|
478
|
-
)
|
400
|
+
# 11labs protocol expects the first message to be an "init msg"
|
401
|
+
init_pkt = {
|
402
|
+
"text": " ",
|
403
|
+
"voice_settings": _strip_nones(dataclasses.asdict(self._opts.voice_settings))
|
404
|
+
if is_given(self._opts.voice_settings)
|
405
|
+
else None,
|
406
|
+
"generation_config": {"chunk_length_schedule": self._opts.chunk_length_schedule},
|
407
|
+
}
|
408
|
+
await ws_conn.send_str(json.dumps(init_pkt))
|
409
|
+
eos_sent = False
|
479
410
|
|
480
|
-
|
481
|
-
|
411
|
+
@utils.log_exceptions(logger=logger)
|
412
|
+
async def send_task():
|
413
|
+
nonlocal eos_sent
|
414
|
+
xml_content = []
|
415
|
+
async for data in word_stream:
|
416
|
+
text = data.token
|
417
|
+
# send the xml phoneme in one go
|
418
|
+
if (
|
419
|
+
self._opts.enable_ssml_parsing
|
420
|
+
and data.token.startswith("<phoneme")
|
421
|
+
or xml_content
|
422
|
+
):
|
423
|
+
xml_content.append(text)
|
424
|
+
if data.token.find("</phoneme>") > -1:
|
425
|
+
text = self._opts.word_tokenizer.format_words(xml_content)
|
426
|
+
xml_content = []
|
427
|
+
else:
|
482
428
|
continue
|
483
429
|
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
430
|
+
data_pkt = {"text": f"{text} "} # must always end with a space
|
431
|
+
self._mark_started()
|
432
|
+
await ws_conn.send_str(json.dumps(data_pkt))
|
433
|
+
if xml_content:
|
434
|
+
logger.warning("11labs stream ended with incomplete xml content")
|
435
|
+
|
436
|
+
# no more token, mark eos
|
437
|
+
eos_pkt = {"text": ""}
|
438
|
+
await ws_conn.send_str(json.dumps(eos_pkt))
|
439
|
+
eos_sent = True
|
440
|
+
|
441
|
+
# consumes from decoder and generates events
|
442
|
+
@utils.log_exceptions(logger=logger)
|
443
|
+
async def generate_task():
|
444
|
+
emitter = tts.SynthesizedAudioEmitter(
|
445
|
+
event_ch=self._event_ch,
|
446
|
+
request_id=request_id,
|
447
|
+
segment_id=segment_id,
|
448
|
+
)
|
449
|
+
async for frame in decoder:
|
450
|
+
emitter.push(frame)
|
451
|
+
emitter.flush()
|
452
|
+
|
453
|
+
# receives from ws and decodes audio
|
454
|
+
@utils.log_exceptions(logger=logger)
|
455
|
+
async def recv_task():
|
456
|
+
nonlocal eos_sent
|
457
|
+
|
458
|
+
while True:
|
459
|
+
msg = await ws_conn.receive()
|
460
|
+
if msg.type in (
|
461
|
+
aiohttp.WSMsgType.CLOSED,
|
462
|
+
aiohttp.WSMsgType.CLOSE,
|
463
|
+
aiohttp.WSMsgType.CLOSING,
|
464
|
+
):
|
465
|
+
if not eos_sent:
|
504
466
|
raise APIStatusError(
|
505
|
-
|
506
|
-
status_code=500,
|
467
|
+
"11labs connection closed unexpectedly, not all tokens have been consumed", # noqa: E501
|
507
468
|
request_id=request_id,
|
508
|
-
body=None,
|
509
469
|
)
|
470
|
+
return
|
510
471
|
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
472
|
+
if msg.type != aiohttp.WSMsgType.TEXT:
|
473
|
+
logger.warning("unexpected 11labs message type %s", msg.type)
|
474
|
+
continue
|
475
|
+
|
476
|
+
data = json.loads(msg.data)
|
477
|
+
if data.get("audio"):
|
478
|
+
b64data = base64.b64decode(data["audio"])
|
479
|
+
decoder.push(b64data)
|
480
|
+
|
481
|
+
elif data.get("isFinal"):
|
482
|
+
decoder.end_input()
|
483
|
+
break
|
484
|
+
elif data.get("error"):
|
485
|
+
raise APIStatusError(
|
486
|
+
message=data["error"],
|
487
|
+
status_code=500,
|
488
|
+
request_id=request_id,
|
489
|
+
body=None,
|
490
|
+
)
|
491
|
+
else:
|
492
|
+
raise APIStatusError(
|
493
|
+
message=f"unexpected 11labs message {data}",
|
494
|
+
status_code=500,
|
495
|
+
request_id=request_id,
|
496
|
+
body=None,
|
497
|
+
)
|
498
|
+
|
499
|
+
tasks = [
|
500
|
+
asyncio.create_task(send_task()),
|
501
|
+
asyncio.create_task(recv_task()),
|
502
|
+
asyncio.create_task(generate_task()),
|
503
|
+
]
|
504
|
+
try:
|
505
|
+
await asyncio.gather(*tasks)
|
506
|
+
except asyncio.TimeoutError as e:
|
507
|
+
raise APITimeoutError() from e
|
508
|
+
except aiohttp.ClientResponseError as e:
|
509
|
+
raise APIStatusError(
|
510
|
+
message=e.message,
|
511
|
+
status_code=e.status,
|
512
|
+
request_id=request_id,
|
513
|
+
body=None,
|
514
|
+
) from e
|
515
|
+
except APIStatusError:
|
516
|
+
raise
|
517
|
+
except Exception as e:
|
518
|
+
raise APIConnectionError() from e
|
519
|
+
finally:
|
520
|
+
await utils.aio.gracefully_cancel(*tasks)
|
521
|
+
await decoder.aclose()
|
522
|
+
if ws_conn is not None:
|
523
|
+
await ws_conn.close()
|
534
524
|
|
535
525
|
|
536
526
|
def _dict_to_voices_list(data: dict[str, Any]):
|
537
|
-
voices:
|
527
|
+
voices: list[Voice] = []
|
538
528
|
for voice in data["voices"]:
|
539
529
|
voices.append(
|
540
530
|
Voice(
|
541
531
|
id=voice["voice_id"],
|
542
532
|
name=voice["name"],
|
543
533
|
category=voice["category"],
|
544
|
-
settings=None,
|
545
534
|
)
|
546
535
|
)
|
547
536
|
return voices
|
548
537
|
|
549
538
|
|
550
539
|
def _strip_nones(data: dict[str, Any]):
|
551
|
-
return {k: v for k, v in data.items() if v is not None}
|
540
|
+
return {k: v for k, v in data.items() if is_given(v) and v is not None}
|
552
541
|
|
553
542
|
|
554
543
|
def _synthesize_url(opts: _TTSOptions) -> str:
|
555
544
|
base_url = opts.base_url
|
556
|
-
voice_id = opts.
|
545
|
+
voice_id = opts.voice_id
|
557
546
|
model_id = opts.model
|
558
547
|
output_format = opts.encoding
|
559
548
|
url = (
|
560
549
|
f"{base_url}/text-to-speech/{voice_id}/stream?"
|
561
550
|
f"model_id={model_id}&output_format={output_format}"
|
562
551
|
)
|
563
|
-
if opts.streaming_latency:
|
552
|
+
if is_given(opts.streaming_latency):
|
564
553
|
url += f"&optimize_streaming_latency={opts.streaming_latency}"
|
565
554
|
return url
|
566
555
|
|
567
556
|
|
568
557
|
def _stream_url(opts: _TTSOptions) -> str:
|
569
558
|
base_url = opts.base_url
|
570
|
-
voice_id = opts.
|
559
|
+
voice_id = opts.voice_id
|
571
560
|
model_id = opts.model
|
572
561
|
output_format = opts.encoding
|
573
562
|
enable_ssml = str(opts.enable_ssml_parsing).lower()
|
@@ -578,8 +567,8 @@ def _stream_url(opts: _TTSOptions) -> str:
|
|
578
567
|
f"model_id={model_id}&output_format={output_format}&"
|
579
568
|
f"enable_ssml_parsing={enable_ssml}&inactivity_timeout={inactivity_timeout}"
|
580
569
|
)
|
581
|
-
if language
|
570
|
+
if is_given(language):
|
582
571
|
url += f"&language_code={language}"
|
583
|
-
if opts.streaming_latency:
|
572
|
+
if is_given(opts.streaming_latency):
|
584
573
|
url += f"&optimize_streaming_latency={opts.streaming_latency}"
|
585
574
|
return url
|
{livekit_plugins_elevenlabs-0.8.0.dist-info → livekit_plugins_elevenlabs-1.0.0.dist-info}/METADATA
RENAMED
@@ -1,35 +1,25 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.4
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.
|
3
|
+
Version: 1.0.0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
|
-
Home-page: https://github.com/livekit/agents
|
6
|
-
License: Apache-2.0
|
7
5
|
Project-URL: Documentation, https://docs.livekit.io
|
8
6
|
Project-URL: Website, https://livekit.io/
|
9
7
|
Project-URL: Source, https://github.com/livekit/agents
|
10
|
-
|
8
|
+
Author-email: LiveKit <hello@livekit.io>
|
9
|
+
License-Expression: Apache-2.0
|
10
|
+
Keywords: audio,elevenlabs,livekit,realtime,video,webrtc
|
11
11
|
Classifier: Intended Audience :: Developers
|
12
12
|
Classifier: License :: OSI Approved :: Apache Software License
|
13
|
-
Classifier: Topic :: Multimedia :: Sound/Audio
|
14
|
-
Classifier: Topic :: Multimedia :: Video
|
15
|
-
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
16
13
|
Classifier: Programming Language :: Python :: 3
|
14
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
17
15
|
Classifier: Programming Language :: Python :: 3.9
|
18
16
|
Classifier: Programming Language :: Python :: 3.10
|
19
|
-
Classifier:
|
17
|
+
Classifier: Topic :: Multimedia :: Sound/Audio
|
18
|
+
Classifier: Topic :: Multimedia :: Video
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
20
|
Requires-Python: >=3.9.0
|
21
|
+
Requires-Dist: livekit-agents[codecs]>=1.0.0
|
21
22
|
Description-Content-Type: text/markdown
|
22
|
-
Requires-Dist: livekit-agents[codecs]<1.0.0,>=0.12.16
|
23
|
-
Dynamic: classifier
|
24
|
-
Dynamic: description
|
25
|
-
Dynamic: description-content-type
|
26
|
-
Dynamic: home-page
|
27
|
-
Dynamic: keywords
|
28
|
-
Dynamic: license
|
29
|
-
Dynamic: project-url
|
30
|
-
Dynamic: requires-dist
|
31
|
-
Dynamic: requires-python
|
32
|
-
Dynamic: summary
|
33
23
|
|
34
24
|
# LiveKit Plugins Elevenlabs
|
35
25
|
|
@@ -0,0 +1,9 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=Va24UYTuuosmRuTcuzd_DIHYQOgV-wSYKJIXmOSB2Go,1255
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=p_wHEz15bdsNEqwzN831ysm70PNWQ-xeN__BKvGPZxA,401
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=gs9p4TwBAYX3vlsNn2XQ-oyPNUGcuvgix8K7vChRMmc,19985
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=nW89L_U9N4ukT3wAO3BeTqOaa87zLUOsEFz8TkiKIP8,600
|
7
|
+
livekit_plugins_elevenlabs-1.0.0.dist-info/METADATA,sha256=EL7wso-EPaWpWwQ5OtxwDaIueFvHrSBEy7PPCigZ8SI,1312
|
8
|
+
livekit_plugins_elevenlabs-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
9
|
+
livekit_plugins_elevenlabs-1.0.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
|
7
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
|
8
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
9
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,
|
@@ -1 +0,0 @@
|
|
1
|
-
livekit
|