livekit-plugins-elevenlabs 0.4.dev2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/__init__.py +10 -1
- livekit/plugins/elevenlabs/models.py +12 -0
- livekit/plugins/elevenlabs/tts.py +327 -191
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.0.dist-info}/METADATA +2 -2
- livekit_plugins_elevenlabs-0.5.0.dist-info/RECORD +10 -0
- livekit_plugins_elevenlabs-0.4.dev2.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.0.dist-info}/WHEEL +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.0.dist-info}/top_level.txt +0 -0
@@ -12,10 +12,19 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
from .models import TTSEncoding, TTSModels
|
15
16
|
from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
|
16
17
|
from .version import __version__
|
17
18
|
|
18
|
-
__all__ = [
|
19
|
+
__all__ = [
|
20
|
+
"TTS",
|
21
|
+
"Voice",
|
22
|
+
"VoiceSettings",
|
23
|
+
"TTSEncoding",
|
24
|
+
"TTSModels",
|
25
|
+
"DEFAULT_VOICE",
|
26
|
+
"__version__",
|
27
|
+
]
|
19
28
|
|
20
29
|
from livekit.agents import Plugin
|
21
30
|
|
@@ -6,3 +6,15 @@ TTSModels = Literal[
|
|
6
6
|
"eleven_multilingual_v2",
|
7
7
|
"eleven_turbo_v2",
|
8
8
|
]
|
9
|
+
|
10
|
+
TTSEncoding = Literal[
|
11
|
+
"mp3_22050_32",
|
12
|
+
"mp3_44100_32",
|
13
|
+
"mp3_44100_64",
|
14
|
+
"mp3_44100_96",
|
15
|
+
"mp3_44100_128",
|
16
|
+
"mp3_44100_192",
|
17
|
+
"pcm_16000",
|
18
|
+
"pcm_22050",
|
19
|
+
"pcm_44100",
|
20
|
+
]
|
@@ -12,6 +12,8 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
+
from __future__ import annotations
|
16
|
+
|
15
17
|
import asyncio
|
16
18
|
import base64
|
17
19
|
import contextlib
|
@@ -19,14 +21,36 @@ import dataclasses
|
|
19
21
|
import json
|
20
22
|
import os
|
21
23
|
from dataclasses import dataclass
|
22
|
-
from typing import
|
24
|
+
from typing import List, Literal, Optional
|
23
25
|
|
24
26
|
import aiohttp
|
25
27
|
from livekit import rtc
|
26
|
-
from livekit.agents import aio, tts
|
28
|
+
from livekit.agents import aio, codecs, tokenize, tts, utils
|
27
29
|
|
28
30
|
from .log import logger
|
29
|
-
from .models import
|
31
|
+
from .models import (
|
32
|
+
TTSEncoding,
|
33
|
+
TTSModels,
|
34
|
+
)
|
35
|
+
|
36
|
+
_Encoding = Literal[
|
37
|
+
"mp3",
|
38
|
+
"pcm",
|
39
|
+
]
|
40
|
+
|
41
|
+
|
42
|
+
def _sample_rate_from_format(output_format: TTSEncoding) -> int:
|
43
|
+
split = output_format.split("_") # e.g: mp3_22050_32
|
44
|
+
return int(split[1])
|
45
|
+
|
46
|
+
|
47
|
+
def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
|
48
|
+
if output_format.startswith("mp3"):
|
49
|
+
return "mp3"
|
50
|
+
elif output_format.startswith("pcm"):
|
51
|
+
return "pcm"
|
52
|
+
|
53
|
+
raise ValueError(f"Unknown format: {output_format}")
|
30
54
|
|
31
55
|
|
32
56
|
@dataclass
|
@@ -59,13 +83,16 @@ AUTHORIZATION_HEADER = "xi-api-key"
|
|
59
83
|
|
60
84
|
|
61
85
|
@dataclass
|
62
|
-
class
|
86
|
+
class _TTSOptions:
|
63
87
|
api_key: str
|
64
88
|
voice: Voice
|
65
89
|
model_id: TTSModels
|
66
90
|
base_url: str
|
91
|
+
encoding: TTSEncoding
|
67
92
|
sample_rate: int
|
68
|
-
|
93
|
+
streaming_latency: int
|
94
|
+
word_tokenizer: tokenize.WordTokenizer
|
95
|
+
chunk_length_schedule: list[int]
|
69
96
|
|
70
97
|
|
71
98
|
class TTS(tts.TTS):
|
@@ -76,140 +103,211 @@ class TTS(tts.TTS):
|
|
76
103
|
model_id: TTSModels = "eleven_turbo_v2",
|
77
104
|
api_key: str | None = None,
|
78
105
|
base_url: str | None = None,
|
79
|
-
|
80
|
-
|
106
|
+
encoding: TTSEncoding = "mp3_22050_32",
|
107
|
+
streaming_latency: int = 3,
|
108
|
+
word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
|
109
|
+
ignore_punctuation=False # punctuation can help for intonation
|
110
|
+
),
|
111
|
+
# default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
|
112
|
+
# (range is 50-500)
|
113
|
+
chunk_length_schedule: list[int] = [80, 120, 200, 260],
|
114
|
+
http_session: aiohttp.ClientSession | None = None,
|
81
115
|
) -> None:
|
82
116
|
super().__init__(
|
83
|
-
streaming_supported=True,
|
117
|
+
streaming_supported=True,
|
118
|
+
sample_rate=_sample_rate_from_format(encoding),
|
119
|
+
num_channels=1,
|
84
120
|
)
|
85
121
|
api_key = api_key or os.environ.get("ELEVEN_API_KEY")
|
86
122
|
if not api_key:
|
87
123
|
raise ValueError("ELEVEN_API_KEY must be set")
|
88
124
|
|
89
|
-
self.
|
90
|
-
self._opts = TTSOptions(
|
125
|
+
self._opts = _TTSOptions(
|
91
126
|
voice=voice,
|
92
127
|
model_id=model_id,
|
93
128
|
api_key=api_key,
|
94
129
|
base_url=base_url or API_BASE_URL_V1,
|
95
|
-
|
96
|
-
|
130
|
+
encoding=encoding,
|
131
|
+
sample_rate=self.sample_rate,
|
132
|
+
streaming_latency=streaming_latency,
|
133
|
+
word_tokenizer=word_tokenizer,
|
134
|
+
chunk_length_schedule=chunk_length_schedule,
|
97
135
|
)
|
136
|
+
self._session = http_session
|
137
|
+
|
138
|
+
def _ensure_session(self) -> aiohttp.ClientSession:
|
139
|
+
if not self._session:
|
140
|
+
self._session = utils.http_session()
|
141
|
+
|
142
|
+
return self._session
|
98
143
|
|
99
144
|
async def list_voices(self) -> List[Voice]:
|
100
|
-
async with self.
|
145
|
+
async with self._ensure_session().get(
|
101
146
|
f"{self._opts.base_url}/voices",
|
102
147
|
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
103
148
|
) as resp:
|
104
|
-
|
105
|
-
return dict_to_voices_list(data)
|
149
|
+
return _dict_to_voices_list(await resp.json())
|
106
150
|
|
107
151
|
def synthesize(
|
108
152
|
self,
|
109
153
|
text: str,
|
110
|
-
) ->
|
111
|
-
|
112
|
-
url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
|
154
|
+
) -> "ChunkedStream":
|
155
|
+
return ChunkedStream(text, self._opts, self._ensure_session())
|
113
156
|
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
157
|
+
def stream(
|
158
|
+
self,
|
159
|
+
) -> "SynthesizeStream":
|
160
|
+
return SynthesizeStream(self._ensure_session(), self._opts)
|
161
|
+
|
162
|
+
|
163
|
+
class ChunkedStream(tts.ChunkedStream):
|
164
|
+
"""Synthesize using the chunked api endpoint"""
|
165
|
+
|
166
|
+
def __init__(
|
167
|
+
self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
|
168
|
+
) -> None:
|
169
|
+
self._opts = opts
|
170
|
+
self._text = text
|
171
|
+
self._session = session
|
172
|
+
self._task: asyncio.Task | None = None
|
173
|
+
self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
|
174
|
+
|
175
|
+
def _synthesize_url(self) -> str:
|
176
|
+
base_url = self._opts.base_url
|
177
|
+
voice_id = self._opts.voice.id
|
178
|
+
model_id = self._opts.model_id
|
179
|
+
sample_rate = _sample_rate_from_format(self._opts.encoding)
|
180
|
+
latency = self._opts.streaming_latency
|
181
|
+
url = (
|
182
|
+
f"{base_url}/text-to-speech/{voice_id}/stream?"
|
183
|
+
f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
|
184
|
+
)
|
185
|
+
return url
|
186
|
+
|
187
|
+
async def _main_task(self):
|
188
|
+
try:
|
189
|
+
await self._run()
|
190
|
+
except Exception:
|
191
|
+
logger.exception("11labs main task failed in chunked stream")
|
192
|
+
finally:
|
193
|
+
self._queue.put_nowait(None)
|
194
|
+
|
195
|
+
async def _run(self) -> None:
|
196
|
+
async with self._session.post(
|
197
|
+
self._synthesize_url(),
|
198
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
199
|
+
json=dict(
|
200
|
+
text=self._text,
|
201
|
+
model_id=self._opts.model_id,
|
202
|
+
voice_settings=(
|
203
|
+
dataclasses.asdict(self._opts.voice.settings)
|
204
|
+
if self._opts.voice.settings
|
205
|
+
else None
|
206
|
+
),
|
207
|
+
),
|
208
|
+
) as resp:
|
209
|
+
# avoid very small frames. chunk by 10ms 16bits
|
210
|
+
bytes_per_frame = (self._opts.sample_rate // 100) * 2
|
211
|
+
buf = bytearray()
|
212
|
+
async for data, _ in resp.content.iter_chunks():
|
213
|
+
buf.extend(data)
|
214
|
+
|
215
|
+
while len(buf) >= bytes_per_frame:
|
216
|
+
frame_data = buf[:bytes_per_frame]
|
217
|
+
buf = buf[bytes_per_frame:]
|
218
|
+
|
219
|
+
self._queue.put_nowait(
|
220
|
+
tts.SynthesizedAudio(
|
221
|
+
text=self._text,
|
222
|
+
data=rtc.AudioFrame(
|
223
|
+
data=frame_data,
|
224
|
+
sample_rate=self._opts.sample_rate,
|
225
|
+
num_channels=1,
|
226
|
+
samples_per_channel=len(frame_data) // 2,
|
227
|
+
),
|
228
|
+
)
|
229
|
+
)
|
230
|
+
|
231
|
+
# send any remaining data
|
232
|
+
if len(buf) > 0:
|
233
|
+
self._queue.put_nowait(
|
234
|
+
tts.SynthesizedAudio(
|
235
|
+
text=self._text,
|
130
236
|
data=rtc.AudioFrame(
|
131
|
-
data=
|
237
|
+
data=buf,
|
132
238
|
sample_rate=self._opts.sample_rate,
|
133
239
|
num_channels=1,
|
134
|
-
samples_per_channel=len(
|
240
|
+
samples_per_channel=len(buf) // 2,
|
135
241
|
),
|
136
242
|
)
|
137
|
-
|
138
|
-
logger.error(f"failed to synthesize: {e}")
|
243
|
+
)
|
139
244
|
|
140
|
-
|
245
|
+
async def __anext__(self) -> tts.SynthesizedAudio:
|
246
|
+
if not self._task:
|
247
|
+
self._task = asyncio.create_task(self._main_task())
|
141
248
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
249
|
+
frame = await self._queue.get()
|
250
|
+
if frame is None:
|
251
|
+
raise StopAsyncIteration
|
252
|
+
|
253
|
+
return frame
|
254
|
+
|
255
|
+
async def aclose(self) -> None:
|
256
|
+
if not self._task:
|
257
|
+
return
|
258
|
+
|
259
|
+
self._task.cancel()
|
260
|
+
with contextlib.suppress(asyncio.CancelledError):
|
261
|
+
await self._task
|
146
262
|
|
147
263
|
|
148
264
|
class SynthesizeStream(tts.SynthesizeStream):
|
149
|
-
|
265
|
+
"""Streamed API using websockets"""
|
266
|
+
|
267
|
+
@dataclass
|
268
|
+
class _SegmentConnection:
|
269
|
+
audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
|
270
|
+
task: asyncio.Task
|
150
271
|
|
151
272
|
def __init__(
|
152
273
|
self,
|
153
274
|
session: aiohttp.ClientSession,
|
154
|
-
opts:
|
155
|
-
|
275
|
+
opts: _TTSOptions,
|
276
|
+
max_retry_per_segment: int = 3,
|
156
277
|
):
|
157
278
|
self._opts = opts
|
158
279
|
self._session = session
|
159
|
-
|
160
|
-
self.
|
161
|
-
self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
|
280
|
+
self._main_task = asyncio.create_task(self._run(max_retry_per_segment))
|
281
|
+
self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
|
162
282
|
self._closed = False
|
163
|
-
self.
|
164
|
-
|
165
|
-
self._main_task = asyncio.create_task(self._run(max_retry))
|
283
|
+
self._word_stream = opts.word_tokenizer.stream()
|
166
284
|
|
167
285
|
def _stream_url(self) -> str:
|
168
286
|
base_url = self._opts.base_url
|
169
287
|
voice_id = self._opts.voice.id
|
170
288
|
model_id = self._opts.model_id
|
171
|
-
|
172
|
-
latency = self._opts.
|
173
|
-
|
289
|
+
output_format = self._opts.encoding
|
290
|
+
latency = self._opts.streaming_latency
|
291
|
+
url = (
|
292
|
+
f"{base_url}/text-to-speech/{voice_id}/stream-input?"
|
293
|
+
f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
|
294
|
+
)
|
295
|
+
|
296
|
+
return url
|
174
297
|
|
175
298
|
def push_text(self, token: str | None) -> None:
|
176
299
|
if self._closed:
|
177
300
|
raise ValueError("cannot push to a closed stream")
|
178
301
|
|
179
302
|
if token is None:
|
180
|
-
self.
|
181
|
-
return
|
182
|
-
|
183
|
-
if len(token) == 0:
|
184
|
-
# 11labs marks the EOS with an empty string, avoid users from pushing empty strings
|
303
|
+
self._word_stream.mark_segment_end()
|
185
304
|
return
|
186
305
|
|
187
|
-
|
188
|
-
# fmt: off
|
189
|
-
splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
|
190
|
-
# fmt: on
|
191
|
-
|
192
|
-
self._text += token
|
193
|
-
|
194
|
-
while True:
|
195
|
-
last_split = -1
|
196
|
-
for i, c in enumerate(self._text):
|
197
|
-
if c in splitters:
|
198
|
-
last_split = i
|
199
|
-
break
|
200
|
-
|
201
|
-
if last_split == -1:
|
202
|
-
break
|
203
|
-
|
204
|
-
seg = self._text[: last_split + 1]
|
205
|
-
seg = seg.strip() + " " # 11labs expects a space at the end
|
206
|
-
self._queue.put_nowait(seg)
|
207
|
-
self._text = self._text[last_split + 1 :]
|
306
|
+
self._word_stream.push_text(token)
|
208
307
|
|
209
308
|
async def aclose(self, *, wait: bool = True) -> None:
|
210
|
-
self._flush_if_needed()
|
211
|
-
self._queue.put_nowait(None)
|
212
309
|
self._closed = True
|
310
|
+
await self._word_stream.aclose()
|
213
311
|
|
214
312
|
if not wait:
|
215
313
|
self._main_task.cancel()
|
@@ -217,158 +315,196 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
217
315
|
with contextlib.suppress(asyncio.CancelledError):
|
218
316
|
await self._main_task
|
219
317
|
|
220
|
-
def
|
221
|
-
|
222
|
-
if len(seg) > 0:
|
223
|
-
self._queue.put_nowait(seg + " ")
|
224
|
-
|
225
|
-
self._text = ""
|
226
|
-
self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
|
227
|
-
|
228
|
-
async def _run(self, max_retry: int) -> None:
|
229
|
-
retry_count = 0
|
230
|
-
ws: aiohttp.ClientWebSocketResponse | None = None
|
231
|
-
ws_task: asyncio.Task | None = None
|
232
|
-
data_tx: aio.ChanSender[str] | None = None
|
318
|
+
async def _run(self, max_retry_per_segment: int) -> None:
|
319
|
+
conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()
|
233
320
|
|
234
|
-
|
321
|
+
async def _forward_events() -> None:
|
322
|
+
"""forward events from the ws connections to the event queue.
|
323
|
+
This is used to keep the right order."""
|
235
324
|
while True:
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
if data is None:
|
241
|
-
if ws_task is not None:
|
242
|
-
await ws_task
|
243
|
-
break
|
244
|
-
|
245
|
-
if not ws_connected:
|
246
|
-
if data == SynthesizeStream._STREAM_EOS:
|
247
|
-
continue
|
248
|
-
|
249
|
-
with contextlib.suppress(asyncio.CancelledError):
|
250
|
-
if ws_task is not None:
|
251
|
-
await ws_task
|
252
|
-
|
253
|
-
ws = await self._session.ws_connect(
|
254
|
-
self._stream_url(),
|
255
|
-
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
256
|
-
)
|
257
|
-
data_tx, data_rx = aio.channel()
|
258
|
-
ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
|
325
|
+
c = await conns_q.get()
|
326
|
+
if c is None:
|
327
|
+
break # no more segment, stream closed
|
259
328
|
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
data_tx.send_nowait(data)
|
329
|
+
self._event_queue.put_nowait(
|
330
|
+
tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
|
331
|
+
)
|
265
332
|
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
333
|
+
async for frame in c.audio_rx:
|
334
|
+
self._event_queue.put_nowait(
|
335
|
+
tts.SynthesisEvent(
|
336
|
+
type=tts.SynthesisEventType.AUDIO, audio=frame
|
270
337
|
)
|
271
|
-
|
338
|
+
)
|
272
339
|
|
273
|
-
|
274
|
-
|
340
|
+
self._event_queue.put_nowait(
|
341
|
+
tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
|
342
|
+
)
|
275
343
|
|
276
|
-
|
277
|
-
|
344
|
+
async def _read_tokens() -> None:
|
345
|
+
"""read tokens from the word stream and create connections for each segment,
|
346
|
+
(this also allows concurrent connections to 11labs)"""
|
347
|
+
|
348
|
+
cur_segment: SynthesizeStream._SegmentConnection | None = None
|
349
|
+
token_tx: aio.ChanSender[str] | None = None
|
350
|
+
async for ev in self._word_stream:
|
351
|
+
if ev.type == tokenize.TokenEventType.STARTED:
|
352
|
+
token_tx, token_rx = aio.channel()
|
353
|
+
audio_tx: aio.ChanSender[tts.SynthesizedAudio]
|
354
|
+
audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
|
355
|
+
audio_tx, audio_rx = aio.channel()
|
356
|
+
task = asyncio.create_task(
|
357
|
+
self._run_ws(max_retry_per_segment, audio_tx, token_rx)
|
278
358
|
)
|
279
|
-
|
359
|
+
cur_segment = SynthesizeStream._SegmentConnection(audio_rx, task)
|
360
|
+
conns_q.put_nowait(cur_segment)
|
361
|
+
elif ev.type == tokenize.TokenEventType.TOKEN:
|
362
|
+
assert token_tx is not None
|
363
|
+
token_tx.send_nowait(ev.token)
|
364
|
+
elif ev.type == tokenize.TokenEventType.FINISHED:
|
365
|
+
assert token_tx is not None
|
366
|
+
token_tx.close()
|
367
|
+
cur_segment = token_tx = None
|
368
|
+
|
369
|
+
conns_q.put_nowait(None)
|
280
370
|
|
371
|
+
try:
|
372
|
+
await asyncio.gather(_forward_events(), _read_tokens())
|
281
373
|
except Exception:
|
282
374
|
logger.exception("11labs task failed")
|
283
|
-
finally:
|
284
|
-
with contextlib.suppress(asyncio.CancelledError):
|
285
|
-
if ws_task is not None:
|
286
|
-
ws_task.cancel()
|
287
|
-
await ws_task
|
288
375
|
|
289
|
-
|
376
|
+
self._event_queue.put_nowait(None)
|
290
377
|
|
291
378
|
async def _run_ws(
|
292
|
-
self,
|
379
|
+
self,
|
380
|
+
max_retry: int,
|
381
|
+
audio_tx: aio.ChanSender[tts.SynthesizedAudio],
|
382
|
+
token_rx: aio.ChanReceiver[str],
|
293
383
|
) -> None:
|
294
|
-
|
384
|
+
# try to connect to 11labs
|
385
|
+
ws_conn: aiohttp.ClientWebSocketResponse | None = None
|
386
|
+
for try_i in range(max_retry):
|
387
|
+
try:
|
388
|
+
ws_conn = await self._session.ws_connect(
|
389
|
+
self._stream_url(),
|
390
|
+
headers={AUTHORIZATION_HEADER: self._opts.api_key},
|
391
|
+
)
|
295
392
|
|
296
|
-
|
297
|
-
|
298
|
-
|
393
|
+
voice_settings = None
|
394
|
+
if self._opts.voice.settings is not None:
|
395
|
+
voice_settings = dataclasses.asdict(self._opts.voice.settings)
|
299
396
|
|
300
|
-
|
301
|
-
|
397
|
+
init_pkt = dict(
|
398
|
+
text=" ",
|
399
|
+
try_trigger_generation=True,
|
400
|
+
voice_settings=voice_settings,
|
401
|
+
generation_config=dict(
|
402
|
+
chunk_length_schedule=self._opts.chunk_length_schedule,
|
403
|
+
),
|
404
|
+
)
|
405
|
+
await ws_conn.send_str(json.dumps(init_pkt))
|
406
|
+
except Exception:
|
407
|
+
if try_i + 1 == max_retry:
|
408
|
+
logger.exception(
|
409
|
+
f"failed to connect to 11labs after {max_retry} retries"
|
410
|
+
)
|
411
|
+
return
|
302
412
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
init_pkt = dict(
|
309
|
-
text=" ",
|
310
|
-
voice_settings=voice_settings,
|
311
|
-
)
|
312
|
-
await ws.send_str(json.dumps(init_pkt))
|
413
|
+
retry_delay = min(try_i * 5, 5) # max 5s
|
414
|
+
logger.warning(
|
415
|
+
f"failed to connect to 11labs, retrying in {retry_delay}s"
|
416
|
+
)
|
417
|
+
await asyncio.sleep(retry_delay)
|
313
418
|
|
314
|
-
|
315
|
-
|
419
|
+
assert ws_conn is not None
|
420
|
+
|
421
|
+
all_tokens_consumed = False
|
422
|
+
|
423
|
+
async def send_task():
|
424
|
+
async for token in token_rx:
|
425
|
+
if token == "":
|
426
|
+
continue # empty token is closing the stream in 11labs protocol
|
427
|
+
|
428
|
+
# try_trigger_generation=True is a bad practice, we expose
|
429
|
+
# chunk_length_schedule instead
|
316
430
|
data_pkt = dict(
|
317
|
-
text=
|
318
|
-
try_trigger_generation=
|
431
|
+
text=f"{token} ", # must always end with a space
|
432
|
+
try_trigger_generation=False,
|
319
433
|
)
|
320
|
-
|
321
|
-
closing_ws = True
|
434
|
+
await ws_conn.send_str(json.dumps(data_pkt))
|
322
435
|
|
323
|
-
|
436
|
+
# no more token, mark eos
|
437
|
+
flush_pkt = dict(
|
438
|
+
text="",
|
439
|
+
)
|
440
|
+
await ws_conn.send_str(json.dumps(flush_pkt))
|
324
441
|
|
325
|
-
|
326
|
-
|
442
|
+
nonlocal all_tokens_consumed
|
443
|
+
all_tokens_consumed = True
|
327
444
|
|
328
445
|
async def recv_task():
|
329
|
-
|
446
|
+
encoding = _encoding_from_format(self._opts.encoding)
|
447
|
+
mp3_decoder = codecs.Mp3StreamDecoder()
|
330
448
|
while True:
|
331
|
-
msg = await
|
449
|
+
msg = await ws_conn.receive()
|
332
450
|
if msg.type in (
|
333
451
|
aiohttp.WSMsgType.CLOSED,
|
334
452
|
aiohttp.WSMsgType.CLOSE,
|
335
453
|
aiohttp.WSMsgType.CLOSING,
|
336
454
|
):
|
337
|
-
if
|
338
|
-
return
|
455
|
+
if all_tokens_consumed:
|
456
|
+
return # close is expected
|
339
457
|
|
340
|
-
raise Exception(
|
458
|
+
raise Exception(
|
459
|
+
"11labs connection closed unexpectedly, not all tokens have been consumed"
|
460
|
+
)
|
341
461
|
|
342
462
|
if msg.type != aiohttp.WSMsgType.TEXT:
|
463
|
+
# audio frames are serialized in base64..
|
343
464
|
logger.warning("unexpected 11labs message type %s", msg.type)
|
344
465
|
continue
|
345
466
|
|
346
467
|
data: dict = json.loads(msg.data)
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
468
|
+
audio = data.get("audio")
|
469
|
+
|
470
|
+
if data.get("error"):
|
471
|
+
logger.error("11labs error %s", data)
|
472
|
+
return
|
473
|
+
elif audio is not None:
|
474
|
+
if audio == "":
|
475
|
+
# 11labs sometimes sends empty audio, ignore
|
476
|
+
continue
|
477
|
+
|
478
|
+
b64data = base64.b64decode(audio)
|
479
|
+
frame: rtc.AudioFrame
|
480
|
+
if encoding == "mp3":
|
481
|
+
frames = mp3_decoder.decode_chunk(b64data)
|
482
|
+
frame = utils.merge_frames(frames)
|
483
|
+
else:
|
484
|
+
frame = rtc.AudioFrame(
|
485
|
+
data=b64data,
|
486
|
+
sample_rate=self._opts.sample_rate,
|
487
|
+
num_channels=1,
|
488
|
+
samples_per_channel=len(b64data) // 2,
|
359
489
|
)
|
360
|
-
|
490
|
+
|
491
|
+
text = ""
|
492
|
+
if data.get("alignment"):
|
493
|
+
text = "".join(data["alignment"].get("chars", ""))
|
494
|
+
|
495
|
+
audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
|
496
|
+
continue
|
361
497
|
elif data.get("isFinal"):
|
362
|
-
return
|
498
|
+
return # last message
|
499
|
+
|
500
|
+
logger.error("unexpected 11labs message %s", data)
|
363
501
|
|
364
502
|
try:
|
365
503
|
await asyncio.gather(send_task(), recv_task())
|
366
504
|
except Exception:
|
367
|
-
logger.exception("11labs connection failed")
|
505
|
+
logger.exception("11labs ws connection failed")
|
368
506
|
finally:
|
369
|
-
|
370
|
-
tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
|
371
|
-
)
|
507
|
+
audio_tx.close()
|
372
508
|
|
373
509
|
async def __anext__(self) -> tts.SynthesisEvent:
|
374
510
|
evt = await self._event_queue.get()
|
@@ -378,7 +514,7 @@ class SynthesizeStream(tts.SynthesizeStream):
|
|
378
514
|
return evt
|
379
515
|
|
380
516
|
|
381
|
-
def
|
517
|
+
def _dict_to_voices_list(data: dict) -> List[Voice]:
|
382
518
|
voices = []
|
383
519
|
for voice in data["voices"]:
|
384
520
|
voices.append(
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: livekit-plugins-elevenlabs
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
|
5
5
|
Home-page: https://github.com/livekit/agents
|
6
6
|
License: Apache-2.0
|
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
|
|
20
20
|
Requires-Python: >=3.9.0
|
21
21
|
Description-Content-Type: text/markdown
|
22
22
|
Requires-Dist: livekit ~=0.11
|
23
|
-
Requires-Dist: livekit-agents ~=0.
|
23
|
+
Requires-Dist: livekit-agents[codecs] ~=0.7.0
|
24
24
|
Requires-Dist: aiohttp >=3.8.5
|
25
25
|
|
26
26
|
# LiveKit Plugins Elevenlabs
|
@@ -0,0 +1,10 @@
|
|
1
|
+
livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
|
2
|
+
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
+
livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
|
4
|
+
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
livekit/plugins/elevenlabs/tts.py,sha256=GTcyQwBVVPzCYLgsnw9q5oFOq9cV3hIKndDaBPSFMr4,17738
|
6
|
+
livekit/plugins/elevenlabs/version.py,sha256=pZ7bgeWLjw4VCWymU1ntHaHorKRusUkm56y6tZe5gmQ,600
|
7
|
+
livekit_plugins_elevenlabs-0.5.0.dist-info/METADATA,sha256=nmaTaWHwzuzT9nBjaLsJlzTAanMsxl7lv8wH5Sq7boI,1367
|
8
|
+
livekit_plugins_elevenlabs-0.5.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
+
livekit_plugins_elevenlabs-0.5.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
+
livekit_plugins_elevenlabs-0.5.0.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
|
2
|
-
livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
|
3
|
-
livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
|
4
|
-
livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/elevenlabs/tts.py,sha256=NMlmq9lwtOmXrsilDFlnooILRl_eDD1wY7qMGQYrwYQ,12398
|
6
|
-
livekit/plugins/elevenlabs/version.py,sha256=7ECAuYd8NWEdqAQOS31si8_NKj2Tqs0ygy32XpL0Sbg,603
|
7
|
-
livekit_plugins_elevenlabs-0.4.dev2.dist-info/METADATA,sha256=pzXGnkkVlSwqCKz2crm_H_VXHtAbzu7cJQLQp1RRYz4,1365
|
8
|
-
livekit_plugins_elevenlabs-0.4.dev2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
9
|
-
livekit_plugins_elevenlabs-0.4.dev2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
|
10
|
-
livekit_plugins_elevenlabs-0.4.dev2.dist-info/RECORD,,
|
{livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.0.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|