livekit-plugins-elevenlabs 0.4.dev2__py3-none-any.whl → 0.5.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/elevenlabs/tts.py +274 -179
- livekit/plugins/elevenlabs/version.py +1 -1
- {livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/METADATA +2 -2
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD +10 -0
- livekit_plugins_elevenlabs-0.4.dev2.dist-info/RECORD +0 -10
- {livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/WHEEL +0 -0
- {livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/top_level.txt +0 -0
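
For orientation before the diff: the 0.5.dev0 `tts.py` adds several constructor options to `TTS` (`streaming_latency`, `word_tokenizer`, `chunk_length_schedule`, `http_session`). A minimal usage sketch follows, assuming the class is exported as `livekit.plugins.elevenlabs.TTS` and that `ELEVEN_API_KEY` is set in the environment; the keyword names and defaults come from the `__init__` signature shown in the diff below, everything else is illustrative:

    from livekit.agents import tokenize
    from livekit.plugins.elevenlabs import TTS  # assumed export path

    eleven_tts = TTS(
        streaming_latency=3,  # forwarded to 11labs as optimize_streaming_latency
        word_tokenizer=tokenize.basic.WordTokenizer(ignore_punctuation=False),
        chunk_length_schedule=[80, 120, 200, 260],  # faster than 11labs' default [120, 160, 250, 290]
        # http_session=my_session,  # optionally reuse an existing aiohttp.ClientSession
    )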
livekit/plugins/elevenlabs/tts.py

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+from __future__ import annotations
+
 import asyncio
 import base64
 import contextlib
@@ -19,11 +21,11 @@ import dataclasses
 import json
 import os
 from dataclasses import dataclass
-from typing import
+from typing import List, Optional

 import aiohttp
 from livekit import rtc
-from livekit.agents import aio, tts
+from livekit.agents import aio, tokenize, tts, utils

 from .log import logger
 from .models import TTSModels
@@ -59,13 +61,15 @@ AUTHORIZATION_HEADER = "xi-api-key"


 @dataclass
-class TTSOptions:
+class _TTSOptions:
     api_key: str
     voice: Voice
     model_id: TTSModels
     base_url: str
     sample_rate: int
-
+    streaming_latency: int
+    word_tokenizer: tokenize.WordTokenizer
+    chunk_length_schedule: list[int]


 class TTS(tts.TTS):
@@ -77,7 +81,14 @@ class TTS(tts.TTS):
         api_key: str | None = None,
         base_url: str | None = None,
         sample_rate: int = 24000,
-
+        streaming_latency: int = 3,
+        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
+            ignore_punctuation=False  # punctuation can help for intonation
+        ),
+        # default value of 11labs is [120, 160, 250, 290], but we want faster responses by default
+        # (range is 50-500)
+        chunk_length_schedule: list[int] = [80, 120, 200, 260],
+        http_session: aiohttp.ClientSession | None = None,
     ) -> None:
         super().__init__(
             streaming_supported=True, sample_rate=sample_rate, num_channels=1
@@ -86,130 +97,191 @@ class TTS(tts.TTS):
         if not api_key:
             raise ValueError("ELEVEN_API_KEY must be set")

-        self.
-        self._opts = TTSOptions(
+        self._opts = _TTSOptions(
             voice=voice,
             model_id=model_id,
             api_key=api_key,
             base_url=base_url or API_BASE_URL_V1,
             sample_rate=sample_rate,
-
+            streaming_latency=streaming_latency,
+            word_tokenizer=word_tokenizer,
+            chunk_length_schedule=chunk_length_schedule,
         )
+        self._session = http_session
+
+    def _ensure_session(self) -> aiohttp.ClientSession:
+        if not self._session:
+            self._session = utils.http_session()
+
+        return self._session

     async def list_voices(self) -> List[Voice]:
-        async with self.
+        async with self._ensure_session().get(
             f"{self._opts.base_url}/voices",
             headers={AUTHORIZATION_HEADER: self._opts.api_key},
         ) as resp:
-
-            return dict_to_voices_list(data)
+            return _dict_to_voices_list(await resp.json())

     def synthesize(
         self,
         text: str,
-    ) ->
-
-        url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
+    ) -> "ChunkedStream":
+        return ChunkedStream(text, self._opts, self._ensure_session())

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    def stream(
+        self,
+    ) -> "SynthesizeStream":
+        return SynthesizeStream(self._ensure_session(), self._opts)
+
+
+class ChunkedStream(tts.ChunkedStream):
+    """Synthesize using the chunked api endpoint"""
+
+    def __init__(
+        self, text: str, opts: _TTSOptions, session: aiohttp.ClientSession
+    ) -> None:
+        self._opts = opts
+        self._text = text
+        self._session = session
+        self._task: asyncio.Task | None = None
+        self._queue = asyncio.Queue[Optional[tts.SynthesizedAudio]]()
+
+    def _synthesize_url(self) -> str:
+        base_url = self._opts.base_url
+        voice_id = self._opts.voice.id
+        model_id = self._opts.model_id
+        sample_rate = self._opts.sample_rate
+        latency = self._opts.streaming_latency
+        url = (
+            f"{base_url}/text-to-speech/{voice_id}/stream?"
+            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
+        )
+        return url
+
+    async def _main_task(self):
+        try:
+            await self._run()
+        except Exception:
+            logger.exception("11labs main task failed in chunked stream")
+        finally:
+            self._queue.put_nowait(None)
+
+    async def _run(self) -> None:
+        async with self._session.post(
+            self._synthesize_url(),
+            headers={AUTHORIZATION_HEADER: self._opts.api_key},
+            json=dict(
+                text=self._text,
+                model_id=self._opts.model_id,
+                voice_settings=(
+                    dataclasses.asdict(self._opts.voice.settings)
+                    if self._opts.voice.settings
+                    else None
+                ),
+            ),
+        ) as resp:
+            # avoid very small frames. chunk by 10ms 16bits
+            bytes_per_frame = (self._opts.sample_rate // 100) * 2
+            buf = bytearray()
+            async for data, _ in resp.content.iter_chunks():
+                buf.extend(data)
+
+                while len(buf) >= bytes_per_frame:
+                    frame_data = buf[:bytes_per_frame]
+                    buf = buf[bytes_per_frame:]
+
+                    self._queue.put_nowait(
+                        tts.SynthesizedAudio(
+                            text=self._text,
+                            data=rtc.AudioFrame(
+                                data=frame_data,
+                                sample_rate=self._opts.sample_rate,
+                                num_channels=1,
+                                samples_per_channel=len(frame_data) // 2,
+                            ),
+                        )
+                    )
+
+            # send any remaining data
+            if len(buf) > 0:
+                self._queue.put_nowait(
+                    tts.SynthesizedAudio(
+                        text=self._text,
                         data=rtc.AudioFrame(
-                            data=
+                            data=buf,
                             sample_rate=self._opts.sample_rate,
                             num_channels=1,
-                            samples_per_channel=len(
+                            samples_per_channel=len(buf) // 2,
                         ),
                     )
-
-            logger.error(f"failed to synthesize: {e}")
+                )

-
+    async def __anext__(self) -> tts.SynthesizedAudio:
+        if not self._task:
+            self._task = asyncio.create_task(self._main_task())

-
-
-
-
+        frame = await self._queue.get()
+        if frame is None:
+            raise StopAsyncIteration
+
+        return frame
+
+    async def aclose(self) -> None:
+        if not self._task:
+            return
+
+        self._task.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._task


 class SynthesizeStream(tts.SynthesizeStream):
-
+    """Streamed API using websockets"""
+
+    @dataclass
+    class _SegmentConnection:
+        audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
+        task: asyncio.Task

     def __init__(
         self,
         session: aiohttp.ClientSession,
-        opts:
-
+        opts: _TTSOptions,
+        max_retry_per_segment: int = 3,
     ):
         self._opts = opts
         self._session = session
-
-        self.
-        self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
+        self._main_task = asyncio.create_task(self._run(max_retry_per_segment))
+        self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
         self._closed = False
-        self.
-
-        self._main_task = asyncio.create_task(self._run(max_retry))
+        self._word_stream = opts.word_tokenizer.stream()

     def _stream_url(self) -> str:
         base_url = self._opts.base_url
         voice_id = self._opts.voice.id
         model_id = self._opts.model_id
         sample_rate = self._opts.sample_rate
-        latency = self._opts.
-
+        latency = self._opts.streaming_latency
+        url = (
+            f"{base_url}/text-to-speech/{voice_id}/stream-input?"
+            f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
+        )
+
+        return url

     def push_text(self, token: str | None) -> None:
         if self._closed:
             raise ValueError("cannot push to a closed stream")

         if token is None:
-            self.
+            self._word_stream.mark_segment_end()
             return

-
-            # 11labs marks the EOS with an empty string, avoid users from pushing empty strings
-            return
-
-        # TODO: Naive word boundary detection may not be good enough for all languages
-        # fmt: off
-        splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
-        # fmt: on
-
-        self._text += token
-
-        while True:
-            last_split = -1
-            for i, c in enumerate(self._text):
-                if c in splitters:
-                    last_split = i
-                    break
-
-            if last_split == -1:
-                break
-
-            seg = self._text[: last_split + 1]
-            seg = seg.strip() + " "  # 11labs expects a space at the end
-            self._queue.put_nowait(seg)
-            self._text = self._text[last_split + 1 :]
+        self._word_stream.push_text(token)

     async def aclose(self, *, wait: bool = True) -> None:
-        self._flush_if_needed()
-        self._queue.put_nowait(None)
         self._closed = True
+        await self._word_stream.aclose()

         if not wait:
             self._main_task.cancel()
@@ -217,158 +289,181 @@ class SynthesizeStream(tts.SynthesizeStream):
         with contextlib.suppress(asyncio.CancelledError):
             await self._main_task

-    def
-
-        if len(seg) > 0:
-            self._queue.put_nowait(seg + " ")
-
-        self._text = ""
-        self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
+    async def _run(self, max_retry_per_segment: int) -> None:
+        conns_q = asyncio.Queue[Optional[SynthesizeStream._SegmentConnection]]()

-
-
-
-        ws_task: asyncio.Task | None = None
-        data_tx: aio.ChanSender[str] | None = None
-
-        try:
+        async def _forward_events() -> None:
+            """forward events from the ws connections to the event queue.
+            This is used to keep the right order."""
             while True:
-
-
-
-
-                if data is None:
-                    if ws_task is not None:
-                        await ws_task
-                    break
-
-                if not ws_connected:
-                    if data == SynthesizeStream._STREAM_EOS:
-                        continue
-
-                    with contextlib.suppress(asyncio.CancelledError):
-                        if ws_task is not None:
-                            await ws_task
-
-                    ws = await self._session.ws_connect(
-                        self._stream_url(),
-                        headers={AUTHORIZATION_HEADER: self._opts.api_key},
-                    )
-                    data_tx, data_rx = aio.channel()
-                    ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
+                c = await conns_q.get()
+                if c is None:
+                    break  # no more segment, stream closed

-
-
-
-
-                data_tx.send_nowait(data)
+                self._event_queue.put_nowait(
+                    tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
+                )

-
-
-
-
+                async for frame in c.audio_rx:
+                    self._event_queue.put_nowait(
+                        tts.SynthesisEvent(
+                            type=tts.SynthesisEventType.AUDIO, audio=frame
                         )
-
+                    )

-
-
+                self._event_queue.put_nowait(
+                    tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
+                )

-
-
+        async def _read_tokens() -> None:
+            """read tokens from the word stream and create connections for each segment,
+            (this also allows concurrent connections to 11labs)"""
+
+            cur_segment: SynthesizeStream._SegmentConnection | None = None
+            token_tx: aio.ChanSender[str] | None = None
+            async for ev in self._word_stream:
+                if ev.type == tokenize.TokenEventType.STARTED:
+                    token_tx, token_rx = aio.channel()
+                    audio_tx: aio.ChanSender[tts.SynthesizedAudio]
+                    audio_rx: aio.ChanReceiver[tts.SynthesizedAudio]
+                    audio_tx, audio_rx = aio.channel()
+                    task = asyncio.create_task(
+                        self._run_ws(max_retry_per_segment, audio_tx, token_rx)
                     )
-
+                    cur_segment = SynthesizeStream._SegmentConnection(audio_rx, task)
+                    conns_q.put_nowait(cur_segment)
+                elif ev.type == tokenize.TokenEventType.TOKEN:
+                    assert token_tx is not None
+                    token_tx.send_nowait(ev.token)
+                elif ev.type == tokenize.TokenEventType.FINISHED:
+                    assert token_tx is not None
+                    token_tx.close()
+                    cur_segment = token_tx = None
+
+            conns_q.put_nowait(None)

+        try:
+            await asyncio.gather(_forward_events(), _read_tokens())
         except Exception:
             logger.exception("11labs task failed")
-        finally:
-            with contextlib.suppress(asyncio.CancelledError):
-                if ws_task is not None:
-                    ws_task.cancel()
-                    await ws_task

-
+        self._event_queue.put_nowait(None)

     async def _run_ws(
-        self,
+        self,
+        max_retry: int,
+        audio_tx: aio.ChanSender[tts.SynthesizedAudio],
+        token_rx: aio.ChanReceiver[str],
     ) -> None:
-
+        # try to connect to 11labs
+        ws_conn: aiohttp.ClientWebSocketResponse | None = None
+        for try_i in range(max_retry):
+            try:
+                ws_conn = await self._session.ws_connect(
+                    self._stream_url(),
+                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
+                )

-
-
-
+                voice_settings = None
+                if self._opts.voice.settings is not None:
+                    voice_settings = dataclasses.asdict(self._opts.voice.settings)

-
-
+                init_pkt = dict(
+                    text=" ",
+                    try_trigger_generation=True,
+                    voice_settings=voice_settings,
+                    generation_config=dict(
+                        chunk_length_schedule=self._opts.chunk_length_schedule,
+                    ),
+                )
+                await ws_conn.send_str(json.dumps(init_pkt))
+            except Exception:
+                if try_i + 1 == max_retry:
+                    logger.exception(
+                        f"failed to connect to 11labs after {max_retry} retries"
+                    )
+                    return

-
-
-
-
-
-        init_pkt = dict(
-            text=" ",
-            voice_settings=voice_settings,
-        )
-        await ws.send_str(json.dumps(init_pkt))
+                retry_delay = min(try_i * 5, 5)  # max 5s
+                logger.warning(
+                    f"failed to connect to 11labs, retrying in {retry_delay}s"
+                )
+                await asyncio.sleep(retry_delay)

-
-
+        assert ws_conn is not None
+
+        all_tokens_consumed = False
+
+        async def send_task():
+            async for token in token_rx:
+                if token == "":
+                    continue  # empty token is closing the stream in 11labs protocol
+
+                # try_trigger_generation=True is a bad practice, we expose
+                # chunk_length_schedule instead
                 data_pkt = dict(
-                    text=
-                    try_trigger_generation=
+                    text=f"{token} ",  # must always end with a space
+                    try_trigger_generation=False,
                 )
-
-                closing_ws = True
+                await ws_conn.send_str(json.dumps(data_pkt))

-
+            # no more token, mark eos
+            flush_pkt = dict(
+                text="",
+            )
+            await ws_conn.send_str(json.dumps(flush_pkt))

-
-
+            nonlocal all_tokens_consumed
+            all_tokens_consumed = True

         async def recv_task():
-            nonlocal closing_ws
             while True:
-                msg = await
+                msg = await ws_conn.receive()
                 if msg.type in (
                     aiohttp.WSMsgType.CLOSED,
                     aiohttp.WSMsgType.CLOSE,
                     aiohttp.WSMsgType.CLOSING,
                 ):
-                    if
-                        return
+                    if all_tokens_consumed:
+                        return  # close is expected

-                    raise Exception(
+                    raise Exception(
+                        "11labs connection closed unexpectedly, not all tokens have been consumed"
+                    )

                 if msg.type != aiohttp.WSMsgType.TEXT:
+                    # audio frames are serialized in base64..
                     logger.warning("unexpected 11labs message type %s", msg.type)
                     continue

                 data: dict = json.loads(msg.data)
                 if data.get("audio"):
                     b64data = base64.b64decode(data["audio"])
+
                     frame = rtc.AudioFrame(
                         data=b64data,
                         sample_rate=self._opts.sample_rate,
                         num_channels=1,
                         samples_per_channel=len(b64data) // 2,
                     )
-
-
-
-
-
-                    )
+
+                    text = ""
+                    if data.get("alignment"):
+                        text = data["alignment"].get("chars", "")
+
+                    audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
+                    continue
                 elif data.get("isFinal"):
-                    return
+                    return  # last message
+
+                logger.error("unexpected 11labs message %s", data)

         try:
             await asyncio.gather(send_task(), recv_task())
         except Exception:
-            logger.exception("11labs connection failed")
+            logger.exception("11labs ws connection failed")
         finally:
-
-                tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
-            )
+            audio_tx.close()

     async def __anext__(self) -> tts.SynthesisEvent:
         evt = await self._event_queue.get()
@@ -378,7 +473,7 @@ class SynthesizeStream(tts.SynthesizeStream):
         return evt


-def dict_to_voices_list(data: dict) -> List[Voice]:
+def _dict_to_voices_list(data: dict) -> List[Voice]:
     voices = []
     for voice in data["voices"]:
         voices.append(
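
The reworked `SynthesizeStream` above replaces the old punctuation-based splitting with a `tokenize.WordTokenizer` stream and opens one websocket connection per segment. A consumption sketch, assuming the `tts.SynthesizeStream` base class makes the stream async-iterable and reusing `eleven_tts` from the earlier sketch; only `stream()`, `push_text()`, `aclose()`, and the `SynthesisEvent`/`SynthesizedAudio` fields come from the diff:

    from livekit.agents import tts

    async def speak(eleven_tts, text: str) -> None:
        stream = eleven_tts.stream()
        stream.push_text(text)   # text is re-segmented by the word tokenizer
        stream.push_text(None)   # None marks the end of the current segment
        await stream.aclose()    # waits for the main task unless wait=False

        async for event in stream:  # STARTED, then AUDIO events, then FINISHED
            if event.type == tts.SynthesisEventType.AUDIO:
                frame = event.audio.data  # rtc.AudioFrame decoded from the 11labs base64 payload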
{livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-elevenlabs
-Version: 0.4.dev2
+Version: 0.5.dev0
 Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
 Requires-Dist: livekit ~=0.11
-Requires-Dist: livekit-agents ~=0.
+Requires-Dist: livekit-agents ~=0.7.dev0
 Requires-Dist: aiohttp >=3.8.5

 # LiveKit Plugins Elevenlabs
livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD

@@ -0,0 +1,10 @@
+livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
+livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
+livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
+livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/elevenlabs/tts.py,sha256=p7mEiUDR6gbqEUrLp1lgTkJ3ounN6rhnenYoYqWNF2k,16418
+livekit/plugins/elevenlabs/version.py,sha256=h2gCxcJSMvCrVP7h14ON6HaghqLCkbl3--HZKEopR_8,603
+livekit_plugins_elevenlabs-0.5.dev0.dist-info/METADATA,sha256=5uCb2q4zTTGaCSSN448GLqhj9-41bg0jjR2CSeov8ms,1365
+livekit_plugins_elevenlabs-0.5.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+livekit_plugins_elevenlabs-0.5.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD,,
livekit_plugins_elevenlabs-0.4.dev2.dist-info/RECORD

@@ -1,10 +0,0 @@
-livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
-livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
-livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
-livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/elevenlabs/tts.py,sha256=NMlmq9lwtOmXrsilDFlnooILRl_eDD1wY7qMGQYrwYQ,12398
-livekit/plugins/elevenlabs/version.py,sha256=7ECAuYd8NWEdqAQOS31si8_NKj2Tqs0ygy32XpL0Sbg,603
-livekit_plugins_elevenlabs-0.4.dev2.dist-info/METADATA,sha256=pzXGnkkVlSwqCKz2crm_H_VXHtAbzu7cJQLQp1RRYz4,1365
-livekit_plugins_elevenlabs-0.4.dev2.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-livekit_plugins_elevenlabs-0.4.dev2.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_elevenlabs-0.4.dev2.dist-info/RECORD,,
{livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/WHEEL: file without changes
{livekit_plugins_elevenlabs-0.4.dev2.dist-info → livekit_plugins_elevenlabs-0.5.dev0.dist-info}/top_level.txt: file without changes