livekit-plugins-elevenlabs 0.3.dev0__py3-none-any.whl → 0.4.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.elevenlabs")
@@ -17,32 +17,32 @@ import base64
17
17
  import contextlib
18
18
  import dataclasses
19
19
  import json
20
- import logging
21
20
  import os
22
21
  from dataclasses import dataclass
23
- from typing import Any, AsyncIterable, Dict, List, Optional
22
+ from typing import AsyncIterable, List
24
23
 
25
24
  import aiohttp
26
25
  from livekit import rtc
27
- from livekit.agents import tts
26
+ from livekit.agents import aio, tts
28
27
 
28
+ from .log import logger
29
29
  from .models import TTSModels
30
30
 
31
31
 
32
+ @dataclass
33
+ class VoiceSettings:
34
+ stability: float # [0.0 - 1.0]
35
+ similarity_boost: float # [0.0 - 1.0]
36
+ style: float | None = None # [0.0 - 1.0]
37
+ use_speaker_boost: bool | None = False
38
+
39
+
32
40
  @dataclass
33
41
  class Voice:
34
42
  id: str
35
43
  name: str
36
44
  category: str
37
- settings: Optional["VoiceSettings"] = None
38
-
39
-
40
- @dataclass
41
- class VoiceSettings:
42
- stability: float # [0.0 - 1.0]
43
- similarity_boost: float # [0.0 - 1.0]
44
- style: Optional[float] = None # [0.0 - 1.0]
45
- use_speaker_boost: Optional[bool] = False
45
+ settings: VoiceSettings | None = None
46
46
 
47
47
 
48
48
  DEFAULT_VOICE = Voice(
@@ -56,7 +56,6 @@ DEFAULT_VOICE = Voice(
56
56
 
57
57
  API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
58
58
  AUTHORIZATION_HEADER = "xi-api-key"
59
- STREAM_EOS = ""
60
59
 
61
60
 
62
61
  @dataclass
@@ -74,19 +73,21 @@ class TTS(tts.TTS):
74
73
  self,
75
74
  *,
76
75
  voice: Voice = DEFAULT_VOICE,
77
- model_id: TTSModels = "eleven_multilingual_v2",
78
- api_key: Optional[str] = None,
79
- base_url: Optional[str] = None,
76
+ model_id: TTSModels = "eleven_turbo_v2",
77
+ api_key: str | None = None,
78
+ base_url: str | None = None,
80
79
  sample_rate: int = 24000,
81
- latency: int = 2,
80
+ latency: int = 3,
82
81
  ) -> None:
83
- super().__init__(streaming_supported=True)
82
+ super().__init__(
83
+ streaming_supported=True, sample_rate=sample_rate, num_channels=1
84
+ )
84
85
  api_key = api_key or os.environ.get("ELEVEN_API_KEY")
85
86
  if not api_key:
86
87
  raise ValueError("ELEVEN_API_KEY must be set")
87
88
 
88
89
  self._session = aiohttp.ClientSession()
89
- self._config = TTSOptions(
90
+ self._opts = TTSOptions(
90
91
  voice=voice,
91
92
  model_id=model_id,
92
93
  api_key=api_key,
@@ -97,8 +98,8 @@ class TTS(tts.TTS):
97
98
 
98
99
  async def list_voices(self) -> List[Voice]:
99
100
  async with self._session.get(
100
- f"{self._config.base_url}/voices",
101
- headers={AUTHORIZATION_HEADER: self._config.api_key},
101
+ f"{self._opts.base_url}/voices",
102
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
102
103
  ) as resp:
103
104
  data = await resp.json()
104
105
  return dict_to_voices_list(data)
@@ -107,227 +108,274 @@ class TTS(tts.TTS):
107
108
  self,
108
109
  text: str,
109
110
  ) -> AsyncIterable[tts.SynthesizedAudio]:
110
- voice = self._config.voice
111
+ voice = self._opts.voice
112
+ url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
111
113
 
112
114
  async def generator():
113
- async with self._session.post(
114
- f"{self._config.base_url}/text-to-speech/{voice.id}?output_format=pcm_44100",
115
- headers={AUTHORIZATION_HEADER: self._config.api_key},
116
- json=dict(
117
- text=text,
118
- model_id=self._config.model_id,
119
- voice_settings=dataclasses.asdict(voice.settings)
120
- if voice.settings
121
- else None,
122
- ),
123
- ) as resp:
124
- data = await resp.read()
125
- yield tts.SynthesizedAudio(
126
- text=text,
127
- data=rtc.AudioFrame(
128
- data=data,
129
- sample_rate=44100,
130
- num_channels=1,
131
- samples_per_channel=len(data) // 2, # 16-bit
115
+ try:
116
+ async with self._session.post(
117
+ url,
118
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
119
+ json=dict(
120
+ text=text,
121
+ model_id=self._opts.model_id,
122
+ voice_settings=dataclasses.asdict(voice.settings)
123
+ if voice.settings
124
+ else None,
132
125
  ),
133
- )
126
+ ) as resp:
127
+ data = await resp.read()
128
+ yield tts.SynthesizedAudio(
129
+ text=text,
130
+ data=rtc.AudioFrame(
131
+ data=data,
132
+ sample_rate=self._opts.sample_rate,
133
+ num_channels=1,
134
+ samples_per_channel=len(data) // 2, # 16-bit
135
+ ),
136
+ )
137
+ except Exception as e:
138
+ logger.error(f"failed to synthesize: {e}")
134
139
 
135
140
  return generator()
136
141
 
137
142
  def stream(
138
143
  self,
139
144
  ) -> "SynthesizeStream":
140
- return SynthesizeStream(self._session, self._config)
145
+ return SynthesizeStream(self._session, self._opts)
141
146
 
142
147
 
143
148
  class SynthesizeStream(tts.SynthesizeStream):
149
+ _STREAM_EOS = ""
150
+
144
151
  def __init__(
145
152
  self,
146
153
  session: aiohttp.ClientSession,
147
- config: TTSOptions,
154
+ opts: TTSOptions,
155
+ max_retry: int = 32,
148
156
  ):
149
- self._config = config
157
+ self._opts = opts
150
158
  self._session = session
151
159
 
152
- self._queue = asyncio.Queue[str]()
153
- self._event_queue = asyncio.Queue[tts.SynthesisEvent]()
160
+ self._queue = asyncio.Queue[str | None]()
161
+ self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
154
162
  self._closed = False
155
-
156
- self._main_task = asyncio.create_task(self._run(max_retry=32))
157
-
158
- def log_exception(task: asyncio.Task) -> None:
159
- if not task.cancelled() and task.exception():
160
- logging.error(f"elevenlabs synthesis task failed: {task.exception()}")
161
-
162
- self._main_task.add_done_callback(log_exception)
163
163
  self._text = ""
164
164
 
165
+ self._main_task = asyncio.create_task(self._run(max_retry))
166
+
165
167
  def _stream_url(self) -> str:
166
- base_url = self._config.base_url
167
- voice_id = self._config.voice.id
168
- model_id = self._config.model_id
169
- return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{self._config.sample_rate}&optimize_streaming_latency={self._config.latency}"
168
+ base_url = self._opts.base_url
169
+ voice_id = self._opts.voice.id
170
+ model_id = self._opts.model_id
171
+ sample_rate = self._opts.sample_rate
172
+ latency = self._opts.latency
173
+ return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
170
174
 
171
175
  def push_text(self, token: str | None) -> None:
172
176
  if self._closed:
173
177
  raise ValueError("cannot push to a closed stream")
174
178
 
175
- if not token or len(token) == 0:
179
+ if token is None:
180
+ self._flush_if_needed()
181
+ return
182
+
183
+ if len(token) == 0:
184
+ # 11labs marks the EOS with an empty string, avoid users from pushing empty strings
176
185
  return
177
186
 
178
- # TODO: Native word boundary detection may not be good enough for all languages
187
+ # TODO: Naive word boundary detection may not be good enough for all languages
179
188
  # fmt: off
180
189
  splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
181
190
  # fmt: on
182
191
 
183
192
  self._text += token
184
- if token[-1] in splitters:
185
- self._queue.put_nowait(self._text)
186
- self._text = ""
193
+
194
+ while True:
195
+ last_split = -1
196
+ for i, c in enumerate(self._text):
197
+ if c in splitters:
198
+ last_split = i
199
+ break
200
+
201
+ if last_split == -1:
202
+ break
203
+
204
+ seg = self._text[: last_split + 1]
205
+ seg = seg.strip() + " " # 11labs expects a space at the end
206
+ self._queue.put_nowait(seg)
207
+ self._text = self._text[last_split + 1 :]
208
+
209
+ async def aclose(self, *, wait: bool = True) -> None:
210
+ self._flush_if_needed()
211
+ self._queue.put_nowait(None)
212
+ self._closed = True
213
+
214
+ if not wait:
215
+ self._main_task.cancel()
216
+
217
+ with contextlib.suppress(asyncio.CancelledError):
218
+ await self._main_task
219
+
220
+ def _flush_if_needed(self) -> None:
221
+ seg = self._text.strip()
222
+ if len(seg) > 0:
223
+ self._queue.put_nowait(seg + " ")
224
+
225
+ self._text = ""
226
+ self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
187
227
 
188
228
  async def _run(self, max_retry: int) -> None:
189
229
  retry_count = 0
190
- listen_task: Optional[asyncio.Task] = None
191
- ws: Optional[aiohttp.ClientWebSocketResponse] = None
192
- retry_text_queue: asyncio.Queue[str] = asyncio.Queue()
193
- while True:
194
- try:
195
- ws = await self._try_connect()
196
- retry_count = 0 # reset retry count
197
-
198
- listen_task = asyncio.create_task(self._listen_task(ws))
199
-
200
- # forward queued text to 11labs
201
- started = False
202
- while not ws.closed:
203
- text = None
204
- if not retry_text_queue.empty():
205
- text = await retry_text_queue.get()
206
- retry_text_queue.task_done()
207
- else:
208
- text = await self._queue.get()
209
-
210
- if not started:
211
- self._event_queue.put_nowait(
212
- tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
230
+ ws: aiohttp.ClientWebSocketResponse | None = None
231
+ ws_task: asyncio.Task | None = None
232
+ data_tx: aio.ChanSender[str] | None = None
233
+
234
+ try:
235
+ while True:
236
+ ws_connected = ws is not None and not ws.closed
237
+ try:
238
+ data = await self._queue.get()
239
+
240
+ if data is None:
241
+ if ws_task is not None:
242
+ await ws_task
243
+ break
244
+
245
+ if not ws_connected:
246
+ if data == SynthesizeStream._STREAM_EOS:
247
+ continue
248
+
249
+ with contextlib.suppress(asyncio.CancelledError):
250
+ if ws_task is not None:
251
+ await ws_task
252
+
253
+ ws = await self._session.ws_connect(
254
+ self._stream_url(),
255
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
213
256
  )
214
- started = True
215
- text_packet = dict(
216
- text=text,
217
- try_trigger_generation=True,
218
- )
257
+ data_tx, data_rx = aio.channel()
258
+ ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
219
259
 
220
- # This case can happen in normal operation because 11labs will not
221
- # keep connections open indefinitely if we are not sending data.
222
- try:
223
- await ws.send_str(json.dumps(text_packet))
224
- except Exception:
225
- await retry_text_queue.put(text)
226
- break
260
+ assert data_tx is not None
261
+ assert ws_task is not None
262
+ assert ws is not None
263
+
264
+ data_tx.send_nowait(data)
227
265
 
228
- # We call self._queue.task_done() even if we are retrying the text because
229
- # all text has gone through self._queue. An exception may have short-circuited
230
- # out of the loop so task_done() will not have already been called on text that
231
- # is being retried.
232
- self._queue.task_done()
233
- if text == STREAM_EOS:
234
- await listen_task
235
- # We know 11labs is closing the stream after each request/flush
236
- self._event_queue.put_nowait(
237
- tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
266
+ except Exception:
267
+ if retry_count >= max_retry:
268
+ logger.exception(
269
+ f"failed to connect to 11labs after {max_retry} retries"
238
270
  )
239
271
  break
240
272
 
241
- except asyncio.CancelledError:
242
- if ws:
243
- await ws.close()
244
- if listen_task:
245
- await asyncio.shield(listen_task)
246
- break
247
- except Exception as e:
248
- if retry_count > max_retry and max_retry > 0:
249
- logging.error(f"failed to connect to ElevenLabs: {e}")
250
- break
273
+ retry_delay = min(retry_count * 5, 5) # max 5s
274
+ retry_count += 1
251
275
 
252
- retry_delay = min(retry_count * 5, 5) # max 5s
253
- retry_count += 1
254
- logging.warning(
255
- f"failed to connect to ElevenLabs: {e} - retrying in {retry_delay}s"
256
- )
257
- await asyncio.sleep(retry_delay)
276
+ logger.warning(
277
+ f"failed to connect to 11labs, retrying in {retry_delay}s"
278
+ )
279
+ await asyncio.sleep(retry_delay)
258
280
 
259
- self._closed = True
281
+ except Exception:
282
+ logger.exception("11labs task failed")
283
+ finally:
284
+ with contextlib.suppress(asyncio.CancelledError):
285
+ if ws_task is not None:
286
+ ws_task.cancel()
287
+ await ws_task
260
288
 
261
- async def _try_connect(self) -> aiohttp.ClientWebSocketResponse:
262
- ws = await self._session.ws_connect(
263
- self._stream_url(),
264
- headers={AUTHORIZATION_HEADER: self._config.api_key},
265
- )
289
+ self._event_queue.put_nowait(None)
266
290
 
267
- voice = self._config.voice
268
- voice_settings = dataclasses.asdict(voice.settings) if voice.settings else None
291
+ async def _run_ws(
292
+ self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
293
+ ) -> None:
294
+ closing_ws = False
269
295
 
270
- init_packet = dict(
271
- text=" ",
272
- voice_settings=voice_settings,
296
+ self._event_queue.put_nowait(
297
+ tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
273
298
  )
274
- await ws.send_str(json.dumps(init_packet))
275
- return ws
276
299
 
277
- async def _listen_task(self, ws: aiohttp.ClientWebSocketResponse) -> None:
278
- while True:
279
- msg = await ws.receive()
300
+ async def send_task():
301
+ nonlocal closing_ws
280
302
 
281
- if msg.type in (
282
- aiohttp.WSMsgType.CLOSED,
283
- aiohttp.WSMsgType.CLOSE,
284
- aiohttp.WSMsgType.CLOSING,
285
- ):
286
- break
303
+ # 11labs stream must be initialized with a space
304
+ voice = self._opts.voice
305
+ voice_settings = (
306
+ dataclasses.asdict(voice.settings) if voice.settings else None
307
+ )
308
+ init_pkt = dict(
309
+ text=" ",
310
+ voice_settings=voice_settings,
311
+ )
312
+ await ws.send_str(json.dumps(init_pkt))
287
313
 
288
- if msg.type != aiohttp.WSMsgType.TEXT:
289
- continue
290
-
291
- jsonMessage: Dict[str, Any] = json.loads(str(msg.data))
292
- if jsonMessage.get("audio"):
293
- data = base64.b64decode(jsonMessage["audio"])
294
- audio_frame = rtc.AudioFrame(
295
- data=data,
296
- sample_rate=self._config.sample_rate,
297
- num_channels=1,
298
- samples_per_channel=len(data) // 2,
314
+ while True:
315
+ data = await data_rx.recv()
316
+ data_pkt = dict(
317
+ text=data,
318
+ try_trigger_generation=False,
299
319
  )
300
- self._event_queue.put_nowait(
301
- tts.SynthesisEvent(
302
- type=tts.SynthesisEventType.AUDIO,
303
- audio=tts.SynthesizedAudio(text="", data=audio_frame),
320
+ if data == SynthesizeStream._STREAM_EOS:
321
+ closing_ws = True
322
+
323
+ await ws.send_str(json.dumps(data_pkt))
324
+
325
+ if closing_ws:
326
+ return
327
+
328
+ async def recv_task():
329
+ nonlocal closing_ws
330
+ while True:
331
+ msg = await ws.receive()
332
+ if msg.type in (
333
+ aiohttp.WSMsgType.CLOSED,
334
+ aiohttp.WSMsgType.CLOSE,
335
+ aiohttp.WSMsgType.CLOSING,
336
+ ):
337
+ if closing_ws: # close is expected
338
+ return
339
+
340
+ raise Exception("11labs connection closed unexpectedly")
341
+
342
+ if msg.type != aiohttp.WSMsgType.TEXT:
343
+ logger.warning("unexpected 11labs message type %s", msg.type)
344
+ continue
345
+
346
+ data: dict = json.loads(msg.data)
347
+ if data.get("audio"):
348
+ b64data = base64.b64decode(data["audio"])
349
+ frame = rtc.AudioFrame(
350
+ data=b64data,
351
+ sample_rate=self._opts.sample_rate,
352
+ num_channels=1,
353
+ samples_per_channel=len(data) // 2,
304
354
  )
305
- )
306
- elif jsonMessage.get("isFinal"):
307
- break
308
- else:
309
- logging.error(f"Unhandled message from ElevenLabs: {msg}")
310
-
311
- async def flush(self) -> None:
312
- self._queue.put_nowait(self._text + " ")
313
- self._text = ""
314
- self._queue.put_nowait(STREAM_EOS)
315
- await self._queue.join()
316
-
317
- async def aclose(self, wait=False) -> None:
318
- if wait:
319
- logging.warning(
320
- "wait=True is not yet supported for ElevenLabs TTS. Closing immediately."
355
+ self._event_queue.put_nowait(
356
+ tts.SynthesisEvent(
357
+ type=tts.SynthesisEventType.AUDIO,
358
+ audio=tts.SynthesizedAudio(text="", data=frame),
359
+ )
360
+ )
361
+ elif data.get("isFinal"):
362
+ return
363
+
364
+ try:
365
+ await asyncio.gather(send_task(), recv_task())
366
+ except Exception:
367
+ logger.exception("11labs connection failed")
368
+ finally:
369
+ self._event_queue.put_nowait(
370
+ tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
321
371
  )
322
- self._main_task.cancel()
323
- with contextlib.suppress(asyncio.CancelledError):
324
- await self._main_task
325
372
 
326
373
  async def __anext__(self) -> tts.SynthesisEvent:
327
- if self._closed and self._event_queue.empty():
374
+ evt = await self._event_queue.get()
375
+ if evt is None:
328
376
  raise StopAsyncIteration
329
377
 
330
- return await self._event_queue.get()
378
+ return evt
331
379
 
332
380
 
333
381
  def dict_to_voices_list(data: dict) -> List[Voice]:
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.3.dev0"
15
+ __version__ = "0.4.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.3.dev0
3
+ Version: 0.4.dev0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,8 +19,8 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit ~=0.9
23
- Requires-Dist: livekit-agents ~=0.5.dev0
22
+ Requires-Dist: livekit ~=0.11
23
+ Requires-Dist: livekit-agents ~=0.6.dev0
24
24
  Requires-Dist: aiohttp >=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=5PO_KjUzIMnHPD_iRyotLqR7qMIjpJYqR52K8wdnzts,12396
6
+ livekit/plugins/elevenlabs/version.py,sha256=OwSbVTqWUJKy9w2Jbh1MIrp5cHPvEYsLXDhRGwdZKso,603
7
+ livekit_plugins_elevenlabs-0.4.dev0.dist-info/METADATA,sha256=TCRDG52GXKhdqP2iDipt28Qu5vAe0ZWYW5CTdZTiN0k,1365
8
+ livekit_plugins_elevenlabs-0.4.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
+ livekit_plugins_elevenlabs-0.4.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.4.dev0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
2
- livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
3
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- livekit/plugins/elevenlabs/tts.py,sha256=hN9aRGQ_9B9ehnB7cS19gtZ3uHIa-28RPoIIRZrdm-w,11503
5
- livekit/plugins/elevenlabs/version.py,sha256=qkEpuSbW6SpTptjDO-UGEI0FuAKdGu2JIYA_Bw_kavc,603
6
- livekit_plugins_elevenlabs-0.3.dev0.dist-info/METADATA,sha256=2qOFZKGAajRmX5BlkqStyuOyrkehxrvtqGmogzk7BP8,1364
7
- livekit_plugins_elevenlabs-0.3.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
8
- livekit_plugins_elevenlabs-0.3.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
9
- livekit_plugins_elevenlabs-0.3.dev0.dist-info/RECORD,,