livekit-plugins-elevenlabs 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- /dev/null
+++ b/livekit/plugins/elevenlabs/log.py
@@ -0,0 +1,3 @@
+import logging
+
+logger = logging.getLogger("livekit.plugins.elevenlabs")
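The new log.py above gives the plugin one shared logger, which tts.py imports below in place of its module-level logging calls. A minimal sketch of how an application could tune it (plain stdlib logging; none of this is part of the diff):

```python
# Adjust the verbosity of the ElevenLabs plugin via the logger name defined in log.py.
import logging

logging.getLogger("livekit.plugins.elevenlabs").setLevel(logging.DEBUG)
```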
--- a/livekit/plugins/elevenlabs/tts.py
+++ b/livekit/plugins/elevenlabs/tts.py
@@ -17,32 +17,32 @@ import base64
 import contextlib
 import dataclasses
 import json
-import logging
 import os
 from dataclasses import dataclass
-from typing import Any, AsyncIterable, Dict, List, Optional
+from typing import AsyncIterable, List
 
 import aiohttp
 from livekit import rtc
-from livekit.agents import tts
+from livekit.agents import aio, tts
 
+from .log import logger
 from .models import TTSModels
 
 
+@dataclass
+class VoiceSettings:
+    stability: float  # [0.0 - 1.0]
+    similarity_boost: float  # [0.0 - 1.0]
+    style: float | None = None  # [0.0 - 1.0]
+    use_speaker_boost: bool | None = False
+
+
 @dataclass
 class Voice:
     id: str
     name: str
     category: str
-    settings: Optional["VoiceSettings"] = None
-
-
-@dataclass
-class VoiceSettings:
-    stability: float  # [0.0 - 1.0]
-    similarity_boost: float  # [0.0 - 1.0]
-    style: Optional[float] = None  # [0.0 - 1.0]
-    use_speaker_boost: Optional[bool] = False
+    settings: VoiceSettings | None = None
 
 
 DEFAULT_VOICE = Voice(
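In the hunk above, VoiceSettings moves ahead of Voice so the quoted forward reference can be dropped, and both dataclasses switch from Optional[...] to X | None annotations. A sketch of declaring a custom voice against the reordered dataclasses; the id and settings values are placeholders, not taken from this package:

```python
# Hypothetical voice definition; field names match the dataclasses above,
# the concrete values are only an example.
from livekit.plugins.elevenlabs.tts import Voice, VoiceSettings

my_voice = Voice(
    id="your-voice-id",  # placeholder, not a real ElevenLabs voice id
    name="narrator",
    category="premade",
    settings=VoiceSettings(stability=0.7, similarity_boost=0.5, style=0.0),
)
```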
@@ -56,7 +56,6 @@ DEFAULT_VOICE = Voice(
 
 API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
 AUTHORIZATION_HEADER = "xi-api-key"
-STREAM_EOS = ""
 
 
 @dataclass
@@ -74,19 +73,21 @@ class TTS(tts.TTS):
         self,
         *,
         voice: Voice = DEFAULT_VOICE,
-        model_id: TTSModels = "eleven_multilingual_v2",
-        api_key: Optional[str] = None,
-        base_url: Optional[str] = None,
+        model_id: TTSModels = "eleven_turbo_v2",
+        api_key: str | None = None,
+        base_url: str | None = None,
         sample_rate: int = 24000,
-        latency: int = 2,
+        latency: int = 3,
     ) -> None:
-        super().__init__(streaming_supported=True)
+        super().__init__(
+            streaming_supported=True, sample_rate=sample_rate, num_channels=1
+        )
         api_key = api_key or os.environ.get("ELEVEN_API_KEY")
         if not api_key:
             raise ValueError("ELEVEN_API_KEY must be set")
 
         self._session = aiohttp.ClientSession()
-        self._config = TTSOptions(
+        self._opts = TTSOptions(
            voice=voice,
            model_id=model_id,
            api_key=api_key,
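The constructor changes above switch the default model to eleven_turbo_v2, raise the default optimize_streaming_latency to 3, and forward sample_rate and num_channels to the tts.TTS base class. A minimal usage sketch of the 0.4.0 constructor (assumes ELEVEN_API_KEY is exported, as the check above requires; the asyncio scaffolding is illustrative):

```python
import asyncio

from livekit.plugins.elevenlabs.tts import TTS


async def main() -> None:
    # Defaults shown explicitly: eleven_turbo_v2, 24 kHz mono PCM, latency mode 3.
    eleven = TTS(model_id="eleven_turbo_v2", sample_rate=24000, latency=3)
    voices = await eleven.list_voices()
    print([v.name for v in voices])


asyncio.run(main())
```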
@@ -97,8 +98,8 @@ class TTS(tts.TTS):
 
     async def list_voices(self) -> List[Voice]:
         async with self._session.get(
-            f"{self._config.base_url}/voices",
-            headers={AUTHORIZATION_HEADER: self._config.api_key},
+            f"{self._opts.base_url}/voices",
+            headers={AUTHORIZATION_HEADER: self._opts.api_key},
         ) as resp:
             data = await resp.json()
             return dict_to_voices_list(data)
@@ -107,227 +108,276 @@ class TTS(tts.TTS):
         self,
         text: str,
     ) -> AsyncIterable[tts.SynthesizedAudio]:
-        voice = self._config.voice
+        voice = self._opts.voice
+        url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
 
         async def generator():
-            async with self._session.post(
-                f"{self._config.base_url}/text-to-speech/{voice.id}?output_format=pcm_44100",
-                headers={AUTHORIZATION_HEADER: self._config.api_key},
-                json=dict(
-                    text=text,
-                    model_id=self._config.model_id,
-                    voice_settings=dataclasses.asdict(voice.settings)
-                    if voice.settings
-                    else None,
-                ),
-            ) as resp:
-                data = await resp.read()
-                yield tts.SynthesizedAudio(
-                    text=text,
-                    data=rtc.AudioFrame(
-                        data=data,
-                        sample_rate=44100,
-                        num_channels=1,
-                        samples_per_channel=len(data) // 2,  # 16-bit
+            try:
+                async with self._session.post(
+                    url,
+                    headers={AUTHORIZATION_HEADER: self._opts.api_key},
+                    json=dict(
+                        text=text,
+                        model_id=self._opts.model_id,
+                        voice_settings=(
+                            dataclasses.asdict(voice.settings)
+                            if voice.settings
+                            else None
+                        ),
                     ),
-                )
+                ) as resp:
+                    data = await resp.read()
+                    yield tts.SynthesizedAudio(
+                        text=text,
+                        data=rtc.AudioFrame(
+                            data=data,
+                            sample_rate=self._opts.sample_rate,
+                            num_channels=1,
+                            samples_per_channel=len(data) // 2,  # 16-bit
+                        ),
+                    )
+            except Exception as e:
+                logger.error(f"failed to synthesize: {e}")
 
         return generator()
 
     def stream(
         self,
     ) -> "SynthesizeStream":
-        return SynthesizeStream(self._session, self._config)
+        return SynthesizeStream(self._session, self._opts)
 
 
 class SynthesizeStream(tts.SynthesizeStream):
+    _STREAM_EOS = ""
+
     def __init__(
         self,
         session: aiohttp.ClientSession,
-        config: TTSOptions,
+        opts: TTSOptions,
+        max_retry: int = 32,
     ):
-        self._config = config
+        self._opts = opts
         self._session = session
 
-        self._queue = asyncio.Queue[str]()
-        self._event_queue = asyncio.Queue[tts.SynthesisEvent]()
+        self._queue = asyncio.Queue[str | None]()
+        self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
         self._closed = False
-
-        self._main_task = asyncio.create_task(self._run(max_retry=32))
-
-        def log_exception(task: asyncio.Task) -> None:
-            if not task.cancelled() and task.exception():
-                logging.error(f"elevenlabs synthesis task failed: {task.exception()}")
-
-        self._main_task.add_done_callback(log_exception)
         self._text = ""
 
+        self._main_task = asyncio.create_task(self._run(max_retry))
+
     def _stream_url(self) -> str:
-        base_url = self._config.base_url
-        voice_id = self._config.voice.id
-        model_id = self._config.model_id
-        return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{self._config.sample_rate}&optimize_streaming_latency={self._config.latency}"
+        base_url = self._opts.base_url
+        voice_id = self._opts.voice.id
+        model_id = self._opts.model_id
+        sample_rate = self._opts.sample_rate
+        latency = self._opts.latency
+        return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
 
     def push_text(self, token: str | None) -> None:
         if self._closed:
             raise ValueError("cannot push to a closed stream")
 
-        if not token or len(token) == 0:
+        if token is None:
+            self._flush_if_needed()
+            return
+
+        if len(token) == 0:
+            # 11labs marks the EOS with an empty string, avoid users from pushing empty strings
             return
 
-        # TODO: Native word boundary detection may not be good enough for all languages
+        # TODO: Naive word boundary detection may not be good enough for all languages
        # fmt: off
        splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
        # fmt: on
 
         self._text += token
-        if token[-1] in splitters:
-            self._queue.put_nowait(self._text)
-            self._text = ""
+
+        while True:
+            last_split = -1
+            for i, c in enumerate(self._text):
+                if c in splitters:
+                    last_split = i
+                    break
+
+            if last_split == -1:
+                break
+
+            seg = self._text[: last_split + 1]
+            seg = seg.strip() + " "  # 11labs expects a space at the end
+            self._queue.put_nowait(seg)
+            self._text = self._text[last_split + 1 :]
+
+    async def aclose(self, *, wait: bool = True) -> None:
+        self._flush_if_needed()
+        self._queue.put_nowait(None)
+        self._closed = True
+
+        if not wait:
+            self._main_task.cancel()
+
+        with contextlib.suppress(asyncio.CancelledError):
+            await self._main_task
+
+    def _flush_if_needed(self) -> None:
+        seg = self._text.strip()
+        if len(seg) > 0:
+            self._queue.put_nowait(seg + " ")
+
+        self._text = ""
+        self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
 
     async def _run(self, max_retry: int) -> None:
         retry_count = 0
-        listen_task: Optional[asyncio.Task] = None
-        ws: Optional[aiohttp.ClientWebSocketResponse] = None
-        retry_text_queue: asyncio.Queue[str] = asyncio.Queue()
-        while True:
-            try:
-                ws = await self._try_connect()
-                retry_count = 0  # reset retry count
-
-                listen_task = asyncio.create_task(self._listen_task(ws))
-
-                # forward queued text to 11labs
-                started = False
-                while not ws.closed:
-                    text = None
-                    if not retry_text_queue.empty():
-                        text = await retry_text_queue.get()
-                        retry_text_queue.task_done()
-                    else:
-                        text = await self._queue.get()
-
-                    if not started:
-                        self._event_queue.put_nowait(
-                            tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
+        ws: aiohttp.ClientWebSocketResponse | None = None
+        ws_task: asyncio.Task | None = None
+        data_tx: aio.ChanSender[str] | None = None
+
+        try:
+            while True:
+                ws_connected = ws is not None and not ws.closed
+                try:
+                    data = await self._queue.get()
+
+                    if data is None:
+                        if ws_task is not None:
+                            await ws_task
+                        break
+
+                    if not ws_connected:
+                        if data == SynthesizeStream._STREAM_EOS:
+                            continue
+
+                        with contextlib.suppress(asyncio.CancelledError):
+                            if ws_task is not None:
+                                await ws_task
+
+                        ws = await self._session.ws_connect(
+                            self._stream_url(),
+                            headers={AUTHORIZATION_HEADER: self._opts.api_key},
                         )
-                        started = True
-                    text_packet = dict(
-                        text=text,
-                        try_trigger_generation=True,
-                    )
+                        data_tx, data_rx = aio.channel()
+                        ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
 
-                    # This case can happen in normal operation because 11labs will not
-                    # keep connections open indefinitely if we are not sending data.
-                    try:
-                        await ws.send_str(json.dumps(text_packet))
-                    except Exception:
-                        await retry_text_queue.put(text)
-                        break
+                    assert data_tx is not None
+                    assert ws_task is not None
+                    assert ws is not None
+
+                    data_tx.send_nowait(data)
 
-                    # We call self._queue.task_done() even if we are retrying the text because
-                    # all text has gone through self._queue. An exception may have short-circuited
-                    # out of the loop so task_done() will not have already been called on text that
-                    # is being retried.
-                    self._queue.task_done()
-                    if text == STREAM_EOS:
-                        await listen_task
-                        # We know 11labs is closing the stream after each request/flush
-                        self._event_queue.put_nowait(
-                            tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
+                except Exception:
+                    if retry_count >= max_retry:
+                        logger.exception(
+                            f"failed to connect to 11labs after {max_retry} retries"
                         )
                         break
 
-            except asyncio.CancelledError:
-                if ws:
-                    await ws.close()
-                if listen_task:
-                    await asyncio.shield(listen_task)
-                break
-            except Exception as e:
-                if retry_count > max_retry and max_retry > 0:
-                    logging.error(f"failed to connect to ElevenLabs: {e}")
-                    break
+                    retry_delay = min(retry_count * 5, 5)  # max 5s
+                    retry_count += 1
 
-                retry_delay = min(retry_count * 5, 5)  # max 5s
-                retry_count += 1
-                logging.warning(
-                    f"failed to connect to ElevenLabs: {e} - retrying in {retry_delay}s"
-                )
-                await asyncio.sleep(retry_delay)
+                    logger.warning(
+                        f"failed to connect to 11labs, retrying in {retry_delay}s"
+                    )
+                    await asyncio.sleep(retry_delay)
 
-        self._closed = True
+        except Exception:
+            logger.exception("11labs task failed")
+        finally:
+            with contextlib.suppress(asyncio.CancelledError):
+                if ws_task is not None:
+                    ws_task.cancel()
+                    await ws_task
 
-    async def _try_connect(self) -> aiohttp.ClientWebSocketResponse:
-        ws = await self._session.ws_connect(
-            self._stream_url(),
-            headers={AUTHORIZATION_HEADER: self._config.api_key},
-        )
+            self._event_queue.put_nowait(None)
 
-        voice = self._config.voice
-        voice_settings = dataclasses.asdict(voice.settings) if voice.settings else None
+    async def _run_ws(
+        self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
+    ) -> None:
+        closing_ws = False
 
-        init_packet = dict(
-            text=" ",
-            voice_settings=voice_settings,
+        self._event_queue.put_nowait(
+            tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
         )
-        await ws.send_str(json.dumps(init_packet))
-        return ws
 
-    async def _listen_task(self, ws: aiohttp.ClientWebSocketResponse) -> None:
-        while True:
-            msg = await ws.receive()
+        async def send_task():
+            nonlocal closing_ws
 
-            if msg.type in (
-                aiohttp.WSMsgType.CLOSED,
-                aiohttp.WSMsgType.CLOSE,
-                aiohttp.WSMsgType.CLOSING,
-            ):
-                break
+            # 11labs stream must be initialized with a space
+            voice = self._opts.voice
+            voice_settings = (
+                dataclasses.asdict(voice.settings) if voice.settings else None
+            )
+            init_pkt = dict(
+                text=" ",
+                voice_settings=voice_settings,
+            )
+            await ws.send_str(json.dumps(init_pkt))
 
-            if msg.type != aiohttp.WSMsgType.TEXT:
-                continue
-
-            jsonMessage: Dict[str, Any] = json.loads(str(msg.data))
-            if jsonMessage.get("audio"):
-                data = base64.b64decode(jsonMessage["audio"])
-                audio_frame = rtc.AudioFrame(
-                    data=data,
-                    sample_rate=self._config.sample_rate,
-                    num_channels=1,
-                    samples_per_channel=len(data) // 2,
+            while True:
+                data = await data_rx.recv()
+                data_pkt = dict(
+                    text=data,
+                    try_trigger_generation=True,
                 )
-                self._event_queue.put_nowait(
-                    tts.SynthesisEvent(
-                        type=tts.SynthesisEventType.AUDIO,
-                        audio=tts.SynthesizedAudio(text="", data=audio_frame),
+                if data == SynthesizeStream._STREAM_EOS:
+                    closing_ws = True
+
+                await ws.send_str(json.dumps(data_pkt))
+
+                if closing_ws:
+                    return
+
+        async def recv_task():
+            nonlocal closing_ws
+            while True:
+                msg = await ws.receive()
+                if msg.type in (
+                    aiohttp.WSMsgType.CLOSED,
+                    aiohttp.WSMsgType.CLOSE,
+                    aiohttp.WSMsgType.CLOSING,
+                ):
+                    if closing_ws:  # close is expected
+                        return
+
+                    raise Exception("11labs connection closed unexpectedly")
+
+                if msg.type != aiohttp.WSMsgType.TEXT:
+                    logger.warning("unexpected 11labs message type %s", msg.type)
+                    continue
+
+                data: dict = json.loads(msg.data)
+                if data.get("audio"):
+                    b64data = base64.b64decode(data["audio"])
+                    frame = rtc.AudioFrame(
+                        data=b64data,
+                        sample_rate=self._opts.sample_rate,
+                        num_channels=1,
+                        samples_per_channel=len(b64data) // 2,
                     )
-                )
-            elif jsonMessage.get("isFinal"):
-                break
-            else:
-                logging.error(f"Unhandled message from ElevenLabs: {msg}")
-
-    async def flush(self) -> None:
-        self._queue.put_nowait(self._text + " ")
-        self._text = ""
-        self._queue.put_nowait(STREAM_EOS)
-        await self._queue.join()
-
-    async def aclose(self, wait=False) -> None:
-        if wait:
-            logging.warning(
-                "wait=True is not yet supported for ElevenLabs TTS. Closing immediately."
+                    self._event_queue.put_nowait(
+                        tts.SynthesisEvent(
+                            type=tts.SynthesisEventType.AUDIO,
+                            audio=tts.SynthesizedAudio(text="", data=frame),
+                        )
+                    )
+                elif data.get("isFinal"):
+                    return
+
+        try:
+            await asyncio.gather(send_task(), recv_task())
+        except Exception:
+            logger.exception("11labs connection failed")
+        finally:
+            self._event_queue.put_nowait(
+                tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
             )
-        self._main_task.cancel()
-        with contextlib.suppress(asyncio.CancelledError):
-            await self._main_task
 
     async def __anext__(self) -> tts.SynthesisEvent:
-        if self._closed and self._event_queue.empty():
+        evt = await self._event_queue.get()
+        if evt is None:
             raise StopAsyncIteration
 
-        return await self._event_queue.get()
+        return evt
 
 
 def dict_to_voices_list(data: dict) -> List[Voice]:
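The streaming rewrite above replaces flush() and _listen_task with a class-level _STREAM_EOS marker, a None sentinel on both queues, and a _run_ws coroutine that pairs a send task with a receive task. A sketch of driving the new flow end to end (assumes the same ELEVEN_API_KEY setup; the token list and print are illustrative):

```python
import asyncio

from livekit.agents import tts
from livekit.plugins.elevenlabs.tts import TTS


async def speak() -> None:
    stream = TTS().stream()

    # Tokens are buffered and split on the punctuation/space splitters in push_text.
    for token in ("Hello ", "from ", "the ", "0.4.0 ", "streaming ", "path."):
        stream.push_text(token)

    # aclose() flushes pending text, sends the EOS marker and waits for the
    # websocket task; the event queue then ends with a None sentinel.
    await stream.aclose()

    async for event in stream:
        if event.type == tts.SynthesisEventType.AUDIO:
            frame = event.audio.data  # rtc.AudioFrame, mono, at opts.sample_rate
            print(f"received {frame.samples_per_channel} samples")


asyncio.run(speak())
```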
--- a/livekit/plugins/elevenlabs/version.py
+++ b/livekit/plugins/elevenlabs/version.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.3.0"
+__version__ = "0.4.0"
--- a/livekit_plugins_elevenlabs-0.3.0.dist-info/METADATA
+++ b/livekit_plugins_elevenlabs-0.4.0.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-elevenlabs
-Version: 0.3.0
+Version: 0.4.0
 Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -19,8 +19,8 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit ~=0.9
-Requires-Dist: livekit-agents ~=0.5.dev0
+Requires-Dist: livekit ~=0.11
+Requires-Dist: livekit-agents ~=0.6.0
 Requires-Dist: aiohttp >=3.8.5
 
 # LiveKit Plugins Elevenlabs
--- /dev/null
+++ b/livekit_plugins_elevenlabs-0.4.0.dist-info/RECORD
@@ -0,0 +1,10 @@
+livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
+livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
+livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
+livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/elevenlabs/tts.py,sha256=WRfuua0msUfvfNLPtpDn4PLNxw-2khdViUssDkQST8Q,12462
+livekit/plugins/elevenlabs/version.py,sha256=yelanl1wEXtgUH0CzoNVXfi2yTc2hElSzuAhULFzANc,600
+livekit_plugins_elevenlabs-0.4.0.dist-info/METADATA,sha256=rWQJu5rmeMSqI4_oElqzCMzz2svtgNb7UDipGFmRL2s,1359
+livekit_plugins_elevenlabs-0.4.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+livekit_plugins_elevenlabs-0.4.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_elevenlabs-0.4.0.dist-info/RECORD,,
--- a/livekit_plugins_elevenlabs-0.3.0.dist-info/RECORD
+++ /dev/null
@@ -1,9 +0,0 @@
-livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
-livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
-livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/elevenlabs/tts.py,sha256=hN9aRGQ_9B9ehnB7cS19gtZ3uHIa-28RPoIIRZrdm-w,11503
-livekit/plugins/elevenlabs/version.py,sha256=G5iYozum4q7UpHwW43F7QfhzUfwcncPxBZ0gmUGsd5I,600
-livekit_plugins_elevenlabs-0.3.0.dist-info/METADATA,sha256=vY-Re5myy-A_j253KS9MNz7LGmE2TL5Trr6q54JnegQ,1361
-livekit_plugins_elevenlabs-0.3.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
-livekit_plugins_elevenlabs-0.3.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_elevenlabs-0.3.0.dist-info/RECORD,,