livekit-plugins-elevenlabs 0.3.dev0__tar.gz → 0.4.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (18)
  1. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/PKG-INFO +3 -3
  2. livekit_plugins_elevenlabs-0.4.dev0/livekit/plugins/elevenlabs/log.py +3 -0
  3. livekit_plugins_elevenlabs-0.4.dev0/livekit/plugins/elevenlabs/tts.py +392 -0
  4. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/version.py +1 -1
  5. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/PKG-INFO +3 -3
  6. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/SOURCES.txt +1 -0
  7. livekit_plugins_elevenlabs-0.4.dev0/livekit_plugins_elevenlabs.egg-info/requires.txt +3 -0
  8. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/setup.py +2 -2
  9. livekit-plugins-elevenlabs-0.3.dev0/livekit/plugins/elevenlabs/tts.py +0 -344
  10. livekit-plugins-elevenlabs-0.3.dev0/livekit_plugins_elevenlabs.egg-info/requires.txt +0 -3
  11. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/README.md +0 -0
  12. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/__init__.py +0 -0
  13. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/models.py +0 -0
  14. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit/plugins/elevenlabs/py.typed +0 -0
  15. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/dependency_links.txt +0 -0
  16. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/livekit_plugins_elevenlabs.egg-info/top_level.txt +0 -0
  17. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/pyproject.toml +0 -0
  18. {livekit-plugins-elevenlabs-0.3.dev0 → livekit_plugins_elevenlabs-0.4.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.3.dev0
3
+ Version: 0.4.dev0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,8 +19,8 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit~=0.9
23
- Requires-Dist: livekit-agents~=0.5.dev0
22
+ Requires-Dist: livekit~=0.11
23
+ Requires-Dist: livekit-agents~=0.6.dev0
24
24
  Requires-Dist: aiohttp>=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.elevenlabs")
@@ -0,0 +1,392 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import asyncio
16
+ import base64
17
+ import contextlib
18
+ import dataclasses
19
+ import json
20
+ import os
21
+ from dataclasses import dataclass
22
+ from typing import AsyncIterable, List
23
+
24
+ import aiohttp
25
+ from livekit import rtc
26
+ from livekit.agents import aio, tts
27
+
28
+ from .log import logger
29
+ from .models import TTSModels
30
+
31
+
32
+ @dataclass
33
+ class VoiceSettings:
34
+ stability: float # [0.0 - 1.0]
35
+ similarity_boost: float # [0.0 - 1.0]
36
+ style: float | None = None # [0.0 - 1.0]
37
+ use_speaker_boost: bool | None = False
38
+
39
+
40
+ @dataclass
41
+ class Voice:
42
+ id: str
43
+ name: str
44
+ category: str
45
+ settings: VoiceSettings | None = None
46
+
47
+
48
+ DEFAULT_VOICE = Voice(
49
+ id="EXAVITQu4vr4xnSDxMaL",
50
+ name="Bella",
51
+ category="premade",
52
+ settings=VoiceSettings(
53
+ stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
54
+ ),
55
+ )
56
+
57
+ API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
58
+ AUTHORIZATION_HEADER = "xi-api-key"
59
+
60
+
61
+ @dataclass
62
+ class TTSOptions:
63
+ api_key: str
64
+ voice: Voice
65
+ model_id: TTSModels
66
+ base_url: str
67
+ sample_rate: int
68
+ latency: int
69
+
70
+
71
+ class TTS(tts.TTS):
72
+ def __init__(
73
+ self,
74
+ *,
75
+ voice: Voice = DEFAULT_VOICE,
76
+ model_id: TTSModels = "eleven_turbo_v2",
77
+ api_key: str | None = None,
78
+ base_url: str | None = None,
79
+ sample_rate: int = 24000,
80
+ latency: int = 3,
81
+ ) -> None:
82
+ super().__init__(
83
+ streaming_supported=True, sample_rate=sample_rate, num_channels=1
84
+ )
85
+ api_key = api_key or os.environ.get("ELEVEN_API_KEY")
86
+ if not api_key:
87
+ raise ValueError("ELEVEN_API_KEY must be set")
88
+
89
+ self._session = aiohttp.ClientSession()
90
+ self._opts = TTSOptions(
91
+ voice=voice,
92
+ model_id=model_id,
93
+ api_key=api_key,
94
+ base_url=base_url or API_BASE_URL_V1,
95
+ sample_rate=sample_rate,
96
+ latency=latency,
97
+ )
98
+
99
+ async def list_voices(self) -> List[Voice]:
100
+ async with self._session.get(
101
+ f"{self._opts.base_url}/voices",
102
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
103
+ ) as resp:
104
+ data = await resp.json()
105
+ return dict_to_voices_list(data)
106
+
107
+ def synthesize(
108
+ self,
109
+ text: str,
110
+ ) -> AsyncIterable[tts.SynthesizedAudio]:
111
+ voice = self._opts.voice
112
+ url = f"{self._opts.base_url}/text-to-speech/{voice.id}?output_format=pcm_{self._opts.sample_rate}"
113
+
114
+ async def generator():
115
+ try:
116
+ async with self._session.post(
117
+ url,
118
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
119
+ json=dict(
120
+ text=text,
121
+ model_id=self._opts.model_id,
122
+ voice_settings=dataclasses.asdict(voice.settings)
123
+ if voice.settings
124
+ else None,
125
+ ),
126
+ ) as resp:
127
+ data = await resp.read()
128
+ yield tts.SynthesizedAudio(
129
+ text=text,
130
+ data=rtc.AudioFrame(
131
+ data=data,
132
+ sample_rate=self._opts.sample_rate,
133
+ num_channels=1,
134
+ samples_per_channel=len(data) // 2, # 16-bit
135
+ ),
136
+ )
137
+ except Exception as e:
138
+ logger.error(f"failed to synthesize: {e}")
139
+
140
+ return generator()
141
+
142
+ def stream(
143
+ self,
144
+ ) -> "SynthesizeStream":
145
+ return SynthesizeStream(self._session, self._opts)
146
+
147
+
148
+ class SynthesizeStream(tts.SynthesizeStream):
149
+ _STREAM_EOS = ""
150
+
151
+ def __init__(
152
+ self,
153
+ session: aiohttp.ClientSession,
154
+ opts: TTSOptions,
155
+ max_retry: int = 32,
156
+ ):
157
+ self._opts = opts
158
+ self._session = session
159
+
160
+ self._queue = asyncio.Queue[str | None]()
161
+ self._event_queue = asyncio.Queue[tts.SynthesisEvent | None]()
162
+ self._closed = False
163
+ self._text = ""
164
+
165
+ self._main_task = asyncio.create_task(self._run(max_retry))
166
+
167
+ def _stream_url(self) -> str:
168
+ base_url = self._opts.base_url
169
+ voice_id = self._opts.voice.id
170
+ model_id = self._opts.model_id
171
+ sample_rate = self._opts.sample_rate
172
+ latency = self._opts.latency
173
+ return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
174
+
175
+ def push_text(self, token: str | None) -> None:
176
+ if self._closed:
177
+ raise ValueError("cannot push to a closed stream")
178
+
179
+ if token is None:
180
+ self._flush_if_needed()
181
+ return
182
+
183
+ if len(token) == 0:
184
+ # 11labs marks the EOS with an empty string, avoid users from pushing empty strings
185
+ return
186
+
187
+ # TODO: Naive word boundary detection may not be good enough for all languages
188
+ # fmt: off
189
+ splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
190
+ # fmt: on
191
+
192
+ self._text += token
193
+
194
+ while True:
195
+ last_split = -1
196
+ for i, c in enumerate(self._text):
197
+ if c in splitters:
198
+ last_split = i
199
+ break
200
+
201
+ if last_split == -1:
202
+ break
203
+
204
+ seg = self._text[: last_split + 1]
205
+ seg = seg.strip() + " " # 11labs expects a space at the end
206
+ self._queue.put_nowait(seg)
207
+ self._text = self._text[last_split + 1 :]
208
+
209
+ async def aclose(self, *, wait: bool = True) -> None:
210
+ self._flush_if_needed()
211
+ self._queue.put_nowait(None)
212
+ self._closed = True
213
+
214
+ if not wait:
215
+ self._main_task.cancel()
216
+
217
+ with contextlib.suppress(asyncio.CancelledError):
218
+ await self._main_task
219
+
220
+ def _flush_if_needed(self) -> None:
221
+ seg = self._text.strip()
222
+ if len(seg) > 0:
223
+ self._queue.put_nowait(seg + " ")
224
+
225
+ self._text = ""
226
+ self._queue.put_nowait(SynthesizeStream._STREAM_EOS)
227
+
228
+ async def _run(self, max_retry: int) -> None:
229
+ retry_count = 0
230
+ ws: aiohttp.ClientWebSocketResponse | None = None
231
+ ws_task: asyncio.Task | None = None
232
+ data_tx: aio.ChanSender[str] | None = None
233
+
234
+ try:
235
+ while True:
236
+ ws_connected = ws is not None and not ws.closed
237
+ try:
238
+ data = await self._queue.get()
239
+
240
+ if data is None:
241
+ if ws_task is not None:
242
+ await ws_task
243
+ break
244
+
245
+ if not ws_connected:
246
+ if data == SynthesizeStream._STREAM_EOS:
247
+ continue
248
+
249
+ with contextlib.suppress(asyncio.CancelledError):
250
+ if ws_task is not None:
251
+ await ws_task
252
+
253
+ ws = await self._session.ws_connect(
254
+ self._stream_url(),
255
+ headers={AUTHORIZATION_HEADER: self._opts.api_key},
256
+ )
257
+ data_tx, data_rx = aio.channel()
258
+ ws_task = asyncio.create_task(self._run_ws(ws, data_rx))
259
+
260
+ assert data_tx is not None
261
+ assert ws_task is not None
262
+ assert ws is not None
263
+
264
+ data_tx.send_nowait(data)
265
+
266
+ except Exception:
267
+ if retry_count >= max_retry:
268
+ logger.exception(
269
+ f"failed to connect to 11labs after {max_retry} retries"
270
+ )
271
+ break
272
+
273
+ retry_delay = min(retry_count * 5, 5) # max 5s
274
+ retry_count += 1
275
+
276
+ logger.warning(
277
+ f"failed to connect to 11labs, retrying in {retry_delay}s"
278
+ )
279
+ await asyncio.sleep(retry_delay)
280
+
281
+ except Exception:
282
+ logger.exception("11labs task failed")
283
+ finally:
284
+ with contextlib.suppress(asyncio.CancelledError):
285
+ if ws_task is not None:
286
+ ws_task.cancel()
287
+ await ws_task
288
+
289
+ self._event_queue.put_nowait(None)
290
+
291
+ async def _run_ws(
292
+ self, ws: aiohttp.ClientWebSocketResponse, data_rx: aio.ChanReceiver[str]
293
+ ) -> None:
294
+ closing_ws = False
295
+
296
+ self._event_queue.put_nowait(
297
+ tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
298
+ )
299
+
300
+ async def send_task():
301
+ nonlocal closing_ws
302
+
303
+ # 11labs stream must be initialized with a space
304
+ voice = self._opts.voice
305
+ voice_settings = (
306
+ dataclasses.asdict(voice.settings) if voice.settings else None
307
+ )
308
+ init_pkt = dict(
309
+ text=" ",
310
+ voice_settings=voice_settings,
311
+ )
312
+ await ws.send_str(json.dumps(init_pkt))
313
+
314
+ while True:
315
+ data = await data_rx.recv()
316
+ data_pkt = dict(
317
+ text=data,
318
+ try_trigger_generation=False,
319
+ )
320
+ if data == SynthesizeStream._STREAM_EOS:
321
+ closing_ws = True
322
+
323
+ await ws.send_str(json.dumps(data_pkt))
324
+
325
+ if closing_ws:
326
+ return
327
+
328
+ async def recv_task():
329
+ nonlocal closing_ws
330
+ while True:
331
+ msg = await ws.receive()
332
+ if msg.type in (
333
+ aiohttp.WSMsgType.CLOSED,
334
+ aiohttp.WSMsgType.CLOSE,
335
+ aiohttp.WSMsgType.CLOSING,
336
+ ):
337
+ if closing_ws: # close is expected
338
+ return
339
+
340
+ raise Exception("11labs connection closed unexpectedly")
341
+
342
+ if msg.type != aiohttp.WSMsgType.TEXT:
343
+ logger.warning("unexpected 11labs message type %s", msg.type)
344
+ continue
345
+
346
+ data: dict = json.loads(msg.data)
347
+ if data.get("audio"):
348
+ b64data = base64.b64decode(data["audio"])
349
+ frame = rtc.AudioFrame(
350
+ data=b64data,
351
+ sample_rate=self._opts.sample_rate,
352
+ num_channels=1,
353
+ samples_per_channel=len(data) // 2,
354
+ )
355
+ self._event_queue.put_nowait(
356
+ tts.SynthesisEvent(
357
+ type=tts.SynthesisEventType.AUDIO,
358
+ audio=tts.SynthesizedAudio(text="", data=frame),
359
+ )
360
+ )
361
+ elif data.get("isFinal"):
362
+ return
363
+
364
+ try:
365
+ await asyncio.gather(send_task(), recv_task())
366
+ except Exception:
367
+ logger.exception("11labs connection failed")
368
+ finally:
369
+ self._event_queue.put_nowait(
370
+ tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
371
+ )
372
+
373
+ async def __anext__(self) -> tts.SynthesisEvent:
374
+ evt = await self._event_queue.get()
375
+ if evt is None:
376
+ raise StopAsyncIteration
377
+
378
+ return evt
379
+
380
+
381
+ def dict_to_voices_list(data: dict) -> List[Voice]:
382
+ voices = []
383
+ for voice in data["voices"]:
384
+ voices.append(
385
+ Voice(
386
+ id=voice["voice_id"],
387
+ name=voice["name"],
388
+ category=voice["category"],
389
+ settings=None,
390
+ )
391
+ )
392
+ return voices
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.3.dev0"
15
+ __version__ = "0.4.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.3.dev0
3
+ Version: 0.4.dev0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -19,8 +19,8 @@ Classifier: Programming Language :: Python :: 3.10
19
19
  Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
- Requires-Dist: livekit~=0.9
23
- Requires-Dist: livekit-agents~=0.5.dev0
22
+ Requires-Dist: livekit~=0.11
23
+ Requires-Dist: livekit-agents~=0.6.dev0
24
24
  Requires-Dist: aiohttp>=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -2,6 +2,7 @@ README.md
2
2
  pyproject.toml
3
3
  setup.py
4
4
  livekit/plugins/elevenlabs/__init__.py
5
+ livekit/plugins/elevenlabs/log.py
5
6
  livekit/plugins/elevenlabs/models.py
6
7
  livekit/plugins/elevenlabs/py.typed
7
8
  livekit/plugins/elevenlabs/tts.py
@@ -0,0 +1,3 @@
1
+ livekit~=0.11
2
+ livekit-agents~=0.6.dev0
3
+ aiohttp>=3.8.5
@@ -50,8 +50,8 @@ setuptools.setup(
50
50
  packages=setuptools.find_namespace_packages(include=["livekit.*"]),
51
51
  python_requires=">=3.9.0",
52
52
  install_requires=[
53
- "livekit ~= 0.9",
54
- "livekit-agents~=0.5.dev0",
53
+ "livekit ~= 0.11",
54
+ "livekit-agents~=0.6.dev0",
55
55
  "aiohttp >= 3.8.5",
56
56
  ],
57
57
  package_data={
@@ -1,344 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- import asyncio
16
- import base64
17
- import contextlib
18
- import dataclasses
19
- import json
20
- import logging
21
- import os
22
- from dataclasses import dataclass
23
- from typing import Any, AsyncIterable, Dict, List, Optional
24
-
25
- import aiohttp
26
- from livekit import rtc
27
- from livekit.agents import tts
28
-
29
- from .models import TTSModels
30
-
31
-
32
@dataclasses.dataclass
class Voice:
    """Description of an ElevenLabs voice: id, display name, category, settings."""

    id: str
    name: str
    category: str
    # Forward reference: VoiceSettings is declared further down the module.
    settings: Optional["VoiceSettings"] = None
38
-
39
-
40
@dataclasses.dataclass
class VoiceSettings:
    """Per-voice tuning values that this module serializes as ``voice_settings``."""

    stability: float  # [0.0 - 1.0]
    similarity_boost: float  # [0.0 - 1.0]
    style: Optional[float] = None  # [0.0 - 1.0]
    use_speaker_boost: Optional[bool] = False
46
-
47
-
48
# Voice used by TTS() when the caller does not pass one explicitly.
DEFAULT_VOICE = Voice(
    id="EXAVITQu4vr4xnSDxMaL",
    name="Bella",
    category="premade",
    settings=VoiceSettings(
        stability=0.71,
        similarity_boost=0.5,
        style=0.0,
        use_speaker_boost=True,
    ),
)

# Default root URL for every request built in this module.
API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
# Name of the request header that carries the API key.
AUTHORIZATION_HEADER = "xi-api-key"
# Text value that this module sends to mark the end of a streamed utterance.
STREAM_EOS = ""
60
-
61
-
62
@dataclasses.dataclass
class TTSOptions:
    """Resolved configuration shared by ``TTS`` and the streams it creates."""

    api_key: str  # sent in the AUTHORIZATION_HEADER header
    voice: Voice  # voice whose id and settings are used for requests
    model_id: TTSModels  # model identifier sent with each request
    base_url: str  # API root, without a trailing path
    sample_rate: int  # used for the streaming `pcm_<rate>` output format
    latency: int  # forwarded as `optimize_streaming_latency`
70
-
71
-
72
class TTS(tts.TTS):
    """ElevenLabs text-to-speech: one-shot HTTP synthesis and streaming synthesis."""

    def __init__(
        self,
        *,
        voice: Voice = DEFAULT_VOICE,
        model_id: TTSModels = "eleven_multilingual_v2",
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        sample_rate: int = 24000,
        latency: int = 2,
    ) -> None:
        """Create the client.

        Args:
            voice: Voice to synthesize with.
            model_id: Model identifier sent with every request.
            api_key: API key; falls back to the ``ELEVEN_API_KEY`` environment
                variable when omitted.
            base_url: API root; defaults to ``API_BASE_URL_V1``.
            sample_rate: Rate used for the ``pcm_<rate>`` output format and for
                the produced audio frames.
            latency: Value passed as ``optimize_streaming_latency`` on the
                streaming URL.

        Raises:
            ValueError: If no API key is given and ``ELEVEN_API_KEY`` is unset.
        """
        super().__init__(streaming_supported=True)
        api_key = api_key or os.environ.get("ELEVEN_API_KEY")
        if not api_key:
            raise ValueError("ELEVEN_API_KEY must be set")

        self._session = aiohttp.ClientSession()
        self._config = TTSOptions(
            voice=voice,
            model_id=model_id,
            api_key=api_key,
            base_url=base_url or API_BASE_URL_V1,
            sample_rate=sample_rate,
            latency=latency,
        )

    async def list_voices(self) -> List[Voice]:
        """Fetch the voices available to this API key.

        Raises:
            aiohttp.ClientResponseError: If the API answers with an error status.
        """
        async with self._session.get(
            f"{self._config.base_url}/voices",
            headers={AUTHORIZATION_HEADER: self._config.api_key},
        ) as resp:
            # Fail with the HTTP error instead of a confusing KeyError on the
            # missing "voices" key of an error payload.
            resp.raise_for_status()
            data = await resp.json()
            return dict_to_voices_list(data)

    def synthesize(
        self,
        text: str,
    ) -> AsyncIterable[tts.SynthesizedAudio]:
        """Synthesize ``text`` in a single request.

        Returns an async generator that yields one ``SynthesizedAudio`` holding
        the whole response. HTTP errors propagate to the consumer.
        """
        voice = self._config.voice
        # Honour the configured rate, as the streaming path does, instead of a
        # hard-coded 44100 Hz.
        sample_rate = self._config.sample_rate

        async def generator():
            async with self._session.post(
                f"{self._config.base_url}/text-to-speech/{voice.id}?output_format=pcm_{sample_rate}",
                headers={AUTHORIZATION_HEADER: self._config.api_key},
                json=dict(
                    text=text,
                    model_id=self._config.model_id,
                    voice_settings=dataclasses.asdict(voice.settings)
                    if voice.settings
                    else None,
                ),
            ) as resp:
                # Without this check an error body would be wrapped in an
                # AudioFrame as if it were PCM.
                resp.raise_for_status()
                data = await resp.read()
                yield tts.SynthesizedAudio(
                    text=text,
                    data=rtc.AudioFrame(
                        data=data,
                        sample_rate=sample_rate,
                        num_channels=1,
                        samples_per_channel=len(data) // 2,  # 16-bit
                    ),
                )

        return generator()

    def stream(
        self,
    ) -> "SynthesizeStream":
        """Open a streaming synthesis session that shares this client's HTTP session."""
        return SynthesizeStream(self._session, self._config)
141
-
142
-
143
class SynthesizeStream(tts.SynthesizeStream):
    """Streaming synthesis over the ElevenLabs ``stream-input`` websocket.

    Text pushed with ``push_text`` is buffered and queued at splitter
    characters; a background task forwards it over a websocket while a
    listener task turns replies into ``SynthesisEvent`` objects.
    """

    def __init__(
        self,
        session: aiohttp.ClientSession,
        config: TTSOptions,
    ):
        """Start the background task; must be called with a running event loop."""
        self._config = config
        self._session = session

        self._queue = asyncio.Queue[str]()
        # None is the end-of-stream sentinel pushed when _run exits.
        self._event_queue = asyncio.Queue[Optional[tts.SynthesisEvent]]()
        self._closed = False
        self._text = ""

        self._main_task = asyncio.create_task(self._run(max_retry=32))

        def log_exception(task: asyncio.Task) -> None:
            if not task.cancelled() and task.exception():
                logging.error(f"elevenlabs synthesis task failed: {task.exception()}")

        self._main_task.add_done_callback(log_exception)

    def _stream_url(self) -> str:
        """Build the websocket URL from the configured voice, model, rate and latency."""
        base_url = self._config.base_url
        voice_id = self._config.voice.id
        model_id = self._config.model_id
        return f"{base_url}/text-to-speech/{voice_id}/stream-input?model_id={model_id}&output_format=pcm_{self._config.sample_rate}&optimize_streaming_latency={self._config.latency}"

    def push_text(self, token: Optional[str]) -> None:
        """Buffer ``token``; queue the buffer once the token ends in a splitter.

        ``None`` and empty tokens are ignored.

        Raises:
            ValueError: If the stream has already been closed.
        """
        # Optional[str] rather than `str | None`: the signature annotation is
        # evaluated at definition time and the `|` form fails on Python 3.9.
        if self._closed:
            raise ValueError("cannot push to a closed stream")

        if not token:
            return

        # TODO: Native word boundary detection may not be good enough for all languages
        # fmt: off
        splitters = (".", ",", "?", "!", ";", ":", "—", "-", "(", ")", "[", "]", "}", " ")
        # fmt: on

        self._text += token
        if token[-1] in splitters:
            self._queue.put_nowait(self._text)
            self._text = ""

    async def _run(self, max_retry: int) -> None:
        """Main loop: connect, forward queued text, reconnect on failure.

        Always marks the stream closed and pushes the end-of-stream sentinel on
        exit, so a consumer waiting in ``__anext__`` is woken up.
        """
        retry_count = 0
        listen_task: Optional[asyncio.Task] = None
        ws: Optional[aiohttp.ClientWebSocketResponse] = None
        retry_text_queue: asyncio.Queue[str] = asyncio.Queue()
        try:
            while True:
                try:
                    ws = await self._try_connect()
                    retry_count = 0  # reset retry count

                    listen_task = asyncio.create_task(self._listen_task(ws))

                    # forward queued text to 11labs
                    started = False
                    while not ws.closed:
                        text = None
                        if not retry_text_queue.empty():
                            text = await retry_text_queue.get()
                            retry_text_queue.task_done()
                        else:
                            text = await self._queue.get()

                        if not started:
                            self._event_queue.put_nowait(
                                tts.SynthesisEvent(type=tts.SynthesisEventType.STARTED)
                            )
                            started = True
                        text_packet = dict(
                            text=text,
                            try_trigger_generation=True,
                        )

                        # This case can happen in normal operation because 11labs will not
                        # keep connections open indefinitely if we are not sending data.
                        try:
                            await ws.send_str(json.dumps(text_packet))
                        except Exception:
                            await retry_text_queue.put(text)
                            break

                        # We call self._queue.task_done() even if we are retrying the text because
                        # all text has gone through self._queue. An exception may have short-circuited
                        # out of the loop so task_done() will not have already been called on text that
                        # is being retried.
                        self._queue.task_done()
                        if text == STREAM_EOS:
                            await listen_task
                            # We know 11labs is closing the stream after each request/flush
                            self._event_queue.put_nowait(
                                tts.SynthesisEvent(type=tts.SynthesisEventType.FINISHED)
                            )
                            break

                    # Release the finished connection before opening the next one.
                    if not ws.closed:
                        with contextlib.suppress(Exception):
                            await ws.close()

                except asyncio.CancelledError:
                    if ws:
                        await ws.close()
                    if listen_task:
                        await asyncio.shield(listen_task)
                    break
                except Exception as e:
                    if retry_count > max_retry and max_retry > 0:
                        logging.error(f"failed to connect to ElevenLabs: {e}")
                        break

                    retry_delay = min(retry_count * 5, 5)  # max 5s
                    retry_count += 1
                    logging.warning(
                        f"failed to connect to ElevenLabs: {e} - retrying in {retry_delay}s"
                    )
                    await asyncio.sleep(retry_delay)
        finally:
            # Runs on every exit path, including cancellation during the retry
            # sleep, which previously skipped the `_closed` assignment.
            self._closed = True
            self._event_queue.put_nowait(None)

    async def _try_connect(self) -> aiohttp.ClientWebSocketResponse:
        """Open the websocket and send the initial packet with the voice settings."""
        ws = await self._session.ws_connect(
            self._stream_url(),
            headers={AUTHORIZATION_HEADER: self._config.api_key},
        )

        voice = self._config.voice
        voice_settings = dataclasses.asdict(voice.settings) if voice.settings else None

        init_packet = dict(
            text=" ",
            voice_settings=voice_settings,
        )
        await ws.send_str(json.dumps(init_packet))
        return ws

    async def _listen_task(self, ws: aiohttp.ClientWebSocketResponse) -> None:
        """Read websocket messages until close or ``isFinal``, emitting AUDIO events."""
        while True:
            msg = await ws.receive()

            if msg.type in (
                aiohttp.WSMsgType.CLOSED,
                aiohttp.WSMsgType.CLOSE,
                aiohttp.WSMsgType.CLOSING,
            ):
                break

            if msg.type != aiohttp.WSMsgType.TEXT:
                continue

            jsonMessage: Dict[str, Any] = json.loads(str(msg.data))
            if jsonMessage.get("audio"):
                data = base64.b64decode(jsonMessage["audio"])
                audio_frame = rtc.AudioFrame(
                    data=data,
                    sample_rate=self._config.sample_rate,
                    num_channels=1,
                    samples_per_channel=len(data) // 2,
                )
                self._event_queue.put_nowait(
                    tts.SynthesisEvent(
                        type=tts.SynthesisEventType.AUDIO,
                        audio=tts.SynthesizedAudio(text="", data=audio_frame),
                    )
                )
            elif jsonMessage.get("isFinal"):
                break
            else:
                logging.error(f"Unhandled message from ElevenLabs: {msg}")

    async def flush(self) -> None:
        """Queue the buffered text plus the end marker and wait until both are consumed."""
        self._queue.put_nowait(self._text + " ")
        self._text = ""
        self._queue.put_nowait(STREAM_EOS)
        await self._queue.join()

    async def aclose(self, wait=False) -> None:
        """Cancel the background task and wait for it to finish.

        ``wait=True`` is not supported; a warning is logged and the stream is
        closed immediately.
        """
        if wait:
            logging.warning(
                "wait=True is not yet supported for ElevenLabs TTS. Closing immediately."
            )
        self._main_task.cancel()
        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def __anext__(self) -> tts.SynthesisEvent:
        """Return the next event, or stop once the stream has ended.

        The ``None`` sentinel wakes a consumer that was already waiting when
        the stream ended; the up-front check only covers later calls.
        """
        if self._closed and self._event_queue.empty():
            raise StopAsyncIteration

        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
331
-
332
-
333
def dict_to_voices_list(data: dict) -> List[Voice]:
    """Convert a decoded voices response into ``Voice`` objects.

    Reads ``voice_id``, ``name`` and ``category`` from every entry of
    ``data["voices"]``; settings are left unset.
    """
    return [
        Voice(
            id=entry["voice_id"],
            name=entry["name"],
            category=entry["category"],
            settings=None,
        )
        for entry in data["voices"]
    ]
@@ -1,3 +0,0 @@
1
- livekit~=0.9
2
- livekit-agents~=0.5.dev0
3
- aiohttp>=3.8.5