livekit-plugins-elevenlabs 0.5.dev0__py3-none-any.whl → 0.6.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,10 +12,19 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
+ from .models import TTSEncoding, TTSModels
15
16
  from .tts import DEFAULT_VOICE, TTS, Voice, VoiceSettings
16
17
  from .version import __version__
17
18
 
18
- __all__ = ["TTS", "Voice", "VoiceSettings", "DEFAULT_VOICE", "__version__"]
19
+ __all__ = [
20
+ "TTS",
21
+ "Voice",
22
+ "VoiceSettings",
23
+ "TTSEncoding",
24
+ "TTSModels",
25
+ "DEFAULT_VOICE",
26
+ "__version__",
27
+ ]
19
28
 
20
29
  from livekit.agents import Plugin
21
30
 
@@ -6,3 +6,15 @@ TTSModels = Literal[
6
6
  "eleven_multilingual_v2",
7
7
  "eleven_turbo_v2",
8
8
  ]
9
+
10
+ TTSEncoding = Literal[
11
+ "mp3_22050_32",
12
+ "mp3_44100_32",
13
+ "mp3_44100_64",
14
+ "mp3_44100_96",
15
+ "mp3_44100_128",
16
+ "mp3_44100_192",
17
+ "pcm_16000",
18
+ "pcm_22050",
19
+ "pcm_44100",
20
+ ]
@@ -21,14 +21,36 @@ import dataclasses
21
21
  import json
22
22
  import os
23
23
  from dataclasses import dataclass
24
- from typing import List, Optional
24
+ from typing import List, Literal, Optional
25
25
 
26
26
  import aiohttp
27
27
  from livekit import rtc
28
- from livekit.agents import aio, tokenize, tts, utils
28
+ from livekit.agents import aio, codecs, tokenize, tts, utils
29
29
 
30
30
  from .log import logger
31
- from .models import TTSModels
31
+ from .models import (
32
+ TTSEncoding,
33
+ TTSModels,
34
+ )
35
+
36
+ _Encoding = Literal[
37
+ "mp3",
38
+ "pcm",
39
+ ]
40
+
41
+
42
+ def _sample_rate_from_format(output_format: TTSEncoding) -> int:
43
+ split = output_format.split("_") # e.g: mp3_22050_32
44
+ return int(split[1])
45
+
46
+
47
+ def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
48
+ if output_format.startswith("mp3"):
49
+ return "mp3"
50
+ elif output_format.startswith("pcm"):
51
+ return "pcm"
52
+
53
+ raise ValueError(f"Unknown format: {output_format}")
32
54
 
33
55
 
34
56
  @dataclass
@@ -66,6 +88,7 @@ class _TTSOptions:
66
88
  voice: Voice
67
89
  model_id: TTSModels
68
90
  base_url: str
91
+ encoding: TTSEncoding
69
92
  sample_rate: int
70
93
  streaming_latency: int
71
94
  word_tokenizer: tokenize.WordTokenizer
@@ -80,7 +103,7 @@ class TTS(tts.TTS):
80
103
  model_id: TTSModels = "eleven_turbo_v2",
81
104
  api_key: str | None = None,
82
105
  base_url: str | None = None,
83
- sample_rate: int = 24000,
106
+ encoding: TTSEncoding = "mp3_22050_32",
84
107
  streaming_latency: int = 3,
85
108
  word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
86
109
  ignore_punctuation=False # punctuation can help for intonation
@@ -91,7 +114,9 @@ class TTS(tts.TTS):
91
114
  http_session: aiohttp.ClientSession | None = None,
92
115
  ) -> None:
93
116
  super().__init__(
94
- streaming_supported=True, sample_rate=sample_rate, num_channels=1
117
+ streaming_supported=True,
118
+ sample_rate=_sample_rate_from_format(encoding),
119
+ num_channels=1,
95
120
  )
96
121
  api_key = api_key or os.environ.get("ELEVEN_API_KEY")
97
122
  if not api_key:
@@ -102,7 +127,8 @@ class TTS(tts.TTS):
102
127
  model_id=model_id,
103
128
  api_key=api_key,
104
129
  base_url=base_url or API_BASE_URL_V1,
105
- sample_rate=sample_rate,
130
+ encoding=encoding,
131
+ sample_rate=self.sample_rate,
106
132
  streaming_latency=streaming_latency,
107
133
  word_tokenizer=word_tokenizer,
108
134
  chunk_length_schedule=chunk_length_schedule,
@@ -150,7 +176,7 @@ class ChunkedStream(tts.ChunkedStream):
150
176
  base_url = self._opts.base_url
151
177
  voice_id = self._opts.voice.id
152
178
  model_id = self._opts.model_id
153
- sample_rate = self._opts.sample_rate
179
+ sample_rate = _sample_rate_from_format(self._opts.encoding)
154
180
  latency = self._opts.streaming_latency
155
181
  url = (
156
182
  f"{base_url}/text-to-speech/{voice_id}/stream?"
@@ -260,11 +286,11 @@ class SynthesizeStream(tts.SynthesizeStream):
260
286
  base_url = self._opts.base_url
261
287
  voice_id = self._opts.voice.id
262
288
  model_id = self._opts.model_id
263
- sample_rate = self._opts.sample_rate
289
+ output_format = self._opts.encoding
264
290
  latency = self._opts.streaming_latency
265
291
  url = (
266
292
  f"{base_url}/text-to-speech/{voice_id}/stream-input?"
267
- f"model_id={model_id}&output_format=pcm_{sample_rate}&optimize_streaming_latency={latency}"
293
+ f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
268
294
  )
269
295
 
270
296
  return url
@@ -417,6 +443,8 @@ class SynthesizeStream(tts.SynthesizeStream):
417
443
  all_tokens_consumed = True
418
444
 
419
445
  async def recv_task():
446
+ encoding = _encoding_from_format(self._opts.encoding)
447
+ mp3_decoder = codecs.Mp3StreamDecoder()
420
448
  while True:
421
449
  msg = await ws_conn.receive()
422
450
  if msg.type in (
@@ -437,19 +465,32 @@ class SynthesizeStream(tts.SynthesizeStream):
437
465
  continue
438
466
 
439
467
  data: dict = json.loads(msg.data)
440
- if data.get("audio"):
441
- b64data = base64.b64decode(data["audio"])
442
-
443
- frame = rtc.AudioFrame(
444
- data=b64data,
445
- sample_rate=self._opts.sample_rate,
446
- num_channels=1,
447
- samples_per_channel=len(b64data) // 2,
448
- )
468
+ audio = data.get("audio")
469
+
470
+ if data.get("error"):
471
+ logger.error("11labs error %s", data)
472
+ return
473
+ elif audio is not None:
474
+ if audio == "":
475
+ # 11labs sometimes sends empty audio, ignore
476
+ continue
477
+
478
+ b64data = base64.b64decode(audio)
479
+ frame: rtc.AudioFrame
480
+ if encoding == "mp3":
481
+ frames = mp3_decoder.decode_chunk(b64data)
482
+ frame = utils.merge_frames(frames)
483
+ else:
484
+ frame = rtc.AudioFrame(
485
+ data=b64data,
486
+ sample_rate=self._opts.sample_rate,
487
+ num_channels=1,
488
+ samples_per_channel=len(b64data) // 2,
489
+ )
449
490
 
450
491
  text = ""
451
492
  if data.get("alignment"):
452
- text = data["alignment"].get("chars", "")
493
+ text = "".join(data["alignment"].get("chars", ""))
453
494
 
454
495
  audio_tx.send_nowait(tts.SynthesizedAudio(text=text, data=frame))
455
496
  continue
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.5.dev0"
15
+ __version__ = "0.6.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-elevenlabs
3
- Version: 0.5.dev0
3
+ Version: 0.6.dev0
4
4
  Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,7 +20,7 @@ Classifier: Programming Language :: Python :: 3 :: Only
20
20
  Requires-Python: >=3.9.0
21
21
  Description-Content-Type: text/markdown
22
22
  Requires-Dist: livekit ~=0.11
23
- Requires-Dist: livekit-agents ~=0.7.dev0
23
+ Requires-Dist: livekit-agents[codecs] ~=0.8.dev0
24
24
  Requires-Dist: aiohttp >=3.8.5
25
25
 
26
26
  # LiveKit Plugins Elevenlabs
@@ -0,0 +1,10 @@
1
+ livekit/plugins/elevenlabs/__init__.py,sha256=ez1ybDPt7GfKAKgPkxZFRB7Vyd-_i-0hfUMI79GQ5w4,1091
2
+ livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
+ livekit/plugins/elevenlabs/models.py,sha256=8jTchztgpiTokHEaWUK8PPxWWfvm5SMrOGsJpzxbYAw,362
4
+ livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ livekit/plugins/elevenlabs/tts.py,sha256=GTcyQwBVVPzCYLgsnw9q5oFOq9cV3hIKndDaBPSFMr4,17738
6
+ livekit/plugins/elevenlabs/version.py,sha256=yB6WnbnD5MFhQDT5ItJ02XWVsNanlDYiOezzwv0IdcM,603
7
+ livekit_plugins_elevenlabs-0.6.dev0.dist-info/METADATA,sha256=kfWET-iNGQYX7TGoo87CiMIoMINIwE28YT4-hbp8NDY,1373
8
+ livekit_plugins_elevenlabs-0.6.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
+ livekit_plugins_elevenlabs-0.6.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
+ livekit_plugins_elevenlabs-0.6.dev0.dist-info/RECORD,,
@@ -1,10 +0,0 @@
1
- livekit/plugins/elevenlabs/__init__.py,sha256=_IMIfE4YA7d3NxrN-iCrdfQ19mwh93SY676RJGEA57c,989
2
- livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
3
- livekit/plugins/elevenlabs/models.py,sha256=g46mCMMHP3x3qtHmybHHMcid1UwmjKCcF0T4IWjMjWE,163
4
- livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/elevenlabs/tts.py,sha256=p7mEiUDR6gbqEUrLp1lgTkJ3ounN6rhnenYoYqWNF2k,16418
6
- livekit/plugins/elevenlabs/version.py,sha256=h2gCxcJSMvCrVP7h14ON6HaghqLCkbl3--HZKEopR_8,603
7
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/METADATA,sha256=5uCb2q4zTTGaCSSN448GLqhj9-41bg0jjR2CSeov8ms,1365
8
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
9
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
10
- livekit_plugins_elevenlabs-0.5.dev0.dist-info/RECORD,,