PyPI - livekit-plugins-elevenlabs - Versions diffs - 0.7.14__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

livekit-plugins-elevenlabs 0.7.14__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

livekit/plugins/elevenlabs/models.py CHANGED Viewed

@@ -10,14 +10,4 @@ TTSModels = Literal[
     "eleven_flash_v2",
 ]
-TTSEncoding = Literal[
-    "mp3_22050_32",
-    "mp3_44100_32",
-    "mp3_44100_64",
-    "mp3_44100_96",
-    "mp3_44100_128",
-    "mp3_44100_192",
-    "pcm_16000",
-    "pcm_22050",
-    "pcm_44100",
-]
+TTSEncoding = Literal["mp3_44100",]

livekit/plugins/elevenlabs/tts.py CHANGED Viewed

@@ -21,10 +21,9 @@ import json
 import os
 import weakref
 from dataclasses import dataclass
-from typing import Any, List, Literal, Optional
+from typing import Any, List, Optional
 import aiohttp
-from livekit import rtc
 from livekit.agents import (
     APIConnectionError,
     APIConnectOptions,
@@ -38,28 +37,20 @@ from livekit.agents import (
 from .log import logger
 from .models import TTSEncoding, TTSModels
-_Encoding = Literal["mp3", "pcm"]
+_DefaultEncoding: TTSEncoding = "mp3_44100"
 def _sample_rate_from_format(output_format: TTSEncoding) -> int:
-    split = output_format.split("_")  # e.g: mp3_22050_32
+    split = output_format.split("_")  # e.g: mp3_44100
     return int(split[1])
-def _encoding_from_format(output_format: TTSEncoding) -> _Encoding:
-    if output_format.startswith("mp3"):
-        return "mp3"
-    elif output_format.startswith("pcm"):
-        return "pcm"
-    raise ValueError(f"Unknown format: {output_format}")
 @dataclass
 class VoiceSettings:
     stability: float  # [0.0 - 1.0]
     similarity_boost: float  # [0.0 - 1.0]
     style: float | None = None  # [0.0 - 1.0]
+    speed: float | None = 1.0  # [0.8 - 1.2]
     use_speaker_boost: bool | None = False
@@ -76,12 +67,17 @@ DEFAULT_VOICE = Voice(
     name="Bella",
     category="premade",
     settings=VoiceSettings(
-        stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True
+        stability=0.71,
+        speed=1.0,
+        similarity_boost=0.5,
+        style=0.0,
+        use_speaker_boost=True,
     ),
 )
 API_BASE_URL_V1 = "https://api.elevenlabs.io/v1"
 AUTHORIZATION_HEADER = "xi-api-key"
+WS_INACTIVITY_TIMEOUT = 300
 @dataclass
@@ -97,6 +93,7 @@ class _TTSOptions:
     word_tokenizer: tokenize.WordTokenizer
     chunk_length_schedule: list[int]
     enable_ssml_parsing: bool
+    inactivity_timeout: int
 class TTS(tts.TTS):
@@ -107,11 +104,9 @@ class TTS(tts.TTS):
         model: TTSModels | str = "eleven_flash_v2_5",
         api_key: str | None = None,
         base_url: str | None = None,
-        encoding: TTSEncoding = "mp3_22050_32",
-        streaming_latency: int = 3,
-        word_tokenizer: tokenize.WordTokenizer = tokenize.basic.WordTokenizer(
-            ignore_punctuation=False  # punctuation can help for intonation
-        ),
+        streaming_latency: int = 0,
+        inactivity_timeout: int = WS_INACTIVITY_TIMEOUT,
+        word_tokenizer: Optional[tokenize.WordTokenizer] = None,
         enable_ssml_parsing: bool = False,
         chunk_length_schedule: list[int] = [80, 120, 200, 260],  # range is [50, 500]
         http_session: aiohttp.ClientSession | None = None,
@@ -127,8 +122,8 @@ class TTS(tts.TTS):
             model (TTSModels | str): TTS model to use. Defaults to "eleven_turbo_v2_5".
             api_key (str | None): ElevenLabs API key. Can be set via argument or `ELEVEN_API_KEY` environment variable.
             base_url (str | None): Custom base URL for the API. Optional.
-            encoding (TTSEncoding): Audio encoding format. Defaults to "mp3_22050_32".
-            streaming_latency (int): Latency in seconds for streaming. Defaults to 3.
+            streaming_latency (int): Optimize for streaming latency, defaults to 0 - disabled. 4 for max latency optimizations. deprecated
+            inactivity_timeout (int): Inactivity timeout in seconds for the websocket connection. Defaults to 300.
             word_tokenizer (tokenize.WordTokenizer): Tokenizer for processing text. Defaults to basic WordTokenizer.
             enable_ssml_parsing (bool): Enable SSML parsing for input text. Defaults to False.
             chunk_length_schedule (list[int]): Schedule for chunk lengths, ranging from 50 to 500. Defaults to [80, 120, 200, 260].
@@ -140,7 +135,7 @@ class TTS(tts.TTS):
             capabilities=tts.TTSCapabilities(
                 streaming=True,
             ),
-            sample_rate=_sample_rate_from_format(encoding),
+            sample_rate=_sample_rate_from_format(_DefaultEncoding),
             num_channels=1,
         )
@@ -156,23 +151,31 @@ class TTS(tts.TTS):
                 "ElevenLabs API key is required, either as argument or set ELEVEN_API_KEY environmental variable"
             )
+        if word_tokenizer is None:
+            word_tokenizer = tokenize.basic.WordTokenizer(
+                ignore_punctuation=False  # punctuation can help for intonation
+            )
         self._opts = _TTSOptions(
             voice=voice,
             model=model,
             api_key=api_key,
             base_url=base_url or API_BASE_URL_V1,
-            encoding=encoding,
+            encoding=_DefaultEncoding,
             sample_rate=self.sample_rate,
             streaming_latency=streaming_latency,
             word_tokenizer=word_tokenizer,
             chunk_length_schedule=chunk_length_schedule,
             enable_ssml_parsing=enable_ssml_parsing,
             language=language,
+            inactivity_timeout=inactivity_timeout,
         )
         self._session = http_session
         self._pool = utils.ConnectionPool[aiohttp.ClientWebSocketResponse](
             connect_cb=self._connect_ws,
             close_cb=self._close_ws,
+            max_session_duration=inactivity_timeout,
+            mark_refreshed_on_get=True,
         )
         self._streams = weakref.WeakSet[SynthesizeStream]()
@@ -195,6 +198,9 @@ class TTS(tts.TTS):
         return self._session
+    def prewarm(self) -> None:
+        self._pool.prewarm()
     async def list_voices(self) -> List[Voice]:
         async with self._ensure_session().get(
             f"{self._opts.base_url}/voices",
@@ -262,15 +268,9 @@ class ChunkedStream(tts.ChunkedStream):
     ) -> None:
         super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
         self._opts, self._session = opts, session
-        if _encoding_from_format(self._opts.encoding) == "mp3":
-            self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
     async def _run(self) -> None:
         request_id = utils.shortuuid()
-        bstream = utils.audio.AudioByteStream(
-            sample_rate=self._opts.sample_rate, num_channels=1
-        )
         voice_settings = (
             _strip_nones(dataclasses.asdict(self._opts.voice.settings))
             if self._opts.voice.settings
@@ -282,6 +282,12 @@ class ChunkedStream(tts.ChunkedStream):
             "voice_settings": voice_settings,
         }
+        decoder = utils.codecs.AudioStreamDecoder(
+            sample_rate=self._opts.sample_rate,
+            num_channels=1,
+        )
+        decode_task: asyncio.Task | None = None
         try:
             async with self._session.post(
                 _synthesize_url(self._opts),
@@ -293,32 +299,21 @@ class ChunkedStream(tts.ChunkedStream):
                     logger.error("11labs returned non-audio data: %s", content)
                     return
-                encoding = _encoding_from_format(self._opts.encoding)
-                if encoding == "mp3":
-                    async for bytes_data, _ in resp.content.iter_chunks():
-                        for frame in self._mp3_decoder.decode_chunk(bytes_data):
-                            for frame in bstream.write(frame.data.tobytes()):
-                                self._event_ch.send_nowait(
-                                    tts.SynthesizedAudio(
-                                        request_id=request_id,
-                                        frame=frame,
-                                    )
-                                )
-                else:
-                    async for bytes_data, _ in resp.content.iter_chunks():
-                        for frame in bstream.write(bytes_data):
-                            self._event_ch.send_nowait(
-                                tts.SynthesizedAudio(
-                                    request_id=request_id,
-                                    frame=frame,
-                                )
-                            )
-                for frame in bstream.flush():
-                    self._event_ch.send_nowait(
-                        tts.SynthesizedAudio(request_id=request_id, frame=frame)
-                    )
+                async def _decode_loop():
+                    try:
+                        async for bytes_data, _ in resp.content.iter_chunks():
+                            decoder.push(bytes_data)
+                    finally:
+                        decoder.end_input()
+                decode_task = asyncio.create_task(_decode_loop())
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
         except asyncio.TimeoutError as e:
             raise APITimeoutError() from e
         except aiohttp.ClientResponseError as e:
@@ -330,6 +325,10 @@ class ChunkedStream(tts.ChunkedStream):
             ) from e
         except Exception as e:
             raise APIConnectionError() from e
+        finally:
+            if decode_task:
+                await utils.aio.gracefully_cancel(decode_task)
+            await decoder.aclose()
 class SynthesizeStream(tts.SynthesizeStream):
@@ -344,7 +343,6 @@ class SynthesizeStream(tts.SynthesizeStream):
     ):
         super().__init__(tts=tts)
         self._opts, self._pool = opts, pool
-        self._mp3_decoder = utils.codecs.Mp3StreamDecoder()
     async def _run(self) -> None:
         request_id = utils.shortuuid()
@@ -360,12 +358,13 @@ class SynthesizeStream(tts.SynthesizeStream):
                         # new segment (after flush for e.g)
                         word_stream = self._opts.word_tokenizer.stream()
                         self._segments_ch.send_nowait(word_stream)
                     word_stream.push_text(input)
                 elif isinstance(input, self._FlushSentinel):
                     if word_stream is not None:
                         word_stream.end_input()
                     word_stream = None
+            if word_stream is not None:
+                word_stream.end_input()
             self._segments_ch.close()
         @utils.log_exceptions(logger=logger)
@@ -402,6 +401,11 @@ class SynthesizeStream(tts.SynthesizeStream):
             segment_id = utils.shortuuid()
             expected_text = ""  # accumulate all tokens sent
+            decoder = utils.codecs.AudioStreamDecoder(
+                sample_rate=self._opts.sample_rate,
+                num_channels=1,
+            )
             # 11labs protocol expects the first message to be an "init msg"
             init_pkt = dict(
                 text=" ",
@@ -416,6 +420,7 @@ class SynthesizeStream(tts.SynthesizeStream):
             )
             await ws_conn.send_str(json.dumps(init_pkt))
+            @utils.log_exceptions(logger=logger)
             async def send_task():
                 nonlocal expected_text
                 xml_content = []
@@ -442,27 +447,23 @@ class SynthesizeStream(tts.SynthesizeStream):
                     logger.warning("11labs stream ended with incomplete xml content")
                 await ws_conn.send_str(json.dumps({"flush": True}))
+            # consumes from decoder and generates events
+            @utils.log_exceptions(logger=logger)
+            async def generate_task():
+                emitter = tts.SynthesizedAudioEmitter(
+                    event_ch=self._event_ch,
+                    request_id=request_id,
+                    segment_id=segment_id,
+                )
+                async for frame in decoder:
+                    emitter.push(frame)
+                emitter.flush()
+            # receives from ws and decodes audio
+            @utils.log_exceptions(logger=logger)
             async def recv_task():
                 nonlocal expected_text
                 received_text = ""
-                audio_bstream = utils.audio.AudioByteStream(
-                    sample_rate=self._opts.sample_rate,
-                    num_channels=1,
-                )
-                last_frame: rtc.AudioFrame | None = None
-                def _send_last_frame(*, segment_id: str, is_final: bool) -> None:
-                    nonlocal last_frame
-                    if last_frame is not None:
-                        self._event_ch.send_nowait(
-                            tts.SynthesizedAudio(
-                                request_id=request_id,
-                                segment_id=segment_id,
-                                frame=last_frame,
-                                is_final=is_final,
-                            )
-                        )
-                        last_frame = None
                 while True:
                     msg = await ws_conn.receive()
@@ -481,45 +482,36 @@ class SynthesizeStream(tts.SynthesizeStream):
                         continue
                     data = json.loads(msg.data)
-                    encoding = _encoding_from_format(self._opts.encoding)
                     if data.get("audio"):
                         b64data = base64.b64decode(data["audio"])
-                        if encoding == "mp3":
-                            for frame in self._mp3_decoder.decode_chunk(b64data):
-                                for frame in audio_bstream.write(frame.data.tobytes()):
-                                    _send_last_frame(
-                                        segment_id=segment_id, is_final=False
-                                    )
-                                    last_frame = frame
-                        else:
-                            for frame in audio_bstream.write(b64data):
-                                _send_last_frame(segment_id=segment_id, is_final=False)
-                                last_frame = frame
-                    elif data.get("isFinal"):
-                        for frame in audio_bstream.flush():
-                            _send_last_frame(segment_id=segment_id, is_final=False)
-                            last_frame = frame
-                        _send_last_frame(segment_id=segment_id, is_final=True)
-                        break
+                        decoder.push(b64data)
+                        if alignment := data.get("normalizedAlignment"):
+                            received_text += "".join(
+                                alignment.get("chars", [])
+                            ).replace(" ", "")
+                            if received_text == expected_text:
+                                decoder.end_input()
+                                break
                     elif data.get("error"):
-                        logger.error("11labs reported an error: %s", data["error"])
+                        raise APIStatusError(
+                            message=data["error"],
+                            status_code=500,
+                            request_id=request_id,
+                            body=None,
+                        )
                     else:
-                        logger.error("unexpected 11labs message %s", data)
-                    if alignment := data.get("normalizedAlignment"):
-                        received_text += "".join(alignment.get("chars", [])).replace(
-                            " ", ""
+                        raise APIStatusError(
+                            message=f"unexpected 11labs message {data}",
+                            status_code=500,
+                            request_id=request_id,
+                            body=None,
                         )
-                        if received_text == expected_text:
-                            for frame in audio_bstream.flush():
-                                _send_last_frame(segment_id=segment_id, is_final=False)
-                                last_frame = frame
-                            _send_last_frame(segment_id=segment_id, is_final=True)
-                            break
             tasks = [
                 asyncio.create_task(send_task()),
                 asyncio.create_task(recv_task()),
+                asyncio.create_task(generate_task()),
             ]
             try:
                 await asyncio.gather(*tasks)
@@ -532,10 +524,13 @@ class SynthesizeStream(tts.SynthesizeStream):
                     request_id=request_id,
                     body=None,
                 ) from e
+            except APIStatusError:
+                raise
             except Exception as e:
                 raise APIConnectionError() from e
             finally:
                 await utils.aio.gracefully_cancel(*tasks)
+                await decoder.aclose()
 def _dict_to_voices_list(data: dict[str, Any]):
@@ -561,11 +556,13 @@ def _synthesize_url(opts: _TTSOptions) -> str:
     voice_id = opts.voice.id
     model_id = opts.model
     output_format = opts.encoding
-    latency = opts.streaming_latency
-    return (
+    url = (
         f"{base_url}/text-to-speech/{voice_id}/stream?"
-        f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}"
+        f"model_id={model_id}&output_format={output_format}"
     )
+    if opts.streaming_latency:
+        url += f"&optimize_streaming_latency={opts.streaming_latency}"
+    return url
 def _stream_url(opts: _TTSOptions) -> str:
@@ -573,14 +570,16 @@ def _stream_url(opts: _TTSOptions) -> str:
     voice_id = opts.voice.id
     model_id = opts.model
     output_format = opts.encoding
-    latency = opts.streaming_latency
     enable_ssml = str(opts.enable_ssml_parsing).lower()
     language = opts.language
+    inactivity_timeout = opts.inactivity_timeout
     url = (
         f"{base_url}/text-to-speech/{voice_id}/stream-input?"
-        f"model_id={model_id}&output_format={output_format}&optimize_streaming_latency={latency}&"
-        f"enable_ssml_parsing={enable_ssml}"
+        f"model_id={model_id}&output_format={output_format}&"
+        f"enable_ssml_parsing={enable_ssml}&inactivity_timeout={inactivity_timeout}"
     )
     if language is not None:
         url += f"&language_code={language}"
+    if opts.streaming_latency:
+        url += f"&optimize_streaming_latency={opts.streaming_latency}"
     return url

livekit/plugins/elevenlabs/version.py CHANGED Viewed

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.7.14"
+__version__ = "0.8.0"

{livekit_plugins_elevenlabs-0.7.14.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: livekit-plugins-elevenlabs
-Version: 0.7.14
+Version: 0.8.0
 Summary: Agent Framework plugin for voice synthesis with ElevenLabs' API.
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -19,7 +19,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.9.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit-agents[codecs]>=0.12.11
+Requires-Dist: livekit-agents[codecs]<1.0.0,>=0.12.16
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type

livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,10 @@
+livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
+livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
+livekit/plugins/elevenlabs/models.py,sha256=nB43wLS1ilzS7IxLYVSQxBjKPnbiPl4AHpHAOlG2i00,273
+livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+livekit/plugins/elevenlabs/tts.py,sha256=KCZnuAngDZck4zIMMgp0BLV0GS31kKChMvdvXUVZ8vY,20491
+livekit/plugins/elevenlabs/version.py,sha256=fObgfvFfJb5Vj0qY1hgEiVKSo6z6atjrJvwAVl4KvR4,600
+livekit_plugins_elevenlabs-0.8.0.dist-info/METADATA,sha256=BwddENtvF9zqxTgjgIsHyavyRfA82TBISYEVwFfo2vs,1529
+livekit_plugins_elevenlabs-0.8.0.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
+livekit_plugins_elevenlabs-0.8.0.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
+livekit_plugins_elevenlabs-0.8.0.dist-info/RECORD,,

livekit_plugins_elevenlabs-0.7.14.dist-info/RECORD DELETED Viewed

@@ -1,10 +0,0 @@
-livekit/plugins/elevenlabs/__init__.py,sha256=YZVadomFq3JWiZN6GWXJbuE4vaNNWq1CmdH25du8qwg,1249
-livekit/plugins/elevenlabs/log.py,sha256=hIuXqDsEB5GBa7rQY3z4Uqi1oCqc_lRmCHZEmXz0LHw,73
-livekit/plugins/elevenlabs/models.py,sha256=cVoaMYNlUXZzP-HOpbtU16OM9m-bACnSat8-o87tTyk,435
-livekit/plugins/elevenlabs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-livekit/plugins/elevenlabs/tts.py,sha256=_d8V_YLx1tuScKtmDipoKHhqF3y68lXg03phixEHU3M,21419
-livekit/plugins/elevenlabs/version.py,sha256=1Trenk6kp4J1gdS0z55hdro60GNOnD1s0F3-AoNr4VM,601
-livekit_plugins_elevenlabs-0.7.14.dist-info/METADATA,sha256=WGgcKpZb9PYymh1pNvF7B5dhLXUlQj3n0ALlwJmfYfE,1523
-livekit_plugins_elevenlabs-0.7.14.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
-livekit_plugins_elevenlabs-0.7.14.dist-info/top_level.txt,sha256=OoDok3xUmXbZRvOrfvvXB-Juu4DX79dlq188E19YHoo,8
-livekit_plugins_elevenlabs-0.7.14.dist-info/RECORD,,

{livekit_plugins_elevenlabs-0.7.14.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{livekit_plugins_elevenlabs-0.7.14.dist-info → livekit_plugins_elevenlabs-0.8.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

livekit-plugins-elevenlabs 0.7.14__py3-none-any.whl → 0.8.0__py3-none-any.whl

livekit-plugins-elevenlabs 0.7.14__py3-none-any.whl → 0.8.0__py3-none-any.whl