livekit-plugins-cartesia 1.0.22__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in the public registry they were published to. It is provided for informational purposes only.
@@ -17,10 +17,11 @@
  See https://docs.livekit.io/agents/integrations/tts/cartesia/ for more information.
  """

+ from .stt import STT
  from .tts import TTS, ChunkedStream
  from .version import __version__

- __all__ = ["TTS", "ChunkedStream", "__version__"]
+ __all__ = ["STT", "TTS", "ChunkedStream", "__version__"]

  from livekit.agents import Plugin

@@ -28,7 +29,7 @@ from .log import logger


  class CartesiaPlugin(Plugin):
-     def __init__(self):
+     def __init__(self) -> None:
          super().__init__(__name__, __version__, __package__, logger)


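With these `__init__.py` changes, `STT` is exported from the plugin package alongside `TTS`. A minimal usage sketch, assuming `CARTESIA_API_KEY` is set in the environment (the surrounding wiring is illustrative, not part of this diff):

    from livekit.plugins import cartesia

    # Both entry points now come straight from the package namespace;
    # each reads CARTESIA_API_KEY from the environment when api_key is omitted.
    speech_to_text = cartesia.STT()
    text_to_speech = cartesia.TTS()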
@@ -39,3 +39,53 @@ TTSVoiceEmotion = Literal[
      "curiosity:high",
      "curiosity:highest",
  ]
+
+ # STT model definitions
+ STTEncoding = Literal["pcm_s16le",]
+
+ STTModels = Literal["ink-whisper"]
+ STTLanguages = Literal[
+     "en",
+     "de",
+     "es",
+     "fr",
+     "ja",
+     "pt",
+     "zh",
+     "hi",
+     "ko",
+     "it",
+     "nl",
+     "pl",
+     "ru",
+     "sv",
+     "tr",
+     "tl",
+     "bg",
+     "ro",
+     "ar",
+     "cs",
+     "el",
+     "fi",
+     "hr",
+     "ms",
+     "sk",
+     "da",
+     "ta",
+     "uk",
+     "hu",
+     "no",
+     "vi",
+     "bn",
+     "th",
+     "he",
+     "ka",
+     "id",
+     "te",
+     "gu",
+     "kn",
+     "ml",
+     "mr",
+     "or",
+     "pa",
+ ]
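Because `STTModels`, `STTLanguages`, and `STTEncoding` are `typing.Literal` aliases, a static type checker can flag unsupported values, while the `STTModels | str` unions used in `stt.py` still let newer server-side values pass through. A small illustration (the helper function is hypothetical; the checking behavior is standard `Literal` semantics, enforced by tools like mypy or pyright rather than at runtime):

    from livekit.plugins.cartesia.models import STTLanguages

    def set_language(lang: STTLanguages) -> STTLanguages:
        # Only the literal members ("en", "de", ...) type-check here
        return lang

    set_language("de")  # accepted
    set_language("xx")  # rejected by a type checker; Python itself won't complain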
@@ -0,0 +1,474 @@
+ # Copyright 2023 LiveKit, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import os
+ import uuid
+ import weakref
+ from dataclasses import dataclass
+ from enum import Enum
+
+ import aiohttp
+ import numpy as np
+
+ from livekit import rtc
+ from livekit.agents import (
+     DEFAULT_API_CONNECT_OPTIONS,
+     APIConnectOptions,
+     APIStatusError,
+     stt,
+     utils,
+ )
+ from livekit.agents.types import NOT_GIVEN, NotGivenOr
+ from livekit.agents.utils import is_given
+
+ from .log import logger
+ from .models import STTEncoding, STTLanguages, STTModels
+
+ API_AUTH_HEADER = "X-API-Key"
+ API_VERSION_HEADER = "Cartesia-Version"
+ API_VERSION = "2025-04-16"
+
+ # Audio energy threshold for speech detection
+ MAGIC_NUMBER_THRESHOLD = 0.004**2
+
+
+ class AudioEnergyFilter:
+     """Local voice activity detection based on audio energy levels."""
+
+     class State(Enum):
+         START = 0
+         SPEAKING = 1
+         SILENCE = 2
+         END = 3
+
+     def __init__(self, *, min_silence: float = 1.5, rms_threshold: float = MAGIC_NUMBER_THRESHOLD):
+         self._cooldown_seconds = min_silence
+         self._cooldown = min_silence
+         self._state = self.State.SILENCE
+         self._rms_threshold = rms_threshold
+
+     def update(self, frame: rtc.AudioFrame) -> State:
+         arr = np.frombuffer(frame.data, dtype=np.int16)
+         float_arr = arr.astype(np.float32) / 32768.0
+         rms = np.mean(np.square(float_arr))
+
+         if rms > self._rms_threshold:
+             self._cooldown = self._cooldown_seconds
+             if self._state in (self.State.SILENCE, self.State.END):
+                 self._state = self.State.START
+             else:
+                 self._state = self.State.SPEAKING
+         else:
+             if self._cooldown <= 0:
+                 if self._state in (self.State.SPEAKING, self.State.START):
+                     self._state = self.State.END
+                 elif self._state == self.State.END:
+                     self._state = self.State.SILENCE
+             else:
+                 # keep speaking during cooldown
+                 self._cooldown -= frame.duration
+                 self._state = self.State.SPEAKING
+
+         return self._state
+
+
+ @dataclass
+ class STTOptions:
+     model: STTModels | str
+     language: STTLanguages | str | None
+     encoding: STTEncoding
+     sample_rate: int
+     api_key: str
+     base_url: str
+     energy_filter: AudioEnergyFilter | bool
+
+     def get_http_url(self, path: str) -> str:
+         return f"{self.base_url}{path}"
+
+     def get_ws_url(self, path: str) -> str:
+         # If base_url already has a protocol, replace it, otherwise add wss://
+         if self.base_url.startswith(("http://", "https://")):
+             return f"{self.base_url.replace('http', 'ws', 1)}{path}"
+         else:
+             return f"wss://{self.base_url}{path}"
+
+
+ class STT(stt.STT):
+     def __init__(
+         self,
+         *,
+         model: STTModels | str = "ink-whisper",
+         language: STTLanguages | str = "en",
+         encoding: STTEncoding = "pcm_s16le",
+         sample_rate: int = 16000,
+         api_key: str | None = None,
+         http_session: aiohttp.ClientSession | None = None,
+         base_url: str = "https://api.cartesia.ai",
+         energy_filter: AudioEnergyFilter | bool = False,
+     ) -> None:
+         """
+         Create a new instance of Cartesia STT.
+
+         Args:
+             model: The Cartesia STT model to use. Defaults to "ink-whisper".
+             language: The language code for recognition. Defaults to "en".
+             encoding: The audio encoding format. Defaults to "pcm_s16le".
+             sample_rate: The sample rate of the audio in Hz. Defaults to 16000.
+             api_key: The Cartesia API key. If not provided, it will be read from
+                 the CARTESIA_API_KEY environment variable.
+             http_session: Optional aiohttp ClientSession to use for requests.
+             base_url: The base URL for the Cartesia API.
+                 Defaults to "https://api.cartesia.ai".
+             energy_filter: The energy filter to use for local voice activity
+                 detection. Defaults to False.
+
+         Raises:
+             ValueError: If no API key is provided or found in environment variables.
+         """
+         super().__init__(capabilities=stt.STTCapabilities(streaming=True, interim_results=False))
+
+         cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
+         if not cartesia_api_key:
+             raise ValueError("CARTESIA_API_KEY must be set")
+
+         self._opts = STTOptions(
+             model=model,
+             language=language,
+             encoding=encoding,
+             sample_rate=sample_rate,
+             api_key=cartesia_api_key,
+             base_url=base_url,
+             energy_filter=AudioEnergyFilter() if energy_filter is True else energy_filter,
+         )
+         self._session = http_session
+         self._streams = weakref.WeakSet[SpeechStream]()
+
+     def _ensure_session(self) -> aiohttp.ClientSession:
+         if not self._session:
+             self._session = utils.http_context.http_session()
+         return self._session
+
+     async def _recognize_impl(
+         self,
+         buffer: utils.AudioBuffer,
+         *,
+         language: NotGivenOr[str] = NOT_GIVEN,
+         conn_options: APIConnectOptions,
+     ) -> stt.SpeechEvent:
+         raise NotImplementedError(
+             "Cartesia STT does not support batch recognition, use stream() instead"
+         )
+
+     def stream(
+         self,
+         *,
+         language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
+         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+     ) -> SpeechStream:
+         """Create a streaming transcription session."""
+         config = self._sanitize_options(language=language)
+         stream = SpeechStream(
+             stt=self,
+             opts=config,
+             conn_options=conn_options,
+         )
+         self._streams.add(stream)
+         return stream
+
+     def update_options(
+         self,
+         *,
+         model: NotGivenOr[STTModels | str] = NOT_GIVEN,
+         language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
+     ) -> None:
+         """Update STT configuration options."""
+         if is_given(model):
+             self._opts.model = model
+         if is_given(language):
+             self._opts.language = language
+
+         # Update all active streams
+         for stream in self._streams:
+             stream.update_options(
+                 model=model,
+                 language=language,
+             )
+
+     def _sanitize_options(
+         self, *, language: NotGivenOr[STTLanguages | str] = NOT_GIVEN
+     ) -> STTOptions:
+         """Create a sanitized copy of options with language override if provided."""
+         config = STTOptions(
+             model=self._opts.model,
+             language=self._opts.language,
+             encoding=self._opts.encoding,
+             sample_rate=self._opts.sample_rate,
+             api_key=self._opts.api_key,
+             base_url=self._opts.base_url,
+             energy_filter=self._opts.energy_filter,
+         )
+
+         if is_given(language):
+             config.language = language
+
+         return config
+
+
+ class SpeechStream(stt.SpeechStream):
+     def __init__(
+         self,
+         *,
+         stt: STT,
+         opts: STTOptions,
+         conn_options: APIConnectOptions,
+     ) -> None:
+         super().__init__(stt=stt, conn_options=conn_options, sample_rate=opts.sample_rate)
+         self._opts = opts
+         self._session = stt._ensure_session()
+         self._request_id = str(uuid.uuid4())
+         self._reconnect_event = asyncio.Event()
+         self._speaking = False
+
+         # Set up audio energy filter for local VAD
+         self._audio_energy_filter: AudioEnergyFilter | None = None
+         if opts.energy_filter:
+             if isinstance(opts.energy_filter, AudioEnergyFilter):
+                 self._audio_energy_filter = opts.energy_filter
+             else:
+                 self._audio_energy_filter = AudioEnergyFilter()
+
+     def update_options(
+         self,
+         *,
+         model: NotGivenOr[STTModels | str] = NOT_GIVEN,
+         language: NotGivenOr[STTLanguages | str] = NOT_GIVEN,
+     ) -> None:
+         """Update streaming transcription options."""
+         if is_given(model):
+             self._opts.model = model
+         if is_given(language):
+             self._opts.language = language
+
+         self._reconnect_event.set()
+
+     def _check_energy_state(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
+         """Check the energy state of an audio frame for voice activity detection."""
+         if self._audio_energy_filter:
+             return self._audio_energy_filter.update(frame)
+         return AudioEnergyFilter.State.SPEAKING
+
+     async def _run(self) -> None:
+         """Main loop for streaming transcription."""
+         closing_ws = False
+
+         async def keepalive_task(ws: aiohttp.ClientWebSocketResponse) -> None:
+             try:
+                 while True:
+                     await ws.ping()
+                     await asyncio.sleep(30)
+             except Exception:
+                 return
+
+         @utils.log_exceptions(logger=logger)
+         async def send_task(ws: aiohttp.ClientWebSocketResponse) -> None:
+             nonlocal closing_ws
+
+             # Forward audio to Cartesia in chunks
+             samples_50ms = self._opts.sample_rate // 20
+             audio_bstream = utils.audio.AudioByteStream(
+                 sample_rate=self._opts.sample_rate,
+                 num_channels=1,
+                 samples_per_channel=samples_50ms,
+             )
+
+             has_ended = False
+             last_frame: rtc.AudioFrame | None = None
+             async for data in self._input_ch:
+                 frames: list[rtc.AudioFrame] = []
+                 if isinstance(data, rtc.AudioFrame):
+                     state = self._check_energy_state(data)
+                     if state in (
+                         AudioEnergyFilter.State.START,
+                         AudioEnergyFilter.State.SPEAKING,
+                     ):
+                         # Send buffered silence frame if we have one
+                         if last_frame:
+                             frames.extend(audio_bstream.write(last_frame.data.tobytes()))
+                             last_frame = None
+                         frames.extend(audio_bstream.write(data.data.tobytes()))
+
+                         # Emit START_OF_SPEECH event if we just started speaking
+                         if state == AudioEnergyFilter.State.START and not self._speaking:
+                             self._speaking = True
+                             start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
+                             self._event_ch.send_nowait(start_event)
+
+                     elif state == AudioEnergyFilter.State.END:
+                         # Flush remaining audio and mark as ended
+                         frames.extend(audio_bstream.flush())
+                         has_ended = True
+                     elif state == AudioEnergyFilter.State.SILENCE:
+                         # Buffer the last silence frame in case it contains speech beginning
+                         last_frame = data
+                 elif isinstance(data, self._FlushSentinel):
+                     frames.extend(audio_bstream.flush())
+                     has_ended = True
+
+                 for frame in frames:
+                     await ws.send_bytes(frame.data.tobytes())
+
+                 if has_ended:
+                     has_ended = False
+
+             closing_ws = True
+             await ws.send_str("finalize")
+
+         @utils.log_exceptions(logger=logger)
+         async def recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
+             nonlocal closing_ws
+             while True:
+                 msg = await ws.receive()
+                 if msg.type in (
+                     aiohttp.WSMsgType.CLOSED,
+                     aiohttp.WSMsgType.CLOSE,
+                     aiohttp.WSMsgType.CLOSING,
+                 ):
+                     if closing_ws or self._session.closed:
+                         return
+                     raise APIStatusError(message="Cartesia STT connection closed unexpectedly")
+
+                 if msg.type != aiohttp.WSMsgType.TEXT:
+                     logger.warning("unexpected Cartesia STT message type %s", msg.type)
+                     continue
+
+                 try:
+                     self._process_stream_event(json.loads(msg.data))
+                 except Exception:
+                     logger.exception("failed to process Cartesia STT message")
+
+         ws: aiohttp.ClientWebSocketResponse | None = None
+
+         while True:
+             try:
+                 ws = await self._connect_ws()
+                 tasks = [
+                     asyncio.create_task(send_task(ws)),
+                     asyncio.create_task(recv_task(ws)),
+                     asyncio.create_task(keepalive_task(ws)),
+                 ]
+                 tasks_group = asyncio.gather(*tasks)
+                 wait_reconnect_task = asyncio.create_task(self._reconnect_event.wait())
+
+                 try:
+                     done, _ = await asyncio.wait(
+                         (tasks_group, wait_reconnect_task),
+                         return_when=asyncio.FIRST_COMPLETED,
+                     )
+
+                     for task in done:
+                         if task != wait_reconnect_task:
+                             task.result()
+
+                     if wait_reconnect_task not in done:
+                         break
+
+                     self._reconnect_event.clear()
+                 finally:
+                     await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
+                     await tasks_group
+             finally:
+                 if ws is not None:
+                     await ws.close()
+
+     async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
+         """Connect to the Cartesia STT WebSocket."""
+         params = {
+             "model": self._opts.model,
+             "sample_rate": str(self._opts.sample_rate),
+             "encoding": self._opts.encoding,
+             "cartesia_version": API_VERSION,
+             "api_key": self._opts.api_key,
+         }
+
+         if self._opts.language:
+             params["language"] = self._opts.language
+
+         # Build URL
+         url = self._opts.get_ws_url("/stt/websocket")
+         query_string = "&".join(f"{k}={v}" for k, v in params.items())
+         ws_url = f"{url}?{query_string}"
+
+         ws = await asyncio.wait_for(
+             self._session.ws_connect(ws_url),
+             self._conn_options.timeout,
+         )
+         return ws
+
+     def _process_stream_event(self, data: dict) -> None:
+         """Process incoming WebSocket messages."""
+         message_type = data.get("type")
+
+         if message_type == "transcript":
+             request_id = data.get("request_id", self._request_id)
+             text = data.get("text", "")
+             is_final = data.get("is_final", False)
+             language = data.get("language", self._opts.language or "en")
+
+             if not text and not is_final:
+                 return
+
+             speech_data = stt.SpeechData(
+                 language=language,
+                 start_time=0,  # Cartesia doesn't provide word-level timestamps in this version
+                 end_time=data.get("duration", 0),
+                 confidence=data.get("probability", 1.0),
+                 text=text,
+             )
+
+             if is_final:
+                 event = stt.SpeechEvent(
+                     type=stt.SpeechEventType.FINAL_TRANSCRIPT,
+                     request_id=request_id,
+                     alternatives=[speech_data],
+                 )
+                 self._event_ch.send_nowait(event)
+
+                 if self._speaking:
+                     self._speaking = False
+                     end_event = stt.SpeechEvent(type=stt.SpeechEventType.END_OF_SPEECH)
+                     self._event_ch.send_nowait(end_event)
+             else:
+                 event = stt.SpeechEvent(
+                     type=stt.SpeechEventType.INTERIM_TRANSCRIPT,
+                     request_id=request_id,
+                     alternatives=[speech_data],
+                 )
+                 self._event_ch.send_nowait(event)
+
+         elif message_type == "flush_done":
+             logger.debug("Received flush_done acknowledgment from Cartesia STT")
+
+         elif message_type == "done":
+             logger.debug("Received done acknowledgment from Cartesia STT - session closing")
+
+         elif message_type == "error":
+             error_msg = data.get("message", "Unknown error")
+             logger.error("Cartesia STT error: %s", error_msg)
+             # We could emit an error event here if needed
+         else:
+             logger.warning("received unexpected message from Cartesia STT: %s", data)
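Two notes on the new module. First, `MAGIC_NUMBER_THRESHOLD = 0.004**2 = 1.6e-5` is compared against the mean of the squared samples (mean-square power, despite the `rms` variable name), so it corresponds to an RMS amplitude of 0.004 of full scale. Second, since `_recognize_impl` raises `NotImplementedError`, `stream()` is the only supported path; a hedged usage sketch (`frame_source` is a placeholder for any async iterator of 16 kHz mono `rtc.AudioFrame`s, not part of this diff):

    import asyncio

    from livekit.agents import stt as agents_stt
    from livekit.plugins.cartesia import STT

    async def transcribe(frame_source) -> None:
        cartesia_stt = STT(energy_filter=True)  # enable the local RMS-based VAD
        stream = cartesia_stt.stream()

        async def feed() -> None:
            async for frame in frame_source:
                stream.push_frame(frame)
            stream.end_input()

        feed_task = asyncio.create_task(feed())
        async for event in stream:
            if event.type == agents_stt.SpeechEventType.FINAL_TRANSCRIPT:
                print(event.alternatives[0].text)
        await feed_task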
@@ -19,8 +19,8 @@ import base64
  import json
  import os
  import weakref
- from dataclasses import dataclass
- from typing import Any
+ from dataclasses import dataclass, replace
+ from typing import Any, Optional, Union, cast

  import aiohttp

@@ -33,11 +33,7 @@ from livekit.agents import (
      tts,
      utils,
  )
- from livekit.agents.types import (
-     DEFAULT_API_CONNECT_OPTIONS,
-     NOT_GIVEN,
-     NotGivenOr,
- )
+ from livekit.agents.types import DEFAULT_API_CONNECT_OPTIONS, NOT_GIVEN, NotGivenOr
  from livekit.agents.utils import is_given

  from .log import logger
@@ -53,7 +49,6 @@ API_AUTH_HEADER = "X-API-Key"
  API_VERSION_HEADER = "Cartesia-Version"
  API_VERSION = "2024-06-10"

- NUM_CHANNELS = 1
  BUFFERED_WORDS_COUNT = 10


@@ -63,8 +58,8 @@ class _TTSOptions:
      encoding: TTSEncoding
      sample_rate: int
      voice: str | list[float]
-     speed: NotGivenOr[TTSVoiceSpeed | float]
-     emotion: NotGivenOr[list[TTSVoiceEmotion | str]]
+     speed: TTSVoiceSpeed | float | None
+     emotion: list[TTSVoiceEmotion | str] | None
      api_key: str
      language: str
      base_url: str
@@ -80,14 +75,14 @@ class TTS(tts.TTS):
      def __init__(
          self,
          *,
+         api_key: str | None = None,
          model: TTSModels | str = "sonic-2",
          language: str = "en",
          encoding: TTSEncoding = "pcm_s16le",
          voice: str | list[float] = TTSDefaultVoiceId,
-         speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
-         emotion: NotGivenOr[list[TTSVoiceEmotion | str]] = NOT_GIVEN,
+         speed: TTSVoiceSpeed | float | None = None,
+         emotion: list[TTSVoiceEmotion | str] | None = None,
          sample_rate: int = 24000,
-         api_key: NotGivenOr[str] = NOT_GIVEN,
          http_session: aiohttp.ClientSession | None = None,
          base_url: str = "https://api.cartesia.ai",
      ) -> None:
@@ -112,9 +107,9 @@ class TTS(tts.TTS):
          super().__init__(
              capabilities=tts.TTSCapabilities(streaming=True),
              sample_rate=sample_rate,
-             num_channels=NUM_CHANNELS,
+             num_channels=1,
          )
-         cartesia_api_key = api_key if is_given(api_key) else os.environ.get("CARTESIA_API_KEY")
+         cartesia_api_key = api_key or os.environ.get("CARTESIA_API_KEY")
          if not cartesia_api_key:
              raise ValueError("CARTESIA_API_KEY must be set")

@@ -138,14 +133,14 @@ class TTS(tts.TTS):
          )
          self._streams = weakref.WeakSet[SynthesizeStream]()

-     async def _connect_ws(self) -> aiohttp.ClientWebSocketResponse:
+     async def _connect_ws(self, timeout: float) -> aiohttp.ClientWebSocketResponse:
          session = self._ensure_session()
          url = self._opts.get_ws_url(
              f"/tts/websocket?api_key={self._opts.api_key}&cartesia_version={API_VERSION}"
          )
-         return await asyncio.wait_for(session.ws_connect(url), self._conn_options.timeout)
+         return await asyncio.wait_for(session.ws_connect(url), timeout)

-     async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse):
+     async def _close_ws(self, ws: aiohttp.ClientWebSocketResponse) -> None:
          await ws.close()

      def _ensure_session(self) -> aiohttp.ClientSession:
@@ -163,8 +158,8 @@ class TTS(tts.TTS):
          model: NotGivenOr[TTSModels | str] = NOT_GIVEN,
          language: NotGivenOr[str] = NOT_GIVEN,
          voice: NotGivenOr[str | list[float]] = NOT_GIVEN,
-         speed: NotGivenOr[TTSVoiceSpeed | float] = NOT_GIVEN,
-         emotion: NotGivenOr[list[TTSVoiceEmotion | str]] = NOT_GIVEN,
+         speed: NotGivenOr[TTSVoiceSpeed | float | None] = NOT_GIVEN,
+         emotion: NotGivenOr[list[TTSVoiceEmotion | str] | None] = NOT_GIVEN,
      ) -> None:
          """
          Update the Text-to-Speech (TTS) configuration options.
@@ -184,158 +179,123 @@ class TTS(tts.TTS):
          if is_given(language):
              self._opts.language = language
          if is_given(voice):
-             self._opts.voice = voice
+             self._opts.voice = cast(Union[str, list[float]], voice)
          if is_given(speed):
-             self._opts.speed = speed
+             self._opts.speed = cast(Optional[Union[TTSVoiceSpeed, float]], speed)
          if is_given(emotion):
              self._opts.emotion = emotion

      def synthesize(
-         self,
-         text: str,
-         *,
-         conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS,
+         self, text: str, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
      ) -> ChunkedStream:
-         return ChunkedStream(
-             tts=self,
-             input_text=text,
-             conn_options=conn_options,
-             opts=self._opts,
-             session=self._ensure_session(),
-         )
+         return ChunkedStream(tts=self, input_text=text, conn_options=conn_options)

      def stream(
          self, *, conn_options: APIConnectOptions = DEFAULT_API_CONNECT_OPTIONS
      ) -> SynthesizeStream:
-         return SynthesizeStream(
-             tts=self,
-             pool=self._pool,
-             opts=self._opts,
-         )
+         return SynthesizeStream(tts=self, conn_options=conn_options)

      async def aclose(self) -> None:
          for stream in list(self._streams):
              await stream.aclose()
+
          self._streams.clear()
          await self._pool.aclose()
-         await super().aclose()


  class ChunkedStream(tts.ChunkedStream):
      """Synthesize chunked text using the bytes endpoint"""

-     def __init__(
-         self,
-         *,
-         tts: TTS,
-         input_text: str,
-         opts: _TTSOptions,
-         session: aiohttp.ClientSession,
-         conn_options: APIConnectOptions,
-     ) -> None:
+     def __init__(self, *, tts: TTS, input_text: str, conn_options: APIConnectOptions) -> None:
          super().__init__(tts=tts, input_text=input_text, conn_options=conn_options)
-         self._opts, self._session = opts, session
-
-     async def _run(self) -> None:
-         request_id = utils.shortuuid()
-         bstream = utils.audio.AudioByteStream(
-             sample_rate=self._opts.sample_rate, num_channels=NUM_CHANNELS
-         )
+         self._tts: TTS = tts
+         self._opts = replace(tts._opts)

+     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
          json = _to_cartesia_options(self._opts)
          json["transcript"] = self._input_text

-         headers = {
-             API_AUTH_HEADER: self._opts.api_key,
-             API_VERSION_HEADER: API_VERSION,
-         }
-
          try:
-             async with self._session.post(
+             async with self._tts._ensure_session().post(
                  self._opts.get_http_url("/tts/bytes"),
-                 headers=headers,
+                 headers={
+                     API_AUTH_HEADER: self._opts.api_key,
+                     API_VERSION_HEADER: API_VERSION,
+                 },
                  json=json,
-                 timeout=aiohttp.ClientTimeout(
-                     total=30,
-                     sock_connect=self._conn_options.timeout,
-                 ),
+                 timeout=aiohttp.ClientTimeout(total=30, sock_connect=self._conn_options.timeout),
              ) as resp:
                  resp.raise_for_status()
-                 emitter = tts.SynthesizedAudioEmitter(
-                     event_ch=self._event_ch,
-                     request_id=request_id,
+
+                 output_emitter.initialize(
+                     request_id=utils.shortuuid(),
+                     sample_rate=self._opts.sample_rate,
+                     num_channels=1,
+                     mime_type="audio/pcm",
                  )
+
                  async for data, _ in resp.content.iter_chunks():
-                     for frame in bstream.write(data):
-                         emitter.push(frame)
+                     output_emitter.push(data)

-                 for frame in bstream.flush():
-                     emitter.push(frame)
-                 emitter.flush()
+                 output_emitter.flush()
          except asyncio.TimeoutError:
              raise APITimeoutError() from None
          except aiohttp.ClientResponseError as e:
              raise APIStatusError(
-                 message=e.message,
-                 status_code=e.status,
-                 request_id=None,
-                 body=None,
+                 message=e.message, status_code=e.status, request_id=None, body=None
              ) from None
          except Exception as e:
              raise APIConnectionError() from e


  class SynthesizeStream(tts.SynthesizeStream):
-     def __init__(
-         self,
-         *,
-         tts: TTS,
-         opts: _TTSOptions,
-         pool: utils.ConnectionPool[aiohttp.ClientWebSocketResponse],
-     ):
-         super().__init__(tts=tts)
-         self._opts, self._pool = opts, pool
+     def __init__(self, *, tts: TTS, conn_options: APIConnectOptions):
+         super().__init__(tts=tts, conn_options=conn_options)
+         self._tts: TTS = tts
          self._sent_tokenizer_stream = tokenize.basic.SentenceTokenizer(
              min_sentence_len=BUFFERED_WORDS_COUNT
          ).stream()
+         self._opts = replace(tts._opts)

-     async def _run(self) -> None:
+     async def _run(self, output_emitter: tts.AudioEmitter) -> None:
          request_id = utils.shortuuid()
+         output_emitter.initialize(
+             request_id=request_id,
+             sample_rate=self._opts.sample_rate,
+             num_channels=1,
+             mime_type="audio/pcm",
+             stream=True,
+         )

-         async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse):
+         async def _sentence_stream_task(ws: aiohttp.ClientWebSocketResponse) -> None:
+             context_id = utils.shortuuid()
              base_pkt = _to_cartesia_options(self._opts)
              async for ev in self._sent_tokenizer_stream:
                  token_pkt = base_pkt.copy()
-                 token_pkt["context_id"] = request_id
+                 token_pkt["context_id"] = context_id
                  token_pkt["transcript"] = ev.token + " "
                  token_pkt["continue"] = True
                  self._mark_started()
                  await ws.send_str(json.dumps(token_pkt))

              end_pkt = base_pkt.copy()
-             end_pkt["context_id"] = request_id
+             end_pkt["context_id"] = context_id
              end_pkt["transcript"] = " "
              end_pkt["continue"] = False
              await ws.send_str(json.dumps(end_pkt))

-         async def _input_task():
+         async def _input_task() -> None:
              async for data in self._input_ch:
                  if isinstance(data, self._FlushSentinel):
                      self._sent_tokenizer_stream.flush()
                      continue
+
                  self._sent_tokenizer_stream.push_text(data)
-             self._sent_tokenizer_stream.end_input()

-         async def _recv_task(ws: aiohttp.ClientWebSocketResponse):
-             audio_bstream = utils.audio.AudioByteStream(
-                 sample_rate=self._opts.sample_rate,
-                 num_channels=NUM_CHANNELS,
-             )
-             emitter = tts.SynthesizedAudioEmitter(
-                 event_ch=self._event_ch,
-                 request_id=request_id,
-             )
+             self._sent_tokenizer_stream.end_input()

+         async def _recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
+             current_segment_id: str | None = None
              while True:
                  msg = await ws.receive()
                  if msg.type in (
@@ -344,8 +304,7 @@ class SynthesizeStream(tts.SynthesizeStream):
                      aiohttp.WSMsgType.CLOSING,
                  ):
                      raise APIStatusError(
-                         "Cartesia connection closed unexpectedly",
-                         request_id=request_id,
+                         "Cartesia connection closed unexpectedly", request_id=request_id
                      )

                  if msg.type != aiohttp.WSMsgType.TEXT:
@@ -354,49 +313,54 @@ class SynthesizeStream(tts.SynthesizeStream):

                  data = json.loads(msg.data)
                  segment_id = data.get("context_id")
-                 emitter._segment_id = segment_id
-
+                 if current_segment_id is None:
+                     current_segment_id = segment_id
+                     output_emitter.start_segment(segment_id=segment_id)
                  if data.get("data"):
                      b64data = base64.b64decode(data["data"])
-                     for frame in audio_bstream.write(b64data):
-                         emitter.push(frame)
+                     output_emitter.push(b64data)
                  elif data.get("done"):
-                     for frame in audio_bstream.flush():
-                         emitter.push(frame)
-                     emitter.flush()
-                     if segment_id == request_id:
-                         # we're not going to receive more frames, end stream
-                         break
+                     output_emitter.end_input()
+                     break
                  else:
-                     logger.error("unexpected Cartesia message %s", data)
+                     logger.warning("unexpected message %s", data)

-         async with self._pool.connection() as ws:
-             tasks = [
-                 asyncio.create_task(_input_task()),
-                 asyncio.create_task(_sentence_stream_task(ws)),
-                 asyncio.create_task(_recv_task(ws)),
-             ]
-
-             try:
-                 await asyncio.gather(*tasks)
-             finally:
-                 await utils.aio.gracefully_cancel(*tasks)
+         try:
+             async with self._tts._pool.connection(timeout=self._conn_options.timeout) as ws:
+                 tasks = [
+                     asyncio.create_task(_input_task()),
+                     asyncio.create_task(_sentence_stream_task(ws)),
+                     asyncio.create_task(_recv_task(ws)),
+                 ]
+
+                 try:
+                     await asyncio.gather(*tasks)
+                 finally:
+                     await utils.aio.gracefully_cancel(*tasks)
+         except asyncio.TimeoutError:
+             raise APITimeoutError() from None
+         except aiohttp.ClientResponseError as e:
+             raise APIStatusError(
+                 message=e.message, status_code=e.status, request_id=None, body=None
+             ) from None
+         except Exception as e:
+             raise APIConnectionError() from e


  def _to_cartesia_options(opts: _TTSOptions) -> dict[str, Any]:
      voice: dict[str, Any] = {}
-     if is_given(opts.voice):
-         if isinstance(opts.voice, str):
-             voice["mode"] = "id"
-             voice["id"] = opts.voice
-         else:
-             voice["mode"] = "embedding"
-             voice["embedding"] = opts.voice
+     if isinstance(opts.voice, str):
+         voice["mode"] = "id"
+         voice["id"] = opts.voice
+     else:
+         voice["mode"] = "embedding"
+         voice["embedding"] = opts.voice

      voice_controls: dict = {}
-     if is_given(opts.speed):
+     if opts.speed:
          voice_controls["speed"] = opts.speed
-     if is_given(opts.emotion):
+
+     if opts.emotion:
          voice_controls["emotion"] = opts.emotion

      if voice_controls:
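One behavioral nuance of the move from `is_given(...)` to plain truthiness in `_to_cartesia_options`: a value that is provided but falsy is now omitted from `voice_controls`. A standalone sketch mirroring the logic above (whether a falsy value such as a numeric speed of `0.0` is ever meaningful to the API is not shown in this diff):

    def build_voice_controls(speed, emotion):
        # Mirrors the voice_controls construction in _to_cartesia_options
        voice_controls: dict = {}
        if speed:  # previously is_given(speed), which kept any explicitly passed value
            voice_controls["speed"] = speed
        if emotion:
            voice_controls["emotion"] = emotion
        return voice_controls

    assert build_voice_controls("fast", None) == {"speed": "fast"}
    assert build_voice_controls(0.0, []) == {}  # falsy values are now dropped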
@@ -12,4 +12,4 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

- __version__ = "1.0.22"
+ __version__ = "1.1.0"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: livekit-plugins-cartesia
- Version: 1.0.22
+ Version: 1.1.0
  Summary: LiveKit Agents Plugin for Cartesia
  Project-URL: Documentation, https://docs.livekit.io
  Project-URL: Website, https://livekit.io/
@@ -18,7 +18,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
  Classifier: Topic :: Multimedia :: Video
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
  Requires-Python: >=3.9.0
- Requires-Dist: livekit-agents>=1.0.22
+ Requires-Dist: livekit-agents>=1.1.0
  Description-Content-Type: text/markdown

  # Cartesia plugin for LiveKit Agents
@@ -0,0 +1,10 @@
+ livekit/plugins/cartesia/__init__.py,sha256=n8BvjZSpYiYFxOg3Hyh-UuyG7XeQw9uP48_OPDSBWdE,1259
+ livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
+ livekit/plugins/cartesia/models.py,sha256=TIJQa9gNKj_1t09XUjXN5hIrp6_xG1O7YZfVrr0KG4M,1530
+ livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ livekit/plugins/cartesia/stt.py,sha256=2GY2o90s-Vp0E8UX89maJsY6r0D-I225L8Etv714OJs,17211
+ livekit/plugins/cartesia/tts.py,sha256=gyTJIVmlA8HsWe51LCvSTLVKyO66eQZRGDZjQOOlU1E,14060
+ livekit/plugins/cartesia/version.py,sha256=7SjyflIFTjH0djSotKGIRoRykPCqMpVYetIlvHMFuh0,600
+ livekit_plugins_cartesia-1.1.0.dist-info/METADATA,sha256=FxSF1dGRP7fLTEOT27IXgY3Eu-3nbpTdt8JCoGdFsPg,1329
+ livekit_plugins_cartesia-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ livekit_plugins_cartesia-1.1.0.dist-info/RECORD,,
@@ -1,9 +0,0 @@
- livekit/plugins/cartesia/__init__.py,sha256=DFnl1khtyLstonZ6-FzIItl6ob9132SbZDLFRfremVs,1223
- livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
- livekit/plugins/cartesia/models.py,sha256=KGY-r2luJuUNY6a3nnB0Rx-5Td12hikk-GtYLnqvysE,977
- livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- livekit/plugins/cartesia/tts.py,sha256=g3RPmTGyMjL0sG6lS-1zaq4Pa1DO2DmKfAnFeJwnHtY,14445
- livekit/plugins/cartesia/version.py,sha256=-8dkOE2vDSF9WN8VoBrSwU2sb5YBGFuwPnSQXQ-uaYM,601
- livekit_plugins_cartesia-1.0.22.dist-info/METADATA,sha256=9qFxQqS_sHBnR1i30Qx17_Ura2azcO6W8RaWKSqTaIU,1331
- livekit_plugins_cartesia-1.0.22.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- livekit_plugins_cartesia-1.0.22.dist-info/RECORD,,