livekit-plugins-cartesia 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,14 +20,13 @@ import os
20
20
  import uuid
21
21
  import weakref
22
22
  from dataclasses import dataclass
23
- from enum import Enum
24
23
 
25
24
  import aiohttp
26
- import numpy as np
27
25
 
28
26
  from livekit import rtc
29
27
  from livekit.agents import (
30
28
  DEFAULT_API_CONNECT_OPTIONS,
29
+ APIConnectionError,
31
30
  APIConnectOptions,
32
31
  APIStatusError,
33
32
  stt,
@@ -43,49 +42,6 @@ API_AUTH_HEADER = "X-API-Key"
43
42
  API_VERSION_HEADER = "Cartesia-Version"
44
43
  API_VERSION = "2025-04-16"
45
44
 
46
- # Audio energy threshold for speech detection
47
- MAGIC_NUMBER_THRESHOLD = 0.004**2
48
-
49
-
50
- class AudioEnergyFilter:
51
- """Local voice activity detection based on audio energy levels."""
52
-
53
- class State(Enum):
54
- START = 0
55
- SPEAKING = 1
56
- SILENCE = 2
57
- END = 3
58
-
59
- def __init__(self, *, min_silence: float = 1.5, rms_threshold: float = MAGIC_NUMBER_THRESHOLD):
60
- self._cooldown_seconds = min_silence
61
- self._cooldown = min_silence
62
- self._state = self.State.SILENCE
63
- self._rms_threshold = rms_threshold
64
-
65
- def update(self, frame: rtc.AudioFrame) -> State:
66
- arr = np.frombuffer(frame.data, dtype=np.int16)
67
- float_arr = arr.astype(np.float32) / 32768.0
68
- rms = np.mean(np.square(float_arr))
69
-
70
- if rms > self._rms_threshold:
71
- self._cooldown = self._cooldown_seconds
72
- if self._state in (self.State.SILENCE, self.State.END):
73
- self._state = self.State.START
74
- else:
75
- self._state = self.State.SPEAKING
76
- else:
77
- if self._cooldown <= 0:
78
- if self._state in (self.State.SPEAKING, self.State.START):
79
- self._state = self.State.END
80
- elif self._state == self.State.END:
81
- self._state = self.State.SILENCE
82
- else:
83
- # keep speaking during cooldown
84
- self._cooldown -= frame.duration
85
- self._state = self.State.SPEAKING
86
-
87
- return self._state
88
-
89
45
 
90
46
  @dataclass
91
47
  class STTOptions:
@@ -95,7 +51,6 @@ class STTOptions:
95
51
  sample_rate: int
96
52
  api_key: str
97
53
  base_url: str
98
- energy_filter: AudioEnergyFilter | bool
99
54
 
100
55
  def get_http_url(self, path: str) -> str:
101
56
  return f"{self.base_url}{path}"
@@ -119,7 +74,6 @@ class STT(stt.STT):
119
74
  api_key: str | None = None,
120
75
  http_session: aiohttp.ClientSession | None = None,
121
76
  base_url: str = "https://api.cartesia.ai",
122
- energy_filter: AudioEnergyFilter | bool = False,
123
77
  ) -> None:
124
78
  """
125
79
  Create a new instance of Cartesia STT.
@@ -134,8 +88,6 @@ class STT(stt.STT):
134
88
  http_session: Optional aiohttp ClientSession to use for requests.
135
89
  base_url: The base URL for the Cartesia API.
136
90
  Defaults to "https://api.cartesia.ai".
137
- energy_filter: The energy filter to use for local voice activity
138
- detection. Defaults to False.
139
91
 
140
92
  Raises:
141
93
  ValueError: If no API key is provided or found in environment variables.
@@ -153,7 +105,6 @@ class STT(stt.STT):
153
105
  sample_rate=sample_rate,
154
106
  api_key=cartesia_api_key,
155
107
  base_url=base_url,
156
- energy_filter=AudioEnergyFilter() if energy_filter is True else energy_filter,
157
108
  )
158
109
  self._session = http_session
159
110
  self._streams = weakref.WeakSet[SpeechStream]()
@@ -220,7 +171,6 @@ class STT(stt.STT):
220
171
  sample_rate=self._opts.sample_rate,
221
172
  api_key=self._opts.api_key,
222
173
  base_url=self._opts.base_url,
223
- energy_filter=self._opts.energy_filter,
224
174
  )
225
175
 
226
176
  if is_given(language):
@@ -243,14 +193,7 @@ class SpeechStream(stt.SpeechStream):
243
193
  self._request_id = str(uuid.uuid4())
244
194
  self._reconnect_event = asyncio.Event()
245
195
  self._speaking = False
246
-
247
- # Set up audio energy filter for local VAD
248
- self._audio_energy_filter: AudioEnergyFilter | None = None
249
- if opts.energy_filter:
250
- if isinstance(opts.energy_filter, AudioEnergyFilter):
251
- self._audio_energy_filter = opts.energy_filter
252
- else:
253
- self._audio_energy_filter = AudioEnergyFilter()
196
+ self._speech_duration: float = 0
254
197
 
255
198
  def update_options(
256
199
  self,
@@ -266,12 +209,6 @@ class SpeechStream(stt.SpeechStream):
266
209
 
267
210
  self._reconnect_event.set()
268
211
 
269
- def _check_energy_state(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
270
- """Check the energy state of an audio frame for voice activity detection."""
271
- if self._audio_energy_filter:
272
- return self._audio_energy_filter.update(frame)
273
- return AudioEnergyFilter.State.SPEAKING
274
-
275
212
  async def _run(self) -> None:
276
213
  """Main loop for streaming transcription."""
277
214
  closing_ws = False
@@ -296,45 +233,17 @@ class SpeechStream(stt.SpeechStream):
296
233
  samples_per_channel=samples_50ms,
297
234
  )
298
235
 
299
- has_ended = False
300
- last_frame: rtc.AudioFrame | None = None
301
236
  async for data in self._input_ch:
302
237
  frames: list[rtc.AudioFrame] = []
303
238
  if isinstance(data, rtc.AudioFrame):
304
- state = self._check_energy_state(data)
305
- if state in (
306
- AudioEnergyFilter.State.START,
307
- AudioEnergyFilter.State.SPEAKING,
308
- ):
309
- # Send buffered silence frame if we have one
310
- if last_frame:
311
- frames.extend(audio_bstream.write(last_frame.data.tobytes()))
312
- last_frame = None
313
- frames.extend(audio_bstream.write(data.data.tobytes()))
314
-
315
- # Emit START_OF_SPEECH event if we just started speaking
316
- if state == AudioEnergyFilter.State.START and not self._speaking:
317
- self._speaking = True
318
- start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
319
- self._event_ch.send_nowait(start_event)
320
-
321
- elif state == AudioEnergyFilter.State.END:
322
- # Flush remaining audio and mark as ended
323
- frames.extend(audio_bstream.flush())
324
- has_ended = True
325
- elif state == AudioEnergyFilter.State.SILENCE:
326
- # Buffer the last silence frame in case it contains speech beginning
327
- last_frame = data
239
+ frames.extend(audio_bstream.write(data.data.tobytes()))
328
240
  elif isinstance(data, self._FlushSentinel):
329
241
  frames.extend(audio_bstream.flush())
330
- has_ended = True
331
242
 
332
243
  for frame in frames:
244
+ self._speech_duration += frame.duration
333
245
  await ws.send_bytes(frame.data.tobytes())
334
246
 
335
- if has_ended:
336
- has_ended = False
337
-
338
247
  closing_ws = True
339
248
  await ws.send_str("finalize")
340
249
 
@@ -390,7 +299,8 @@ class SpeechStream(stt.SpeechStream):
390
299
  self._reconnect_event.clear()
391
300
  finally:
392
301
  await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
393
- await tasks_group
302
+ tasks_group.cancel()
303
+ tasks_group.exception() # retrieve the exception
394
304
  finally:
395
305
  if ws is not None:
396
306
  await ws.close()
@@ -413,14 +323,17 @@ class SpeechStream(stt.SpeechStream):
413
323
  query_string = "&".join(f"{k}={v}" for k, v in params.items())
414
324
  ws_url = f"{url}?{query_string}"
415
325
 
416
- ws = await asyncio.wait_for(
417
- self._session.ws_connect(ws_url),
418
- self._conn_options.timeout,
419
- )
326
+ try:
327
+ ws = await asyncio.wait_for(
328
+ self._session.ws_connect(ws_url),
329
+ self._conn_options.timeout,
330
+ )
331
+ except (aiohttp.ClientConnectorError, asyncio.TimeoutError) as e:
332
+ raise APIConnectionError("failed to connect to cartesia") from e
420
333
  return ws
421
334
 
422
335
  def _process_stream_event(self, data: dict) -> None:
423
- """Process incoming WebSocket messages."""
336
+ """Process incoming WebSocket messages. See https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt"""
424
337
  message_type = data.get("type")
425
338
 
426
339
  if message_type == "transcript":
@@ -432,15 +345,35 @@ class SpeechStream(stt.SpeechStream):
432
345
  if not text and not is_final:
433
346
  return
434
347
 
348
+ # we don't have a super accurate way of detecting when speech started.
349
+ # this is typically the job of the VAD, but performing it here just in case something's
350
+ # relying on STT to perform this task.
351
+ if not self._speaking:
352
+ self._speaking = True
353
+ start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
354
+ self._event_ch.send_nowait(start_event)
355
+
435
356
  speech_data = stt.SpeechData(
436
357
  language=language,
437
358
  start_time=0, # Cartesia doesn't provide word-level timestamps in this version
438
- end_time=data.get("duration", 0),
359
+ end_time=data.get("duration", 0), # This is the duration transcribed so far
439
360
  confidence=data.get("probability", 1.0),
440
361
  text=text,
441
362
  )
442
363
 
443
364
  if is_final:
365
+ if self._speech_duration > 0:
366
+ self._event_ch.send_nowait(
367
+ stt.SpeechEvent(
368
+ type=stt.SpeechEventType.RECOGNITION_USAGE,
369
+ request_id=request_id,
370
+ recognition_usage=stt.RecognitionUsage(
371
+ audio_duration=self._speech_duration,
372
+ ),
373
+ )
374
+ )
375
+ self._speech_duration = 0
376
+
444
377
  event = stt.SpeechEvent(
445
378
  type=stt.SpeechEventType.FINAL_TRANSCRIPT,
446
379
  request_id=request_id,
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.1.0"
15
+ __version__ = "1.1.2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-cartesia
3
- Version: 1.1.0
3
+ Version: 1.1.2
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -18,7 +18,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
18
18
  Classifier: Topic :: Multimedia :: Video
19
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: >=3.9.0
21
- Requires-Dist: livekit-agents>=1.1.0
21
+ Requires-Dist: livekit-agents>=1.1.2
22
22
  Description-Content-Type: text/markdown
23
23
 
24
24
  # Cartesia plugin for LiveKit Agents
@@ -2,9 +2,9 @@ livekit/plugins/cartesia/__init__.py,sha256=n8BvjZSpYiYFxOg3Hyh-UuyG7XeQw9uP48_O
2
2
  livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
3
  livekit/plugins/cartesia/models.py,sha256=TIJQa9gNKj_1t09XUjXN5hIrp6_xG1O7YZfVrr0KG4M,1530
4
4
  livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/cartesia/stt.py,sha256=2GY2o90s-Vp0E8UX89maJsY6r0D-I225L8Etv714OJs,17211
5
+ livekit/plugins/cartesia/stt.py,sha256=9Y4DdSnjXlYnUYmxHWqWrbCkHt0JE6XeNTwfYbKRslM,14592
6
6
  livekit/plugins/cartesia/tts.py,sha256=gyTJIVmlA8HsWe51LCvSTLVKyO66eQZRGDZjQOOlU1E,14060
7
- livekit/plugins/cartesia/version.py,sha256=7SjyflIFTjH0djSotKGIRoRykPCqMpVYetIlvHMFuh0,600
8
- livekit_plugins_cartesia-1.1.0.dist-info/METADATA,sha256=FxSF1dGRP7fLTEOT27IXgY3Eu-3nbpTdt8JCoGdFsPg,1329
9
- livekit_plugins_cartesia-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- livekit_plugins_cartesia-1.1.0.dist-info/RECORD,,
7
+ livekit/plugins/cartesia/version.py,sha256=gqaIRup9hxsq6YNsBlKPmS5PL-B8yqSRTd8wRfj8zoQ,600
8
+ livekit_plugins_cartesia-1.1.2.dist-info/METADATA,sha256=s7MSItG25nTedPJGmQXS_pHnbbl1TIpRc4duOBkyWnw,1329
9
+ livekit_plugins_cartesia-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
+ livekit_plugins_cartesia-1.1.2.dist-info/RECORD,,