livekit-plugins-cartesia 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/cartesia/stt.py +35 -102
- livekit/plugins/cartesia/version.py +1 -1
- {livekit_plugins_cartesia-1.1.0.dist-info → livekit_plugins_cartesia-1.1.2.dist-info}/METADATA +2 -2
- {livekit_plugins_cartesia-1.1.0.dist-info → livekit_plugins_cartesia-1.1.2.dist-info}/RECORD +5 -5
- {livekit_plugins_cartesia-1.1.0.dist-info → livekit_plugins_cartesia-1.1.2.dist-info}/WHEEL +0 -0
livekit/plugins/cartesia/stt.py
CHANGED
@@ -20,14 +20,13 @@ import os
|
|
20
20
|
import uuid
|
21
21
|
import weakref
|
22
22
|
from dataclasses import dataclass
|
23
|
-
from enum import Enum
|
24
23
|
|
25
24
|
import aiohttp
|
26
|
-
import numpy as np
|
27
25
|
|
28
26
|
from livekit import rtc
|
29
27
|
from livekit.agents import (
|
30
28
|
DEFAULT_API_CONNECT_OPTIONS,
|
29
|
+
APIConnectionError,
|
31
30
|
APIConnectOptions,
|
32
31
|
APIStatusError,
|
33
32
|
stt,
|
@@ -43,49 +42,6 @@ API_AUTH_HEADER = "X-API-Key"
|
|
43
42
|
API_VERSION_HEADER = "Cartesia-Version"
|
44
43
|
API_VERSION = "2025-04-16"
|
45
44
|
|
46
|
-
# Audio energy threshold for speech detection
|
47
|
-
MAGIC_NUMBER_THRESHOLD = 0.004**2
|
48
|
-
|
49
|
-
|
50
|
-
class AudioEnergyFilter:
|
51
|
-
"""Local voice activity detection based on audio energy levels."""
|
52
|
-
|
53
|
-
class State(Enum):
|
54
|
-
START = 0
|
55
|
-
SPEAKING = 1
|
56
|
-
SILENCE = 2
|
57
|
-
END = 3
|
58
|
-
|
59
|
-
def __init__(self, *, min_silence: float = 1.5, rms_threshold: float = MAGIC_NUMBER_THRESHOLD):
|
60
|
-
self._cooldown_seconds = min_silence
|
61
|
-
self._cooldown = min_silence
|
62
|
-
self._state = self.State.SILENCE
|
63
|
-
self._rms_threshold = rms_threshold
|
64
|
-
|
65
|
-
def update(self, frame: rtc.AudioFrame) -> State:
|
66
|
-
arr = np.frombuffer(frame.data, dtype=np.int16)
|
67
|
-
float_arr = arr.astype(np.float32) / 32768.0
|
68
|
-
rms = np.mean(np.square(float_arr))
|
69
|
-
|
70
|
-
if rms > self._rms_threshold:
|
71
|
-
self._cooldown = self._cooldown_seconds
|
72
|
-
if self._state in (self.State.SILENCE, self.State.END):
|
73
|
-
self._state = self.State.START
|
74
|
-
else:
|
75
|
-
self._state = self.State.SPEAKING
|
76
|
-
else:
|
77
|
-
if self._cooldown <= 0:
|
78
|
-
if self._state in (self.State.SPEAKING, self.State.START):
|
79
|
-
self._state = self.State.END
|
80
|
-
elif self._state == self.State.END:
|
81
|
-
self._state = self.State.SILENCE
|
82
|
-
else:
|
83
|
-
# keep speaking during cooldown
|
84
|
-
self._cooldown -= frame.duration
|
85
|
-
self._state = self.State.SPEAKING
|
86
|
-
|
87
|
-
return self._state
|
88
|
-
|
89
45
|
|
90
46
|
@dataclass
|
91
47
|
class STTOptions:
|
@@ -95,7 +51,6 @@ class STTOptions:
|
|
95
51
|
sample_rate: int
|
96
52
|
api_key: str
|
97
53
|
base_url: str
|
98
|
-
energy_filter: AudioEnergyFilter | bool
|
99
54
|
|
100
55
|
def get_http_url(self, path: str) -> str:
|
101
56
|
return f"{self.base_url}{path}"
|
@@ -119,7 +74,6 @@ class STT(stt.STT):
|
|
119
74
|
api_key: str | None = None,
|
120
75
|
http_session: aiohttp.ClientSession | None = None,
|
121
76
|
base_url: str = "https://api.cartesia.ai",
|
122
|
-
energy_filter: AudioEnergyFilter | bool = False,
|
123
77
|
) -> None:
|
124
78
|
"""
|
125
79
|
Create a new instance of Cartesia STT.
|
@@ -134,8 +88,6 @@ class STT(stt.STT):
|
|
134
88
|
http_session: Optional aiohttp ClientSession to use for requests.
|
135
89
|
base_url: The base URL for the Cartesia API.
|
136
90
|
Defaults to "https://api.cartesia.ai".
|
137
|
-
energy_filter: The energy filter to use for local voice activity
|
138
|
-
detection. Defaults to False.
|
139
91
|
|
140
92
|
Raises:
|
141
93
|
ValueError: If no API key is provided or found in environment variables.
|
@@ -153,7 +105,6 @@ class STT(stt.STT):
|
|
153
105
|
sample_rate=sample_rate,
|
154
106
|
api_key=cartesia_api_key,
|
155
107
|
base_url=base_url,
|
156
|
-
energy_filter=AudioEnergyFilter() if energy_filter is True else energy_filter,
|
157
108
|
)
|
158
109
|
self._session = http_session
|
159
110
|
self._streams = weakref.WeakSet[SpeechStream]()
|
@@ -220,7 +171,6 @@ class STT(stt.STT):
|
|
220
171
|
sample_rate=self._opts.sample_rate,
|
221
172
|
api_key=self._opts.api_key,
|
222
173
|
base_url=self._opts.base_url,
|
223
|
-
energy_filter=self._opts.energy_filter,
|
224
174
|
)
|
225
175
|
|
226
176
|
if is_given(language):
|
@@ -243,14 +193,7 @@ class SpeechStream(stt.SpeechStream):
|
|
243
193
|
self._request_id = str(uuid.uuid4())
|
244
194
|
self._reconnect_event = asyncio.Event()
|
245
195
|
self._speaking = False
|
246
|
-
|
247
|
-
# Set up audio energy filter for local VAD
|
248
|
-
self._audio_energy_filter: AudioEnergyFilter | None = None
|
249
|
-
if opts.energy_filter:
|
250
|
-
if isinstance(opts.energy_filter, AudioEnergyFilter):
|
251
|
-
self._audio_energy_filter = opts.energy_filter
|
252
|
-
else:
|
253
|
-
self._audio_energy_filter = AudioEnergyFilter()
|
196
|
+
self._speech_duration: float = 0
|
254
197
|
|
255
198
|
def update_options(
|
256
199
|
self,
|
@@ -266,12 +209,6 @@ class SpeechStream(stt.SpeechStream):
|
|
266
209
|
|
267
210
|
self._reconnect_event.set()
|
268
211
|
|
269
|
-
def _check_energy_state(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
|
270
|
-
"""Check the energy state of an audio frame for voice activity detection."""
|
271
|
-
if self._audio_energy_filter:
|
272
|
-
return self._audio_energy_filter.update(frame)
|
273
|
-
return AudioEnergyFilter.State.SPEAKING
|
274
|
-
|
275
212
|
async def _run(self) -> None:
|
276
213
|
"""Main loop for streaming transcription."""
|
277
214
|
closing_ws = False
|
@@ -296,45 +233,17 @@ class SpeechStream(stt.SpeechStream):
|
|
296
233
|
samples_per_channel=samples_50ms,
|
297
234
|
)
|
298
235
|
|
299
|
-
has_ended = False
|
300
|
-
last_frame: rtc.AudioFrame | None = None
|
301
236
|
async for data in self._input_ch:
|
302
237
|
frames: list[rtc.AudioFrame] = []
|
303
238
|
if isinstance(data, rtc.AudioFrame):
|
304
|
-
|
305
|
-
if state in (
|
306
|
-
AudioEnergyFilter.State.START,
|
307
|
-
AudioEnergyFilter.State.SPEAKING,
|
308
|
-
):
|
309
|
-
# Send buffered silence frame if we have one
|
310
|
-
if last_frame:
|
311
|
-
frames.extend(audio_bstream.write(last_frame.data.tobytes()))
|
312
|
-
last_frame = None
|
313
|
-
frames.extend(audio_bstream.write(data.data.tobytes()))
|
314
|
-
|
315
|
-
# Emit START_OF_SPEECH event if we just started speaking
|
316
|
-
if state == AudioEnergyFilter.State.START and not self._speaking:
|
317
|
-
self._speaking = True
|
318
|
-
start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
|
319
|
-
self._event_ch.send_nowait(start_event)
|
320
|
-
|
321
|
-
elif state == AudioEnergyFilter.State.END:
|
322
|
-
# Flush remaining audio and mark as ended
|
323
|
-
frames.extend(audio_bstream.flush())
|
324
|
-
has_ended = True
|
325
|
-
elif state == AudioEnergyFilter.State.SILENCE:
|
326
|
-
# Buffer the last silence frame in case it contains speech beginning
|
327
|
-
last_frame = data
|
239
|
+
frames.extend(audio_bstream.write(data.data.tobytes()))
|
328
240
|
elif isinstance(data, self._FlushSentinel):
|
329
241
|
frames.extend(audio_bstream.flush())
|
330
|
-
has_ended = True
|
331
242
|
|
332
243
|
for frame in frames:
|
244
|
+
self._speech_duration += frame.duration
|
333
245
|
await ws.send_bytes(frame.data.tobytes())
|
334
246
|
|
335
|
-
if has_ended:
|
336
|
-
has_ended = False
|
337
|
-
|
338
247
|
closing_ws = True
|
339
248
|
await ws.send_str("finalize")
|
340
249
|
|
@@ -390,7 +299,8 @@ class SpeechStream(stt.SpeechStream):
|
|
390
299
|
self._reconnect_event.clear()
|
391
300
|
finally:
|
392
301
|
await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
|
393
|
-
|
302
|
+
tasks_group.cancel()
|
303
|
+
tasks_group.exception() # retrieve the exception
|
394
304
|
finally:
|
395
305
|
if ws is not None:
|
396
306
|
await ws.close()
|
@@ -413,14 +323,17 @@ class SpeechStream(stt.SpeechStream):
|
|
413
323
|
query_string = "&".join(f"{k}={v}" for k, v in params.items())
|
414
324
|
ws_url = f"{url}?{query_string}"
|
415
325
|
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
326
|
+
try:
|
327
|
+
ws = await asyncio.wait_for(
|
328
|
+
self._session.ws_connect(ws_url),
|
329
|
+
self._conn_options.timeout,
|
330
|
+
)
|
331
|
+
except (aiohttp.ClientConnectorError, asyncio.TimeoutError) as e:
|
332
|
+
raise APIConnectionError("failed to connect to cartesia") from e
|
420
333
|
return ws
|
421
334
|
|
422
335
|
def _process_stream_event(self, data: dict) -> None:
|
423
|
-
"""Process incoming WebSocket messages."""
|
336
|
+
"""Process incoming WebSocket messages. See https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt"""
|
424
337
|
message_type = data.get("type")
|
425
338
|
|
426
339
|
if message_type == "transcript":
|
@@ -432,15 +345,35 @@ class SpeechStream(stt.SpeechStream):
|
|
432
345
|
if not text and not is_final:
|
433
346
|
return
|
434
347
|
|
348
|
+
# we don't have a super accurate way of detecting when speech started.
|
349
|
+
# this is typically the job of the VAD, but perfoming it here just in case something's
|
350
|
+
# relying on STT to perform this task.
|
351
|
+
if not self._speaking:
|
352
|
+
self._speaking = True
|
353
|
+
start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
|
354
|
+
self._event_ch.send_nowait(start_event)
|
355
|
+
|
435
356
|
speech_data = stt.SpeechData(
|
436
357
|
language=language,
|
437
358
|
start_time=0, # Cartesia doesn't provide word-level timestamps in this version
|
438
|
-
end_time=data.get("duration", 0),
|
359
|
+
end_time=data.get("duration", 0), # This is the duration transcribed so far
|
439
360
|
confidence=data.get("probability", 1.0),
|
440
361
|
text=text,
|
441
362
|
)
|
442
363
|
|
443
364
|
if is_final:
|
365
|
+
if self._speech_duration > 0:
|
366
|
+
self._event_ch.send_nowait(
|
367
|
+
stt.SpeechEvent(
|
368
|
+
type=stt.SpeechEventType.RECOGNITION_USAGE,
|
369
|
+
request_id=request_id,
|
370
|
+
recognition_usage=stt.RecognitionUsage(
|
371
|
+
audio_duration=self._speech_duration,
|
372
|
+
),
|
373
|
+
)
|
374
|
+
)
|
375
|
+
self._speech_duration = 0
|
376
|
+
|
444
377
|
event = stt.SpeechEvent(
|
445
378
|
type=stt.SpeechEventType.FINAL_TRANSCRIPT,
|
446
379
|
request_id=request_id,
|
{livekit_plugins_cartesia-1.1.0.dist-info → livekit_plugins_cartesia-1.1.2.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: livekit-plugins-cartesia
|
3
|
-
Version: 1.1.0
|
3
|
+
Version: 1.1.2
|
4
4
|
Summary: LiveKit Agents Plugin for Cartesia
|
5
5
|
Project-URL: Documentation, https://docs.livekit.io
|
6
6
|
Project-URL: Website, https://livekit.io/
|
@@ -18,7 +18,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
18
18
|
Classifier: Topic :: Multimedia :: Video
|
19
19
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
20
|
Requires-Python: >=3.9.0
|
21
|
-
Requires-Dist: livekit-agents>=1.1.0
|
21
|
+
Requires-Dist: livekit-agents>=1.1.2
|
22
22
|
Description-Content-Type: text/markdown
|
23
23
|
|
24
24
|
# Cartesia plugin for LiveKit Agents
|
{livekit_plugins_cartesia-1.1.0.dist-info → livekit_plugins_cartesia-1.1.2.dist-info}/RECORD
RENAMED
@@ -2,9 +2,9 @@ livekit/plugins/cartesia/__init__.py,sha256=n8BvjZSpYiYFxOg3Hyh-UuyG7XeQw9uP48_O
|
|
2
2
|
livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
|
3
3
|
livekit/plugins/cartesia/models.py,sha256=TIJQa9gNKj_1t09XUjXN5hIrp6_xG1O7YZfVrr0KG4M,1530
|
4
4
|
livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
livekit/plugins/cartesia/stt.py,sha256=
|
5
|
+
livekit/plugins/cartesia/stt.py,sha256=9Y4DdSnjXlYnUYmxHWqWrbCkHt0JE6XeNTwfYbKRslM,14592
|
6
6
|
livekit/plugins/cartesia/tts.py,sha256=gyTJIVmlA8HsWe51LCvSTLVKyO66eQZRGDZjQOOlU1E,14060
|
7
|
-
livekit/plugins/cartesia/version.py,sha256=
|
8
|
-
livekit_plugins_cartesia-1.1.
|
9
|
-
livekit_plugins_cartesia-1.1.
|
10
|
-
livekit_plugins_cartesia-1.1.
|
7
|
+
livekit/plugins/cartesia/version.py,sha256=gqaIRup9hxsq6YNsBlKPmS5PL-B8yqSRTd8wRfj8zoQ,600
|
8
|
+
livekit_plugins_cartesia-1.1.2.dist-info/METADATA,sha256=s7MSItG25nTedPJGmQXS_pHnbbl1TIpRc4duOBkyWnw,1329
|
9
|
+
livekit_plugins_cartesia-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
10
|
+
livekit_plugins_cartesia-1.1.2.dist-info/RECORD,,
|
File without changes
|