livekit-plugins-cartesia 1.1.0__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -20,14 +20,13 @@ import os
20
20
  import uuid
21
21
  import weakref
22
22
  from dataclasses import dataclass
23
- from enum import Enum
24
23
 
25
24
  import aiohttp
26
- import numpy as np
27
25
 
28
26
  from livekit import rtc
29
27
  from livekit.agents import (
30
28
  DEFAULT_API_CONNECT_OPTIONS,
29
+ APIConnectionError,
31
30
  APIConnectOptions,
32
31
  APIStatusError,
33
32
  stt,
@@ -43,49 +42,6 @@ API_AUTH_HEADER = "X-API-Key"
43
42
  API_VERSION_HEADER = "Cartesia-Version"
44
43
  API_VERSION = "2025-04-16"
45
44
 
46
- # Audio energy threshold for speech detection
47
- MAGIC_NUMBER_THRESHOLD = 0.004**2
48
-
49
-
50
- class AudioEnergyFilter:
51
- """Local voice activity detection based on audio energy levels."""
52
-
53
- class State(Enum):
54
- START = 0
55
- SPEAKING = 1
56
- SILENCE = 2
57
- END = 3
58
-
59
- def __init__(self, *, min_silence: float = 1.5, rms_threshold: float = MAGIC_NUMBER_THRESHOLD):
60
- self._cooldown_seconds = min_silence
61
- self._cooldown = min_silence
62
- self._state = self.State.SILENCE
63
- self._rms_threshold = rms_threshold
64
-
65
- def update(self, frame: rtc.AudioFrame) -> State:
66
- arr = np.frombuffer(frame.data, dtype=np.int16)
67
- float_arr = arr.astype(np.float32) / 32768.0
68
- rms = np.mean(np.square(float_arr))
69
-
70
- if rms > self._rms_threshold:
71
- self._cooldown = self._cooldown_seconds
72
- if self._state in (self.State.SILENCE, self.State.END):
73
- self._state = self.State.START
74
- else:
75
- self._state = self.State.SPEAKING
76
- else:
77
- if self._cooldown <= 0:
78
- if self._state in (self.State.SPEAKING, self.State.START):
79
- self._state = self.State.END
80
- elif self._state == self.State.END:
81
- self._state = self.State.SILENCE
82
- else:
83
- # keep speaking during cooldown
84
- self._cooldown -= frame.duration
85
- self._state = self.State.SPEAKING
86
-
87
- return self._state
88
-
89
45
 
90
46
  @dataclass
91
47
  class STTOptions:
@@ -95,7 +51,6 @@ class STTOptions:
95
51
  sample_rate: int
96
52
  api_key: str
97
53
  base_url: str
98
- energy_filter: AudioEnergyFilter | bool
99
54
 
100
55
  def get_http_url(self, path: str) -> str:
101
56
  return f"{self.base_url}{path}"
@@ -119,7 +74,6 @@ class STT(stt.STT):
119
74
  api_key: str | None = None,
120
75
  http_session: aiohttp.ClientSession | None = None,
121
76
  base_url: str = "https://api.cartesia.ai",
122
- energy_filter: AudioEnergyFilter | bool = False,
123
77
  ) -> None:
124
78
  """
125
79
  Create a new instance of Cartesia STT.
@@ -134,8 +88,6 @@ class STT(stt.STT):
134
88
  http_session: Optional aiohttp ClientSession to use for requests.
135
89
  base_url: The base URL for the Cartesia API.
136
90
  Defaults to "https://api.cartesia.ai".
137
- energy_filter: The energy filter to use for local voice activity
138
- detection. Defaults to False.
139
91
 
140
92
  Raises:
141
93
  ValueError: If no API key is provided or found in environment variables.
@@ -153,7 +105,6 @@ class STT(stt.STT):
153
105
  sample_rate=sample_rate,
154
106
  api_key=cartesia_api_key,
155
107
  base_url=base_url,
156
- energy_filter=AudioEnergyFilter() if energy_filter is True else energy_filter,
157
108
  )
158
109
  self._session = http_session
159
110
  self._streams = weakref.WeakSet[SpeechStream]()
@@ -220,7 +171,6 @@ class STT(stt.STT):
220
171
  sample_rate=self._opts.sample_rate,
221
172
  api_key=self._opts.api_key,
222
173
  base_url=self._opts.base_url,
223
- energy_filter=self._opts.energy_filter,
224
174
  )
225
175
 
226
176
  if is_given(language):
@@ -243,14 +193,7 @@ class SpeechStream(stt.SpeechStream):
243
193
  self._request_id = str(uuid.uuid4())
244
194
  self._reconnect_event = asyncio.Event()
245
195
  self._speaking = False
246
-
247
- # Set up audio energy filter for local VAD
248
- self._audio_energy_filter: AudioEnergyFilter | None = None
249
- if opts.energy_filter:
250
- if isinstance(opts.energy_filter, AudioEnergyFilter):
251
- self._audio_energy_filter = opts.energy_filter
252
- else:
253
- self._audio_energy_filter = AudioEnergyFilter()
196
+ self._speech_duration: float = 0
254
197
 
255
198
  def update_options(
256
199
  self,
@@ -266,12 +209,6 @@ class SpeechStream(stt.SpeechStream):
266
209
 
267
210
  self._reconnect_event.set()
268
211
 
269
- def _check_energy_state(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
270
- """Check the energy state of an audio frame for voice activity detection."""
271
- if self._audio_energy_filter:
272
- return self._audio_energy_filter.update(frame)
273
- return AudioEnergyFilter.State.SPEAKING
274
-
275
212
  async def _run(self) -> None:
276
213
  """Main loop for streaming transcription."""
277
214
  closing_ws = False
@@ -296,45 +233,17 @@ class SpeechStream(stt.SpeechStream):
296
233
  samples_per_channel=samples_50ms,
297
234
  )
298
235
 
299
- has_ended = False
300
- last_frame: rtc.AudioFrame | None = None
301
236
  async for data in self._input_ch:
302
237
  frames: list[rtc.AudioFrame] = []
303
238
  if isinstance(data, rtc.AudioFrame):
304
- state = self._check_energy_state(data)
305
- if state in (
306
- AudioEnergyFilter.State.START,
307
- AudioEnergyFilter.State.SPEAKING,
308
- ):
309
- # Send buffered silence frame if we have one
310
- if last_frame:
311
- frames.extend(audio_bstream.write(last_frame.data.tobytes()))
312
- last_frame = None
313
- frames.extend(audio_bstream.write(data.data.tobytes()))
314
-
315
- # Emit START_OF_SPEECH event if we just started speaking
316
- if state == AudioEnergyFilter.State.START and not self._speaking:
317
- self._speaking = True
318
- start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
319
- self._event_ch.send_nowait(start_event)
320
-
321
- elif state == AudioEnergyFilter.State.END:
322
- # Flush remaining audio and mark as ended
323
- frames.extend(audio_bstream.flush())
324
- has_ended = True
325
- elif state == AudioEnergyFilter.State.SILENCE:
326
- # Buffer the last silence frame in case it contains speech beginning
327
- last_frame = data
239
+ frames.extend(audio_bstream.write(data.data.tobytes()))
328
240
  elif isinstance(data, self._FlushSentinel):
329
241
  frames.extend(audio_bstream.flush())
330
- has_ended = True
331
242
 
332
243
  for frame in frames:
244
+ self._speech_duration += frame.duration
333
245
  await ws.send_bytes(frame.data.tobytes())
334
246
 
335
- if has_ended:
336
- has_ended = False
337
-
338
247
  closing_ws = True
339
248
  await ws.send_str("finalize")
340
249
 
@@ -390,7 +299,8 @@ class SpeechStream(stt.SpeechStream):
390
299
  self._reconnect_event.clear()
391
300
  finally:
392
301
  await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
393
- await tasks_group
302
+ tasks_group.cancel()
303
+ tasks_group.exception() # retrieve the exception
394
304
  finally:
395
305
  if ws is not None:
396
306
  await ws.close()
@@ -413,14 +323,17 @@ class SpeechStream(stt.SpeechStream):
413
323
  query_string = "&".join(f"{k}={v}" for k, v in params.items())
414
324
  ws_url = f"{url}?{query_string}"
415
325
 
416
- ws = await asyncio.wait_for(
417
- self._session.ws_connect(ws_url),
418
- self._conn_options.timeout,
419
- )
326
+ try:
327
+ ws = await asyncio.wait_for(
328
+ self._session.ws_connect(ws_url),
329
+ self._conn_options.timeout,
330
+ )
331
+ except (aiohttp.ClientConnectorError, asyncio.TimeoutError) as e:
332
+ raise APIConnectionError("failed to connect to cartesia") from e
420
333
  return ws
421
334
 
422
335
  def _process_stream_event(self, data: dict) -> None:
423
- """Process incoming WebSocket messages."""
336
+ """Process incoming WebSocket messages. See https://docs.cartesia.ai/2025-04-16/api-reference/stt/stt"""
424
337
  message_type = data.get("type")
425
338
 
426
339
  if message_type == "transcript":
@@ -432,15 +345,35 @@ class SpeechStream(stt.SpeechStream):
432
345
  if not text and not is_final:
433
346
  return
434
347
 
348
+ # we don't have a super accurate way of detecting when speech started.
349
+ # this is typically the job of the VAD, but performing it here just in case something's
350
+ # relying on STT to perform this task.
351
+ if not self._speaking:
352
+ self._speaking = True
353
+ start_event = stt.SpeechEvent(type=stt.SpeechEventType.START_OF_SPEECH)
354
+ self._event_ch.send_nowait(start_event)
355
+
435
356
  speech_data = stt.SpeechData(
436
357
  language=language,
437
358
  start_time=0, # Cartesia doesn't provide word-level timestamps in this version
438
- end_time=data.get("duration", 0),
359
+ end_time=data.get("duration", 0), # This is the duration transcribed so far
439
360
  confidence=data.get("probability", 1.0),
440
361
  text=text,
441
362
  )
442
363
 
443
364
  if is_final:
365
+ if self._speech_duration > 0:
366
+ self._event_ch.send_nowait(
367
+ stt.SpeechEvent(
368
+ type=stt.SpeechEventType.RECOGNITION_USAGE,
369
+ request_id=request_id,
370
+ recognition_usage=stt.RecognitionUsage(
371
+ audio_duration=self._speech_duration,
372
+ ),
373
+ )
374
+ )
375
+ self._speech_duration = 0
376
+
444
377
  event = stt.SpeechEvent(
445
378
  type=stt.SpeechEventType.FINAL_TRANSCRIPT,
446
379
  request_id=request_id,
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.1.0"
15
+ __version__ = "1.1.2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-cartesia
3
- Version: 1.1.0
3
+ Version: 1.1.2
4
4
  Summary: LiveKit Agents Plugin for Cartesia
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -18,7 +18,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
18
18
  Classifier: Topic :: Multimedia :: Video
19
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: >=3.9.0
21
- Requires-Dist: livekit-agents>=1.1.0
21
+ Requires-Dist: livekit-agents>=1.1.2
22
22
  Description-Content-Type: text/markdown
23
23
 
24
24
  # Cartesia plugin for LiveKit Agents
@@ -2,9 +2,9 @@ livekit/plugins/cartesia/__init__.py,sha256=n8BvjZSpYiYFxOg3Hyh-UuyG7XeQw9uP48_O
2
2
  livekit/plugins/cartesia/log.py,sha256=4Mnhjng_DU1dIWP9IWjIQGZ67EV3LnQhWMWCHVudJbo,71
3
3
  livekit/plugins/cartesia/models.py,sha256=TIJQa9gNKj_1t09XUjXN5hIrp6_xG1O7YZfVrr0KG4M,1530
4
4
  livekit/plugins/cartesia/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
- livekit/plugins/cartesia/stt.py,sha256=2GY2o90s-Vp0E8UX89maJsY6r0D-I225L8Etv714OJs,17211
5
+ livekit/plugins/cartesia/stt.py,sha256=9Y4DdSnjXlYnUYmxHWqWrbCkHt0JE6XeNTwfYbKRslM,14592
6
6
  livekit/plugins/cartesia/tts.py,sha256=gyTJIVmlA8HsWe51LCvSTLVKyO66eQZRGDZjQOOlU1E,14060
7
- livekit/plugins/cartesia/version.py,sha256=7SjyflIFTjH0djSotKGIRoRykPCqMpVYetIlvHMFuh0,600
8
- livekit_plugins_cartesia-1.1.0.dist-info/METADATA,sha256=FxSF1dGRP7fLTEOT27IXgY3Eu-3nbpTdt8JCoGdFsPg,1329
9
- livekit_plugins_cartesia-1.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
- livekit_plugins_cartesia-1.1.0.dist-info/RECORD,,
7
+ livekit/plugins/cartesia/version.py,sha256=gqaIRup9hxsq6YNsBlKPmS5PL-B8yqSRTd8wRfj8zoQ,600
8
+ livekit_plugins_cartesia-1.1.2.dist-info/METADATA,sha256=s7MSItG25nTedPJGmQXS_pHnbbl1TIpRc4duOBkyWnw,1329
9
+ livekit_plugins_cartesia-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
10
+ livekit_plugins_cartesia-1.1.2.dist-info/RECORD,,