livekit-plugins-deepgram 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -19,11 +19,11 @@ Support for speech-to-text with [Deepgram](https://deepgram.com/).
19
19
  See https://docs.livekit.io/agents/integrations/stt/deepgram/ for more information.
20
20
  """
21
21
 
22
- from .stt import STT, AudioEnergyFilter, SpeechStream
22
+ from .stt import STT, SpeechStream
23
23
  from .tts import TTS
24
24
  from .version import __version__
25
25
 
26
- __all__ = ["STT", "SpeechStream", "AudioEnergyFilter", "__version__", "TTS"]
26
+ __all__ = ["STT", "SpeechStream", "__version__", "TTS"]
27
27
 
28
28
 
29
29
  from livekit.agents import Plugin
@@ -20,11 +20,9 @@ import json
20
20
  import os
21
21
  import weakref
22
22
  from dataclasses import dataclass
23
- from enum import Enum
24
23
  from typing import Any
25
24
 
26
25
  import aiohttp
27
- import numpy as np
28
26
 
29
27
  from livekit import rtc
30
28
  from livekit.agents import (
@@ -49,49 +47,6 @@ from .models import DeepgramLanguages, DeepgramModels
49
47
  BASE_URL = "https://api.deepgram.com/v1/listen"
50
48
 
51
49
 
52
- # This is the magic number during testing that we use to determine if a frame is loud enough
53
- # to possibly contain speech. It's very conservative.
54
- MAGIC_NUMBER_THRESHOLD = 0.004**2
55
-
56
-
57
- class AudioEnergyFilter:
58
- class State(Enum):
59
- START = 0
60
- SPEAKING = 1
61
- SILENCE = 2
62
- END = 3
63
-
64
- def __init__(self, *, min_silence: float = 1.5, rms_threshold: float = MAGIC_NUMBER_THRESHOLD):
65
- self._cooldown_seconds = min_silence
66
- self._cooldown = min_silence
67
- self._state = self.State.SILENCE
68
- self._rms_threshold = rms_threshold
69
-
70
- def update(self, frame: rtc.AudioFrame) -> State:
71
- arr = np.frombuffer(frame.data, dtype=np.int16)
72
- float_arr = arr.astype(np.float32) / 32768.0
73
- rms = np.mean(np.square(float_arr))
74
-
75
- if rms > self._rms_threshold:
76
- self._cooldown = self._cooldown_seconds
77
- if self._state in (self.State.SILENCE, self.State.END):
78
- self._state = self.State.START
79
- else:
80
- self._state = self.State.SPEAKING
81
- else:
82
- if self._cooldown <= 0:
83
- if self._state in (self.State.SPEAKING, self.State.START):
84
- self._state = self.State.END
85
- elif self._state == self.State.END:
86
- self._state = self.State.SILENCE
87
- else:
88
- # keep speaking during cooldown
89
- self._cooldown -= frame.duration
90
- self._state = self.State.SPEAKING
91
-
92
- return self._state
93
-
94
-
95
50
  @dataclass
96
51
  class STTOptions:
97
52
  language: DeepgramLanguages | str | None
@@ -108,7 +63,6 @@ class STTOptions:
108
63
  keywords: list[tuple[str, float]]
109
64
  keyterms: list[str]
110
65
  profanity_filter: bool
111
- energy_filter: AudioEnergyFilter | bool = False
112
66
  numerals: bool = False
113
67
  mip_opt_out: bool = False
114
68
  tags: NotGivenOr[list[str]] = NOT_GIVEN
@@ -136,7 +90,6 @@ class STT(stt.STT):
136
90
  api_key: NotGivenOr[str] = NOT_GIVEN,
137
91
  http_session: aiohttp.ClientSession | None = None,
138
92
  base_url: str = BASE_URL,
139
- energy_filter: AudioEnergyFilter | bool = False,
140
93
  numerals: bool = False,
141
94
  mip_opt_out: bool = False,
142
95
  ) -> None:
@@ -163,8 +116,6 @@ class STT(stt.STT):
163
116
  api_key: Your Deepgram API key. If not provided, will look for DEEPGRAM_API_KEY environment variable.
164
117
  http_session: Optional aiohttp ClientSession to use for requests.
165
118
  base_url: The base URL for Deepgram API. Defaults to "https://api.deepgram.com/v1/listen".
166
- energy_filter: Audio energy filter configuration for voice activity detection.
167
- Can be a boolean or AudioEnergyFilter instance. Defaults to False.
168
119
  numerals: Whether to include numerals in the transcription. Defaults to False.
169
120
  mip_opt_out: Whether to take part in the model improvement program
170
121
 
@@ -204,7 +155,6 @@ class STT(stt.STT):
204
155
  keywords=keywords if is_given(keywords) else [],
205
156
  keyterms=keyterms if is_given(keyterms) else [],
206
157
  profanity_filter=profanity_filter,
207
- energy_filter=energy_filter,
208
158
  numerals=numerals,
209
159
  mip_opt_out=mip_opt_out,
210
160
  tags=_validate_tags(tags) if is_given(tags) else [],
@@ -401,13 +351,6 @@ class SpeechStream(stt.SpeechStream):
401
351
  duration=5.0,
402
352
  )
403
353
 
404
- self._audio_energy_filter: AudioEnergyFilter | None = None
405
- if opts.energy_filter:
406
- if isinstance(opts.energy_filter, AudioEnergyFilter):
407
- self._audio_energy_filter = opts.energy_filter
408
- else:
409
- self._audio_energy_filter = AudioEnergyFilter()
410
-
411
354
  self._request_id = ""
412
355
  self._reconnect_event = asyncio.Event()
413
356
 
@@ -490,27 +433,10 @@ class SpeechStream(stt.SpeechStream):
490
433
  )
491
434
 
492
435
  has_ended = False
493
- last_frame: rtc.AudioFrame | None = None
494
436
  async for data in self._input_ch:
495
437
  frames: list[rtc.AudioFrame] = []
496
438
  if isinstance(data, rtc.AudioFrame):
497
- state = self._check_energy_state(data)
498
- if state in (
499
- AudioEnergyFilter.State.START,
500
- AudioEnergyFilter.State.SPEAKING,
501
- ):
502
- if last_frame:
503
- frames.extend(audio_bstream.write(last_frame.data.tobytes()))
504
- last_frame = None
505
- frames.extend(audio_bstream.write(data.data.tobytes()))
506
- elif state == AudioEnergyFilter.State.END:
507
- # no need to buffer as we have cooldown period
508
- frames.extend(audio_bstream.flush())
509
- has_ended = True
510
- elif state == AudioEnergyFilter.State.SILENCE:
511
- # buffer the last silence frame, since it could contain beginning of speech
512
- # TODO: improve accuracy by using a ring buffer with longer window
513
- last_frame = data
439
+ frames.extend(audio_bstream.write(data.data.tobytes()))
514
440
  elif isinstance(data, self._FlushSentinel):
515
441
  frames.extend(audio_bstream.flush())
516
442
  has_ended = True
@@ -584,7 +510,8 @@ class SpeechStream(stt.SpeechStream):
584
510
  self._reconnect_event.clear()
585
511
  finally:
586
512
  await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
587
- await tasks_group
513
+ tasks_group.cancel()
514
+ tasks_group.exception() # retrieve the exception
588
515
  finally:
589
516
  if ws is not None:
590
517
  await ws.close()
@@ -619,20 +546,18 @@ class SpeechStream(stt.SpeechStream):
619
546
  if self._opts.tags:
620
547
  live_config["tag"] = self._opts.tags
621
548
 
622
- ws = await asyncio.wait_for(
623
- self._session.ws_connect(
624
- _to_deepgram_url(live_config, base_url=self._base_url, websocket=True),
625
- headers={"Authorization": f"Token {self._api_key}"},
626
- ),
627
- self._conn_options.timeout,
628
- )
549
+ try:
550
+ ws = await asyncio.wait_for(
551
+ self._session.ws_connect(
552
+ _to_deepgram_url(live_config, base_url=self._base_url, websocket=True),
553
+ headers={"Authorization": f"Token {self._api_key}"},
554
+ ),
555
+ self._conn_options.timeout,
556
+ )
557
+ except (aiohttp.ClientConnectorError, asyncio.TimeoutError) as e:
558
+ raise APIConnectionError("failed to connect to deepgram") from e
629
559
  return ws
630
560
 
631
- def _check_energy_state(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
632
- if self._audio_energy_filter:
633
- return self._audio_energy_filter.update(frame)
634
- return AudioEnergyFilter.State.SPEAKING
635
-
636
561
  def _on_audio_duration_report(self, duration: float) -> None:
637
562
  usage_event = stt.SpeechEvent(
638
563
  type=stt.SpeechEventType.RECOGNITION_USAGE,
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "1.1.1"
15
+ __version__ = "1.1.2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: livekit-plugins-deepgram
3
- Version: 1.1.1
3
+ Version: 1.1.2
4
4
  Summary: Agent Framework plugin for services using Deepgram's API.
5
5
  Project-URL: Documentation, https://docs.livekit.io
6
6
  Project-URL: Website, https://livekit.io/
@@ -18,7 +18,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
18
18
  Classifier: Topic :: Multimedia :: Video
19
19
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
20
  Requires-Python: >=3.9.0
21
- Requires-Dist: livekit-agents[codecs]>=1.1.1
21
+ Requires-Dist: livekit-agents[codecs]>=1.1.2
22
22
  Requires-Dist: numpy>=1.26
23
23
  Description-Content-Type: text/markdown
24
24
 
@@ -0,0 +1,11 @@
1
+ livekit/plugins/deepgram/__init__.py,sha256=4DG4S7BVYsxVeak2SjWGCIIsnIbdbm3uC-vYg3obgZc,1326
2
+ livekit/plugins/deepgram/_utils.py,sha256=NgeR4qKZOeqs1wr8v4G2Q_KPZ5xUSFDE4f2N6WXnZH4,2041
3
+ livekit/plugins/deepgram/log.py,sha256=isjd2-ROJXiDFhRRnqRmYxv16U5H9dBV6ut2g5bU7q0,71
4
+ livekit/plugins/deepgram/models.py,sha256=dVguYc9AfjlexreN_O1C0NxX3q-ZK9k8s5B3hWsbtZ0,1236
5
+ livekit/plugins/deepgram/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
+ livekit/plugins/deepgram/stt.py,sha256=iIByvfTy_w3VRL40wu7fJKCsZQ2sVerWBItHKoUWnEs,28925
7
+ livekit/plugins/deepgram/tts.py,sha256=BDfzJ6PyHvixs0yNJPVthyAwoGxMcctVetR7v9k8bqg,11198
8
+ livekit/plugins/deepgram/version.py,sha256=gqaIRup9hxsq6YNsBlKPmS5PL-B8yqSRTd8wRfj8zoQ,600
9
+ livekit_plugins_deepgram-1.1.2.dist-info/METADATA,sha256=N9m8MMo29JRuuRezEz9v03hUF8PaXCFTbBnbcXAVFBo,1448
10
+ livekit_plugins_deepgram-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
+ livekit_plugins_deepgram-1.1.2.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- livekit/plugins/deepgram/__init__.py,sha256=GCCbVaiaD8Bw74F34l8Z1_yUCFDvgy6Tg4CnViUPJiI,1366
2
- livekit/plugins/deepgram/_utils.py,sha256=NgeR4qKZOeqs1wr8v4G2Q_KPZ5xUSFDE4f2N6WXnZH4,2041
3
- livekit/plugins/deepgram/log.py,sha256=isjd2-ROJXiDFhRRnqRmYxv16U5H9dBV6ut2g5bU7q0,71
4
- livekit/plugins/deepgram/models.py,sha256=dVguYc9AfjlexreN_O1C0NxX3q-ZK9k8s5B3hWsbtZ0,1236
5
- livekit/plugins/deepgram/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
6
- livekit/plugins/deepgram/stt.py,sha256=w-T8J4hltd-hQUMfOuw54tMewhrakxJQ6IFWHrIyV4k,32102
7
- livekit/plugins/deepgram/tts.py,sha256=BDfzJ6PyHvixs0yNJPVthyAwoGxMcctVetR7v9k8bqg,11198
8
- livekit/plugins/deepgram/version.py,sha256=E83fn58yMTAPuXx54IpvCWS3c9SBNF9zNIhbism7Hz0,600
9
- livekit_plugins_deepgram-1.1.1.dist-info/METADATA,sha256=V46B5xH4glA0N9ntz6bafe3DTnUJclBQ_L80fyDULTw,1448
10
- livekit_plugins_deepgram-1.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
11
- livekit_plugins_deepgram-1.1.1.dist-info/RECORD,,