livekit-plugins-deepgram 1.1.1__py3-none-any.whl → 1.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- livekit/plugins/deepgram/__init__.py +2 -2
- livekit/plugins/deepgram/stt.py +13 -88
- livekit/plugins/deepgram/version.py +1 -1
- {livekit_plugins_deepgram-1.1.1.dist-info → livekit_plugins_deepgram-1.1.2.dist-info}/METADATA +2 -2
- livekit_plugins_deepgram-1.1.2.dist-info/RECORD +11 -0
- livekit_plugins_deepgram-1.1.1.dist-info/RECORD +0 -11
- {livekit_plugins_deepgram-1.1.1.dist-info → livekit_plugins_deepgram-1.1.2.dist-info}/WHEEL +0 -0
@@ -19,11 +19,11 @@ Support for speech-to-text with [Deepgram](https://deepgram.com/).
|
|
19
19
|
See https://docs.livekit.io/agents/integrations/stt/deepgram/ for more information.
|
20
20
|
"""
|
21
21
|
|
22
|
-
from .stt import STT, AudioEnergyFilter, SpeechStream
|
22
|
+
from .stt import STT, SpeechStream
|
23
23
|
from .tts import TTS
|
24
24
|
from .version import __version__
|
25
25
|
|
26
|
-
__all__ = ["STT", "SpeechStream", "AudioEnergyFilter", "__version__", "TTS"]
|
26
|
+
__all__ = ["STT", "SpeechStream", "__version__", "TTS"]
|
27
27
|
|
28
28
|
|
29
29
|
from livekit.agents import Plugin
|
livekit/plugins/deepgram/stt.py
CHANGED
@@ -20,11 +20,9 @@ import json
|
|
20
20
|
import os
|
21
21
|
import weakref
|
22
22
|
from dataclasses import dataclass
|
23
|
-
from enum import Enum
|
24
23
|
from typing import Any
|
25
24
|
|
26
25
|
import aiohttp
|
27
|
-
import numpy as np
|
28
26
|
|
29
27
|
from livekit import rtc
|
30
28
|
from livekit.agents import (
|
@@ -49,49 +47,6 @@ from .models import DeepgramLanguages, DeepgramModels
|
|
49
47
|
BASE_URL = "https://api.deepgram.com/v1/listen"
|
50
48
|
|
51
49
|
|
52
|
-
# This is the magic number during testing that we use to determine if a frame is loud enough
|
53
|
-
# to possibly contain speech. It's very conservative.
|
54
|
-
MAGIC_NUMBER_THRESHOLD = 0.004**2
|
55
|
-
|
56
|
-
|
57
|
-
class AudioEnergyFilter:
|
58
|
-
class State(Enum):
|
59
|
-
START = 0
|
60
|
-
SPEAKING = 1
|
61
|
-
SILENCE = 2
|
62
|
-
END = 3
|
63
|
-
|
64
|
-
def __init__(self, *, min_silence: float = 1.5, rms_threshold: float = MAGIC_NUMBER_THRESHOLD):
|
65
|
-
self._cooldown_seconds = min_silence
|
66
|
-
self._cooldown = min_silence
|
67
|
-
self._state = self.State.SILENCE
|
68
|
-
self._rms_threshold = rms_threshold
|
69
|
-
|
70
|
-
def update(self, frame: rtc.AudioFrame) -> State:
|
71
|
-
arr = np.frombuffer(frame.data, dtype=np.int16)
|
72
|
-
float_arr = arr.astype(np.float32) / 32768.0
|
73
|
-
rms = np.mean(np.square(float_arr))
|
74
|
-
|
75
|
-
if rms > self._rms_threshold:
|
76
|
-
self._cooldown = self._cooldown_seconds
|
77
|
-
if self._state in (self.State.SILENCE, self.State.END):
|
78
|
-
self._state = self.State.START
|
79
|
-
else:
|
80
|
-
self._state = self.State.SPEAKING
|
81
|
-
else:
|
82
|
-
if self._cooldown <= 0:
|
83
|
-
if self._state in (self.State.SPEAKING, self.State.START):
|
84
|
-
self._state = self.State.END
|
85
|
-
elif self._state == self.State.END:
|
86
|
-
self._state = self.State.SILENCE
|
87
|
-
else:
|
88
|
-
# keep speaking during cooldown
|
89
|
-
self._cooldown -= frame.duration
|
90
|
-
self._state = self.State.SPEAKING
|
91
|
-
|
92
|
-
return self._state
|
93
|
-
|
94
|
-
|
95
50
|
@dataclass
|
96
51
|
class STTOptions:
|
97
52
|
language: DeepgramLanguages | str | None
|
@@ -108,7 +63,6 @@ class STTOptions:
|
|
108
63
|
keywords: list[tuple[str, float]]
|
109
64
|
keyterms: list[str]
|
110
65
|
profanity_filter: bool
|
111
|
-
energy_filter: AudioEnergyFilter | bool = False
|
112
66
|
numerals: bool = False
|
113
67
|
mip_opt_out: bool = False
|
114
68
|
tags: NotGivenOr[list[str]] = NOT_GIVEN
|
@@ -136,7 +90,6 @@ class STT(stt.STT):
|
|
136
90
|
api_key: NotGivenOr[str] = NOT_GIVEN,
|
137
91
|
http_session: aiohttp.ClientSession | None = None,
|
138
92
|
base_url: str = BASE_URL,
|
139
|
-
energy_filter: AudioEnergyFilter | bool = False,
|
140
93
|
numerals: bool = False,
|
141
94
|
mip_opt_out: bool = False,
|
142
95
|
) -> None:
|
@@ -163,8 +116,6 @@ class STT(stt.STT):
|
|
163
116
|
api_key: Your Deepgram API key. If not provided, will look for DEEPGRAM_API_KEY environment variable.
|
164
117
|
http_session: Optional aiohttp ClientSession to use for requests.
|
165
118
|
base_url: The base URL for Deepgram API. Defaults to "https://api.deepgram.com/v1/listen".
|
166
|
-
energy_filter: Audio energy filter configuration for voice activity detection.
|
167
|
-
Can be a boolean or AudioEnergyFilter instance. Defaults to False.
|
168
119
|
numerals: Whether to include numerals in the transcription. Defaults to False.
|
169
120
|
mip_opt_out: Whether to take part in the model improvement program
|
170
121
|
|
@@ -204,7 +155,6 @@ class STT(stt.STT):
|
|
204
155
|
keywords=keywords if is_given(keywords) else [],
|
205
156
|
keyterms=keyterms if is_given(keyterms) else [],
|
206
157
|
profanity_filter=profanity_filter,
|
207
|
-
energy_filter=energy_filter,
|
208
158
|
numerals=numerals,
|
209
159
|
mip_opt_out=mip_opt_out,
|
210
160
|
tags=_validate_tags(tags) if is_given(tags) else [],
|
@@ -401,13 +351,6 @@ class SpeechStream(stt.SpeechStream):
|
|
401
351
|
duration=5.0,
|
402
352
|
)
|
403
353
|
|
404
|
-
self._audio_energy_filter: AudioEnergyFilter | None = None
|
405
|
-
if opts.energy_filter:
|
406
|
-
if isinstance(opts.energy_filter, AudioEnergyFilter):
|
407
|
-
self._audio_energy_filter = opts.energy_filter
|
408
|
-
else:
|
409
|
-
self._audio_energy_filter = AudioEnergyFilter()
|
410
|
-
|
411
354
|
self._request_id = ""
|
412
355
|
self._reconnect_event = asyncio.Event()
|
413
356
|
|
@@ -490,27 +433,10 @@ class SpeechStream(stt.SpeechStream):
|
|
490
433
|
)
|
491
434
|
|
492
435
|
has_ended = False
|
493
|
-
last_frame: rtc.AudioFrame | None = None
|
494
436
|
async for data in self._input_ch:
|
495
437
|
frames: list[rtc.AudioFrame] = []
|
496
438
|
if isinstance(data, rtc.AudioFrame):
|
497
|
-
|
498
|
-
if state in (
|
499
|
-
AudioEnergyFilter.State.START,
|
500
|
-
AudioEnergyFilter.State.SPEAKING,
|
501
|
-
):
|
502
|
-
if last_frame:
|
503
|
-
frames.extend(audio_bstream.write(last_frame.data.tobytes()))
|
504
|
-
last_frame = None
|
505
|
-
frames.extend(audio_bstream.write(data.data.tobytes()))
|
506
|
-
elif state == AudioEnergyFilter.State.END:
|
507
|
-
# no need to buffer as we have cooldown period
|
508
|
-
frames.extend(audio_bstream.flush())
|
509
|
-
has_ended = True
|
510
|
-
elif state == AudioEnergyFilter.State.SILENCE:
|
511
|
-
# buffer the last silence frame, since it could contain beginning of speech
|
512
|
-
# TODO: improve accuracy by using a ring buffer with longer window
|
513
|
-
last_frame = data
|
439
|
+
frames.extend(audio_bstream.write(data.data.tobytes()))
|
514
440
|
elif isinstance(data, self._FlushSentinel):
|
515
441
|
frames.extend(audio_bstream.flush())
|
516
442
|
has_ended = True
|
@@ -584,7 +510,8 @@ class SpeechStream(stt.SpeechStream):
|
|
584
510
|
self._reconnect_event.clear()
|
585
511
|
finally:
|
586
512
|
await utils.aio.gracefully_cancel(*tasks, wait_reconnect_task)
|
587
|
-
|
513
|
+
tasks_group.cancel()
|
514
|
+
tasks_group.exception() # retrieve the exception
|
588
515
|
finally:
|
589
516
|
if ws is not None:
|
590
517
|
await ws.close()
|
@@ -619,20 +546,18 @@ class SpeechStream(stt.SpeechStream):
|
|
619
546
|
if self._opts.tags:
|
620
547
|
live_config["tag"] = self._opts.tags
|
621
548
|
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
549
|
+
try:
|
550
|
+
ws = await asyncio.wait_for(
|
551
|
+
self._session.ws_connect(
|
552
|
+
_to_deepgram_url(live_config, base_url=self._base_url, websocket=True),
|
553
|
+
headers={"Authorization": f"Token {self._api_key}"},
|
554
|
+
),
|
555
|
+
self._conn_options.timeout,
|
556
|
+
)
|
557
|
+
except (aiohttp.ClientConnectorError, asyncio.TimeoutError) as e:
|
558
|
+
raise APIConnectionError("failed to connect to deepgram") from e
|
629
559
|
return ws
|
630
560
|
|
631
|
-
def _check_energy_state(self, frame: rtc.AudioFrame) -> AudioEnergyFilter.State:
|
632
|
-
if self._audio_energy_filter:
|
633
|
-
return self._audio_energy_filter.update(frame)
|
634
|
-
return AudioEnergyFilter.State.SPEAKING
|
635
|
-
|
636
561
|
def _on_audio_duration_report(self, duration: float) -> None:
|
637
562
|
usage_event = stt.SpeechEvent(
|
638
563
|
type=stt.SpeechEventType.RECOGNITION_USAGE,
|
{livekit_plugins_deepgram-1.1.1.dist-info → livekit_plugins_deepgram-1.1.2.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: livekit-plugins-deepgram
|
3
|
-
Version: 1.1.1
|
3
|
+
Version: 1.1.2
|
4
4
|
Summary: Agent Framework plugin for services using Deepgram's API.
|
5
5
|
Project-URL: Documentation, https://docs.livekit.io
|
6
6
|
Project-URL: Website, https://livekit.io/
|
@@ -18,7 +18,7 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
18
18
|
Classifier: Topic :: Multimedia :: Video
|
19
19
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
20
|
Requires-Python: >=3.9.0
|
21
|
-
Requires-Dist: livekit-agents[codecs]>=1.1.1
|
21
|
+
Requires-Dist: livekit-agents[codecs]>=1.1.2
|
22
22
|
Requires-Dist: numpy>=1.26
|
23
23
|
Description-Content-Type: text/markdown
|
24
24
|
|
@@ -0,0 +1,11 @@
|
|
1
|
+
livekit/plugins/deepgram/__init__.py,sha256=4DG4S7BVYsxVeak2SjWGCIIsnIbdbm3uC-vYg3obgZc,1326
|
2
|
+
livekit/plugins/deepgram/_utils.py,sha256=NgeR4qKZOeqs1wr8v4G2Q_KPZ5xUSFDE4f2N6WXnZH4,2041
|
3
|
+
livekit/plugins/deepgram/log.py,sha256=isjd2-ROJXiDFhRRnqRmYxv16U5H9dBV6ut2g5bU7q0,71
|
4
|
+
livekit/plugins/deepgram/models.py,sha256=dVguYc9AfjlexreN_O1C0NxX3q-ZK9k8s5B3hWsbtZ0,1236
|
5
|
+
livekit/plugins/deepgram/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
+
livekit/plugins/deepgram/stt.py,sha256=iIByvfTy_w3VRL40wu7fJKCsZQ2sVerWBItHKoUWnEs,28925
|
7
|
+
livekit/plugins/deepgram/tts.py,sha256=BDfzJ6PyHvixs0yNJPVthyAwoGxMcctVetR7v9k8bqg,11198
|
8
|
+
livekit/plugins/deepgram/version.py,sha256=gqaIRup9hxsq6YNsBlKPmS5PL-B8yqSRTd8wRfj8zoQ,600
|
9
|
+
livekit_plugins_deepgram-1.1.2.dist-info/METADATA,sha256=N9m8MMo29JRuuRezEz9v03hUF8PaXCFTbBnbcXAVFBo,1448
|
10
|
+
livekit_plugins_deepgram-1.1.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
11
|
+
livekit_plugins_deepgram-1.1.2.dist-info/RECORD,,
|
@@ -1,11 +0,0 @@
|
|
1
|
-
livekit/plugins/deepgram/__init__.py,sha256=GCCbVaiaD8Bw74F34l8Z1_yUCFDvgy6Tg4CnViUPJiI,1366
|
2
|
-
livekit/plugins/deepgram/_utils.py,sha256=NgeR4qKZOeqs1wr8v4G2Q_KPZ5xUSFDE4f2N6WXnZH4,2041
|
3
|
-
livekit/plugins/deepgram/log.py,sha256=isjd2-ROJXiDFhRRnqRmYxv16U5H9dBV6ut2g5bU7q0,71
|
4
|
-
livekit/plugins/deepgram/models.py,sha256=dVguYc9AfjlexreN_O1C0NxX3q-ZK9k8s5B3hWsbtZ0,1236
|
5
|
-
livekit/plugins/deepgram/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
|
-
livekit/plugins/deepgram/stt.py,sha256=w-T8J4hltd-hQUMfOuw54tMewhrakxJQ6IFWHrIyV4k,32102
|
7
|
-
livekit/plugins/deepgram/tts.py,sha256=BDfzJ6PyHvixs0yNJPVthyAwoGxMcctVetR7v9k8bqg,11198
|
8
|
-
livekit/plugins/deepgram/version.py,sha256=E83fn58yMTAPuXx54IpvCWS3c9SBNF9zNIhbism7Hz0,600
|
9
|
-
livekit_plugins_deepgram-1.1.1.dist-info/METADATA,sha256=V46B5xH4glA0N9ntz6bafe3DTnUJclBQ_L80fyDULTw,1448
|
10
|
-
livekit_plugins_deepgram-1.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
11
|
-
livekit_plugins_deepgram-1.1.1.dist-info/RECORD,,
|
File without changes
|