livekit-plugins-silero 0.3.dev0__tar.gz → 0.4.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/PKG-INFO +3 -3
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/__init__.py +1 -0
- livekit_plugins_silero-0.4.dev0/livekit/plugins/silero/log.py +3 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/vad.py +57 -45
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/version.py +1 -1
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/PKG-INFO +3 -3
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/SOURCES.txt +1 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/requires.txt +2 -2
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/setup.py +2 -2
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/README.md +0 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/py.typed +0 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/dependency_links.txt +0 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/top_level.txt +0 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/pyproject.toml +0 -0
- {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: livekit-plugins-silero
|
|
3
|
-
Version: 0.3.dev0
|
|
3
|
+
Version: 0.4.dev0
|
|
4
4
|
Summary: Agent Framework Plugin for Silero
|
|
5
5
|
Home-page: https://github.com/livekit/agents
|
|
6
6
|
License: Apache-2.0
|
|
@@ -20,8 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
21
|
Requires-Python: >=3.8.0
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
|
-
Requires-Dist: livekit~=0.
|
|
24
|
-
Requires-Dist: livekit-agents~=0.
|
|
23
|
+
Requires-Dist: livekit~=0.11
|
|
24
|
+
Requires-Dist: livekit-agents~=0.6.dev0
|
|
25
25
|
Requires-Dist: torch<3,>=2
|
|
26
26
|
Requires-Dist: torchaudio>=2
|
|
27
27
|
Requires-Dist: numpy<2,>=1
|
{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/vad.py
RENAMED
|
@@ -16,19 +16,19 @@ from __future__ import annotations
|
|
|
16
16
|
|
|
17
17
|
import asyncio
|
|
18
18
|
import contextlib
|
|
19
|
-
import logging
|
|
19
|
+
import time
|
|
20
20
|
from collections import deque
|
|
21
|
-
from typing import List, Optional
|
|
21
|
+
from typing import List
|
|
22
22
|
|
|
23
23
|
import numpy as np
|
|
24
24
|
import torch
|
|
25
25
|
from livekit import agents, rtc
|
|
26
26
|
|
|
27
|
+
from .log import logger
|
|
28
|
+
|
|
27
29
|
|
|
28
30
|
class VAD(agents.vad.VAD):
|
|
29
|
-
def __init__(
|
|
30
|
-
self, *, model_path: Optional[str] = None, use_onnx: bool = True
|
|
31
|
-
) -> None:
|
|
31
|
+
def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
|
|
32
32
|
if model_path:
|
|
33
33
|
model = torch.jit.load(model_path)
|
|
34
34
|
model.eval()
|
|
@@ -43,12 +43,12 @@ class VAD(agents.vad.VAD):
|
|
|
43
43
|
def stream(
|
|
44
44
|
self,
|
|
45
45
|
*,
|
|
46
|
-
min_speaking_duration: float = 0.
|
|
47
|
-
min_silence_duration: float = 0.
|
|
46
|
+
min_speaking_duration: float = 0.2,
|
|
47
|
+
min_silence_duration: float = 0.8,
|
|
48
48
|
padding_duration: float = 0.1,
|
|
49
49
|
sample_rate: int = 16000,
|
|
50
50
|
max_buffered_speech: float = 45.0,
|
|
51
|
-
threshold: float = 0.
|
|
51
|
+
threshold: float = 0.2,
|
|
52
52
|
) -> "VADStream":
|
|
53
53
|
return VADStream(
|
|
54
54
|
self._model,
|
|
@@ -93,6 +93,7 @@ class VADStream(agents.vad.VADStream):
|
|
|
93
93
|
self._waiting_start = False
|
|
94
94
|
self._waiting_end = False
|
|
95
95
|
self._current_sample = 0
|
|
96
|
+
self._filter = agents.utils.ExpFilter(0.8)
|
|
96
97
|
self._min_speaking_samples = min_speaking_duration * sample_rate
|
|
97
98
|
self._min_silence_samples = min_silence_duration * sample_rate
|
|
98
99
|
self._padding_duration_samples = padding_duration * sample_rate
|
|
@@ -103,12 +104,6 @@ class VADStream(agents.vad.VADStream):
|
|
|
103
104
|
self._buffered_frames: List[rtc.AudioFrame] = []
|
|
104
105
|
self._main_task = asyncio.create_task(self._run())
|
|
105
106
|
|
|
106
|
-
def log_exception(task: asyncio.Task) -> None:
|
|
107
|
-
if not task.cancelled() and task.exception():
|
|
108
|
-
logging.error(f"silero vad task failed: {task.exception()}")
|
|
109
|
-
|
|
110
|
-
self._main_task.add_done_callback(log_exception)
|
|
111
|
-
|
|
112
107
|
def push_frame(self, frame: rtc.AudioFrame) -> None:
|
|
113
108
|
if self._closed:
|
|
114
109
|
raise ValueError("cannot push frame to closed stream")
|
|
@@ -151,6 +146,9 @@ class VADStream(agents.vad.VADStream):
|
|
|
151
146
|
break
|
|
152
147
|
|
|
153
148
|
await asyncio.shield(self._run_inference())
|
|
149
|
+
|
|
150
|
+
except Exception:
|
|
151
|
+
logger.exception("silero stream failed")
|
|
154
152
|
finally:
|
|
155
153
|
self._event_queue.put_nowait(None)
|
|
156
154
|
|
|
@@ -169,13 +167,33 @@ class VADStream(agents.vad.VADStream):
|
|
|
169
167
|
tensor = tensor.to(torch.float32) / 32768.0
|
|
170
168
|
|
|
171
169
|
# run inference
|
|
172
|
-
|
|
170
|
+
start_time = time.time()
|
|
171
|
+
raw_prob = await asyncio.to_thread(
|
|
173
172
|
lambda: self._model(tensor, self._sample_rate).item()
|
|
174
173
|
)
|
|
175
|
-
self.
|
|
174
|
+
probability = self._filter.apply(1.0, raw_prob)
|
|
175
|
+
inference_duration = time.time() - start_time
|
|
176
|
+
|
|
177
|
+
# inference done
|
|
178
|
+
event = agents.vad.VADEvent(
|
|
179
|
+
type=agents.vad.VADEventType.INFERENCE_DONE,
|
|
180
|
+
samples_index=self._current_sample,
|
|
181
|
+
probability=probability,
|
|
182
|
+
raw_inference_prob=raw_prob,
|
|
183
|
+
inference_duration=inference_duration,
|
|
184
|
+
)
|
|
185
|
+
self._event_queue.put_nowait(event)
|
|
186
|
+
|
|
187
|
+
self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
|
|
176
188
|
self._current_sample += merged_frame.samples_per_channel
|
|
177
189
|
|
|
178
|
-
def _dispatch_event(
|
|
190
|
+
def _dispatch_event(
|
|
191
|
+
self,
|
|
192
|
+
original_frames: List[rtc.AudioFrame],
|
|
193
|
+
probability: float,
|
|
194
|
+
raw_inference_prob: float,
|
|
195
|
+
inference_duration: float,
|
|
196
|
+
):
|
|
179
197
|
"""
|
|
180
198
|
Dispatches a VAD event based on the speech probability and the options
|
|
181
199
|
Args:
|
|
@@ -203,15 +221,11 @@ class VADStream(agents.vad.VADStream):
|
|
|
203
221
|
int(self._min_speaking_samples // samples_10ms),
|
|
204
222
|
)
|
|
205
223
|
if len(self._buffered_frames) > max_buffer_len:
|
|
206
|
-
# if unaware of this, may be hard to debug, so logging seems ok here
|
|
207
|
-
logging.warning(
|
|
208
|
-
f"VAD buffer overflow, dropping {len(self._buffered_frames) - max_buffer_len} frames"
|
|
209
|
-
)
|
|
210
224
|
self._buffered_frames = self._buffered_frames[
|
|
211
225
|
len(self._buffered_frames) - max_buffer_len :
|
|
212
226
|
]
|
|
213
227
|
|
|
214
|
-
if
|
|
228
|
+
if probability >= self._threshold:
|
|
215
229
|
# speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
|
|
216
230
|
self._waiting_end = False
|
|
217
231
|
if not self._waiting_start and not self._speaking:
|
|
@@ -223,34 +237,31 @@ class VADStream(agents.vad.VADStream):
|
|
|
223
237
|
):
|
|
224
238
|
self._waiting_start = False
|
|
225
239
|
self._speaking = True
|
|
226
|
-
event = agents.vad.VADEvent(
|
|
227
|
-
type=agents.vad.VADEventType.START_OF_SPEECH,
|
|
228
|
-
samples_index=self._start_speech,
|
|
229
|
-
)
|
|
230
|
-
self._event_queue.put_nowait(event)
|
|
231
240
|
|
|
232
241
|
# since we're waiting for the min_spaking_duration to trigger START_OF_SPEECH,
|
|
233
|
-
# the
|
|
234
|
-
# TODO(theomonnom): Maybe it is better to put the data inside the START_OF_SPEECH event?
|
|
242
|
+
# put the speech that were used to trigger the start here
|
|
235
243
|
event = agents.vad.VADEvent(
|
|
236
|
-
type=agents.vad.VADEventType.
|
|
244
|
+
type=agents.vad.VADEventType.START_OF_SPEECH,
|
|
237
245
|
samples_index=self._start_speech,
|
|
238
|
-
|
|
246
|
+
frames=self._buffered_frames[padding_count:],
|
|
247
|
+
speaking=True,
|
|
239
248
|
)
|
|
249
|
+
self._event_queue.put_nowait(event)
|
|
240
250
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
+
# we don't check the speech_prob here
|
|
252
|
+
event = agents.vad.VADEvent(
|
|
253
|
+
type=agents.vad.VADEventType.INFERENCE_DONE,
|
|
254
|
+
samples_index=self._current_sample,
|
|
255
|
+
frames=original_frames,
|
|
256
|
+
probability=probability,
|
|
257
|
+
raw_inference_prob=raw_inference_prob,
|
|
258
|
+
inference_duration=inference_duration,
|
|
259
|
+
speaking=self._speaking,
|
|
260
|
+
)
|
|
261
|
+
self._event_queue.put_nowait(event)
|
|
251
262
|
|
|
252
|
-
if
|
|
253
|
-
# stopped speaking,
|
|
263
|
+
if probability < self._threshold:
|
|
264
|
+
# stopped speaking, s for min_silence_duration to trigger END_OF_SPEECH,
|
|
254
265
|
self._waiting_start = False
|
|
255
266
|
if not self._waiting_end and self._speaking:
|
|
256
267
|
self._waiting_end = True
|
|
@@ -265,9 +276,10 @@ class VADStream(agents.vad.VADStream):
|
|
|
265
276
|
event = agents.vad.VADEvent(
|
|
266
277
|
type=agents.vad.VADEventType.END_OF_SPEECH,
|
|
267
278
|
samples_index=self._end_speech,
|
|
268
|
-
duration=(self.
|
|
279
|
+
duration=(self._end_speech - self._start_speech)
|
|
269
280
|
/ self._sample_rate,
|
|
270
|
-
|
|
281
|
+
frames=self._buffered_frames,
|
|
282
|
+
speaking=False,
|
|
271
283
|
)
|
|
272
284
|
self._event_queue.put_nowait(event)
|
|
273
285
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: livekit-plugins-silero
|
|
3
|
-
Version: 0.3.dev0
|
|
3
|
+
Version: 0.4.dev0
|
|
4
4
|
Summary: Agent Framework Plugin for Silero
|
|
5
5
|
Home-page: https://github.com/livekit/agents
|
|
6
6
|
License: Apache-2.0
|
|
@@ -20,8 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
20
20
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
21
|
Requires-Python: >=3.8.0
|
|
22
22
|
Description-Content-Type: text/markdown
|
|
23
|
-
Requires-Dist: livekit~=0.
|
|
24
|
-
Requires-Dist: livekit-agents~=0.
|
|
23
|
+
Requires-Dist: livekit~=0.11
|
|
24
|
+
Requires-Dist: livekit-agents~=0.6.dev0
|
|
25
25
|
Requires-Dist: torch<3,>=2
|
|
26
26
|
Requires-Dist: torchaudio>=2
|
|
27
27
|
Requires-Dist: numpy<2,>=1
|
|
@@ -49,8 +49,8 @@ setuptools.setup(
|
|
|
49
49
|
packages=setuptools.find_namespace_packages(include=["livekit.*"]),
|
|
50
50
|
python_requires=">=3.8.0",
|
|
51
51
|
install_requires=[
|
|
52
|
-
"livekit ~= 0.
|
|
53
|
-
"livekit-agents~=0.
|
|
52
|
+
"livekit ~= 0.11",
|
|
53
|
+
"livekit-agents~=0.6.dev0",
|
|
54
54
|
"torch >= 2, < 3",
|
|
55
55
|
"torchaudio >= 2",
|
|
56
56
|
"numpy >= 1, < 2",
|
|
File without changes
|
{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/py.typed
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|