livekit-plugins-silero 0.6.dev0__tar.gz → 0.6.0.dev2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/PKG-INFO +5 -9
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/__init__.py +1 -4
- livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/onnx_model.py +86 -0
- livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/resources/__init__.py +1 -0
- livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/resources/silero_vad.onnx +3 -0
- livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/vad.py +286 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/version.py +1 -1
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/PKG-INFO +5 -9
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/SOURCES.txt +3 -0
- livekit_plugins_silero-0.6.0.dev2/livekit_plugins_silero.egg-info/requires.txt +3 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/setup.py +6 -11
- livekit_plugins_silero-0.6.dev0/livekit/plugins/silero/vad.py +0 -291
- livekit_plugins_silero-0.6.dev0/livekit_plugins_silero.egg-info/requires.txt +0 -6
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/README.md +0 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/log.py +0 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/py.typed +0 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/dependency_links.txt +0 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/top_level.txt +0 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/pyproject.toml +0 -0
- {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: livekit-plugins-silero
|
|
3
|
-
Version: 0.6.dev0
|
|
3
|
+
Version: 0.6.0.dev2
|
|
4
4
|
Summary: Agent Framework Plugin for Silero
|
|
5
5
|
Home-page: https://github.com/livekit/agents
|
|
6
6
|
License: Apache-2.0
|
|
@@ -14,18 +14,14 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
14
14
|
Classifier: Topic :: Multimedia :: Video
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
|
-
Requires-Python: >=3.
|
|
20
|
+
Requires-Python: >=3.9.0
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
|
-
Requires-Dist: livekit~=0.
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist:
|
|
26
|
-
Requires-Dist: torchaudio>=2
|
|
27
|
-
Requires-Dist: numpy<2,>=1
|
|
28
|
-
Requires-Dist: onnxruntime~=1.17.0
|
|
22
|
+
Requires-Dist: livekit-agents~=0.7
|
|
23
|
+
Requires-Dist: onnxruntime>=1.18
|
|
24
|
+
Requires-Dist: numpy>=1.26
|
|
29
25
|
|
|
30
26
|
# LiveKit Plugins Silero
|
|
31
27
|
|
|
@@ -17,7 +17,6 @@ from .version import __version__
|
|
|
17
17
|
|
|
18
18
|
__all__ = ["VAD", "VADStream", "__version__"]
|
|
19
19
|
|
|
20
|
-
import torch
|
|
21
20
|
from livekit.agents import Plugin
|
|
22
21
|
|
|
23
22
|
|
|
@@ -26,9 +25,7 @@ class SileroPlugin(Plugin):
|
|
|
26
25
|
super().__init__(__name__, __version__, __package__)
|
|
27
26
|
|
|
28
27
|
def download_files(self):
|
|
29
|
-
|
|
30
|
-
repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=True
|
|
31
|
-
)
|
|
28
|
+
pass
|
|
32
29
|
|
|
33
30
|
|
|
34
31
|
Plugin.register_plugin(SileroPlugin())
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import atexit
|
|
2
|
+
import importlib.resources
|
|
3
|
+
from contextlib import ExitStack
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import onnxruntime # type: ignore
|
|
7
|
+
|
|
8
|
+
_resource_files = ExitStack()
|
|
9
|
+
atexit.register(_resource_files.close)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
SUPPORTED_SAMPLE_RATES = [8000, 16000]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def new_inference_session(force_cpu: bool) -> onnxruntime.InferenceSession:
    """Create an ONNX Runtime session for the bundled Silero VAD model.

    The model file ``silero_vad.onnx`` is looked up in the
    ``livekit.plugins.silero.resources`` package. Its filesystem path is
    obtained through ``importlib.resources.as_file`` and the resulting context
    is entered on the module-level ``_resource_files`` stack, which is closed
    at interpreter exit.

    Args:
        force_cpu: when true and ``CPUExecutionProvider`` is available, the
            session is restricted to that provider; otherwise the provider
            selection is left to ONNX Runtime.

    Returns:
        The session, configured with one inter-op and one intra-op thread and
        all graph optimizations enabled.
    """
    res = (
        importlib.resources.files("livekit.plugins.silero.resources")
        / "silero_vad.onnx"
    )
    ctx = importlib.resources.as_file(res)
    path = _resource_files.enter_context(ctx)

    opts = onnxruntime.SessionOptions()
    opts.inter_op_num_threads = 1
    opts.intra_op_num_threads = 1
    opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    if force_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers():
        # The keyword must be `sess_options`; a misspelt keyword would leave
        # the options above unused on the CPU path.
        session = onnxruntime.InferenceSession(
            str(path), providers=["CPUExecutionProvider"], sess_options=opts
        )
    else:
        session = onnxruntime.InferenceSession(str(path), sess_options=opts)

    return session
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class OnnxModel:
    """Stateful wrapper that runs one audio window at a time through a session.

    The instance keeps two arrays between calls: the recurrent ``state``
    returned by the previous run and a ``context`` made of the trailing
    samples of the previous model input. Both are zeroed by `reset_states`.
    """

    def __init__(
        self, *, onnx_session: onnxruntime.InferenceSession, sample_rate: int
    ) -> None:
        """Bind the wrapper to a session and pick sizes for the sample rate.

        Raises:
            ValueError: if ``sample_rate`` is not in SUPPORTED_SAMPLE_RATES.
        """
        self._sess = onnx_session
        self._sample_rate = sample_rate

        if sample_rate not in SUPPORTED_SAMPLE_RATES:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # (window size, context size), both in samples
        if sample_rate == 16000:
            self._window_size_samples, self._context_size = 512, 64
        elif sample_rate == 8000:
            self._window_size_samples, self._context_size = 256, 32

        self.reset_states()

    @property
    def window_size_samples(self) -> int:
        """Number of new samples expected by each call."""
        return self._window_size_samples

    @property
    def context_size(self) -> int:
        """Number of samples carried over from the previous input."""
        return self._context_size

    def reset_states(self) -> None:
        """Zero the recurrent state and the carried-over context."""
        self._state = np.zeros((2, 1, 128), dtype=np.float32)
        self._context = np.zeros((1, self._context_size), dtype=np.float32)

    def __call__(self, x: np.ndarray) -> float:
        """Run one window through the session and return its scalar output.

        A 1-D input is promoted to a single-row 2-D array. The stored context
        is prepended along axis 1 before the run; afterwards the state and
        context are replaced for the next call.

        Raises:
            ValueError: if the second dimension is not ``window_size_samples``.
        """
        batch = x if x.ndim != 1 else np.expand_dims(x, axis=0)

        if batch.shape[1] != self._window_size_samples:
            raise ValueError(
                f"Input shape must be (N, {self._window_size_samples}), got {batch.shape}"
            )

        model_input = np.concatenate([self._context, batch], axis=1)
        feeds = {
            "input": model_input,
            "state": self._state,
            "sr": np.array(self._sample_rate, dtype=np.int64),
        }
        out, self._state = self._sess.run(None, feeds)
        # keep the tail of what was just fed as context for the next window
        self._context = model_input[..., -self._context_size :]
        return out.item()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Used by importlib.resources and setuptools"""
|
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
# Copyright 2023 LiveKit, Inc.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import time
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
|
|
21
|
+
import numpy as np
|
|
22
|
+
from livekit import agents, rtc
|
|
23
|
+
from livekit.agents import utils
|
|
24
|
+
|
|
25
|
+
from . import onnx_model
|
|
26
|
+
from .log import logger
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
|
|
30
|
+
class _VADOptions:
|
|
31
|
+
min_speech_duration: float
|
|
32
|
+
min_silence_duration: float
|
|
33
|
+
padding_duration: float
|
|
34
|
+
max_buffered_speech: float
|
|
35
|
+
activation_threshold: float
|
|
36
|
+
sample_rate: int
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class VAD(agents.vad.VAD):
    """Silero voice activity detector backed by a shared ONNX session."""

    def __init__(
        self,
        *,
        min_speech_duration: float = 0.05,
        min_silence_duration: float = 0.1,
        padding_duration: float = 0.1,
        max_buffered_speech: float = 60.0,
        activation_threshold: float = 0.25,
        sample_rate: int = 16000,
        force_cpu: bool = True,
    ) -> None:
        """
        Initialize the Silero VAD with the given options.
        The options are already set to strong defaults.

        Args:
            min_speech_duration: minimum duration of speech to start a new speech chunk
            min_silence_duration: at the end of each speech, wait min_silence_duration before ending the speech
            padding_duration: pad the chunks with this duration on both sides
            max_buffered_speech: maximum duration of speech to keep in the buffer (in seconds)
            activation_threshold: threshold to consider a frame as speech
            sample_rate: sample rate for the inference (only 8KHz and 16KHz are supported)
            force_cpu: force to use CPU for inference

        Raises:
            ValueError: if sample_rate is not one of the supported rates
        """
        if sample_rate not in onnx_model.SUPPORTED_SAMPLE_RATES:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # one session per VAD; every stream created below reuses it
        self._onnx_session = onnx_model.new_inference_session(force_cpu)
        options = _VADOptions(
            min_speech_duration=min_speech_duration,
            min_silence_duration=min_silence_duration,
            padding_duration=padding_duration,
            max_buffered_speech=max_buffered_speech,
            activation_threshold=activation_threshold,
            sample_rate=sample_rate,
        )
        self._opts = options

    def stream(self) -> "VADStream":
        """Create a stream with its own model state over the shared session."""
        model = onnx_model.OnnxModel(
            onnx_session=self._onnx_session, sample_rate=self._opts.sample_rate
        )
        return VADStream(self._opts, model)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@dataclass
|
|
88
|
+
class _WindowData:
|
|
89
|
+
inference_data: np.ndarray
|
|
90
|
+
# data returned to the user are the original frames (int16)
|
|
91
|
+
original_data: np.ndarray
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class VADStream(agents.vad.VADStream):
    """Streaming Silero VAD.

    Incoming frames are cut into fixed-size windows by `_forward_input`, and
    `_run_inference` scores each window and emits START_OF_SPEECH,
    INFERENCE_DONE and END_OF_SPEECH events.
    """

    def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None:
        super().__init__()
        self._opts, self._model = opts, model
        # sample rate of the pushed frames; fixed by the first accepted frame
        self._original_sample_rate: int | None = None
        # window currently being filled
        self._window_data: _WindowData | None = None
        # inference samples still missing from the current window
        self._remaining_samples = model.window_size_samples

    @agents.utils.log_exceptions(logger=logger)
    async def _main_task(self):
        """Run the window producer and the inference consumer concurrently."""
        window_ch = utils.aio.Chan[_WindowData]()
        await asyncio.gather(
            self._run_inference(window_ch), self._forward_input(window_ch)
        )

    def _new_window(self, step: int) -> _WindowData:
        """Allocate an empty window; `step` is the original/inference rate ratio."""
        window_size = self._model.window_size_samples
        return _WindowData(
            inference_data=np.zeros(window_size, dtype=np.float32),
            original_data=np.zeros(window_size * step, dtype=np.int16),
        )

    async def _forward_input(self, window_tx: utils.aio.ChanSender[_WindowData]):
        """
        Split pushed frames into windows of the model's window size.

        Frames are decimated to the inference sample rate by keeping every
        `step`-th sample (no low-pass filtering is applied). Partial windows
        are buffered until enough samples arrive. Frames whose sample rate is
        not a whole multiple of the inference rate are logged and skipped.

        Raises:
            ValueError: if the frame sample rate changes mid-stream, or a
                frame is not mono.
        """
        model_rate = self._opts.sample_rate
        window_size = self._model.window_size_samples

        try:
            async for frame in self._input_ch:
                # The decimation step is derived from the configured inference
                # rate, so it is never zero (a zero step would make the slice
                # below raise) and always matches what the model is told.
                if (
                    frame.sample_rate < model_rate
                    or frame.sample_rate % model_rate != 0
                ):
                    logger.error(
                        "unsupported frame sample rate %d: it must be a multiple "
                        "of the inference sample rate %d",
                        frame.sample_rate,
                        model_rate,
                    )
                    continue

                if (
                    self._original_sample_rate is not None
                    and self._original_sample_rate != frame.sample_rate
                ):
                    raise ValueError("a frame with another sample rate was already pushed")

                if frame.num_channels != 1:
                    raise ValueError("vad currently only supports mono audio frames")

                self._original_sample_rate = frame.sample_rate
                step = frame.sample_rate // model_rate

                if self._window_data is None:
                    self._window_data = self._new_window(step)

                og_frame = np.frombuffer(frame.data, dtype=np.int16)
                # drop a trailing partial group so both views stay aligned
                og_frame = og_frame[: len(og_frame) - len(og_frame) % step]
                if_frame = og_frame[::step].astype(np.float32) / np.iinfo(np.int16).max

                total = len(if_frame)
                consumed = 0  # inference samples of this frame already copied
                while consumed < total:
                    i = window_size - self._remaining_samples
                    to_copy = min(total - consumed, self._remaining_samples)

                    # copy from the current read offset, not from the frame start
                    self._window_data.original_data[
                        i * step : (i + to_copy) * step
                    ] = og_frame[consumed * step : (consumed + to_copy) * step]
                    self._window_data.inference_data[i : i + to_copy] = if_frame[
                        consumed : consumed + to_copy
                    ]

                    consumed += to_copy
                    self._remaining_samples -= to_copy

                    if self._remaining_samples == 0:
                        window_tx.send_nowait(self._window_data)
                        self._window_data = self._new_window(step)
                        self._remaining_samples = window_size
        finally:
            # always close, so the consumer terminates even after an error here
            window_tx.close()

    async def _run_inference(self, window_rx: utils.aio.ChanReceiver[_WindowData]):
        """Score each window and publish VAD events on the event channel.

        `current_sample` and the `may_*_at_sample` marks are counted in
        inference-rate samples. A mark of -1 means "not pending".
        """
        pub_speaking = False
        pub_speech_duration = 0.0
        pub_silence_duration = 0.0
        pub_speech_buf = np.array([], dtype=np.int16)

        may_start_at_sample = -1
        may_end_at_sample = -1

        min_speech_samples = int(
            self._opts.min_speech_duration * self._opts.sample_rate
        )
        min_silence_samples = int(
            self._opts.min_silence_duration * self._opts.sample_rate
        )
        window_duration = self._model.window_size_samples / self._opts.sample_rate

        current_sample = 0

        async for window_data in window_rx:
            inference_data = window_data.inference_data
            start_time = time.time()
            # run the model in a worker thread to keep the event loop free
            raw_prob = await asyncio.to_thread(self._model, inference_data)
            inference_duration = time.time() - start_time

            if inference_duration > window_duration:
                # slower than realtime
                logger.warning(
                    "vad inference took too long - slower than realtime: %f",
                    inference_duration,
                )

            # append new data to current speech buffer
            pub_speech_buf = np.append(pub_speech_buf, window_data.original_data)
            max_data_s = self._opts.padding_duration
            if not pub_speaking:
                max_data_s += self._opts.min_speech_duration
            else:
                max_data_s += self._opts.max_buffered_speech

            assert self._original_sample_rate is not None
            # Convert to samples before truncating: int(seconds) would round
            # sub-second limits down to zero and disable trimming.
            max_samples = int(max_data_s * self._original_sample_rate)
            if len(pub_speech_buf) > max_samples:
                # explicit start index; a `-0:` slice would keep everything
                pub_speech_buf = pub_speech_buf[len(pub_speech_buf) - max_samples :]

            # dispatch start/end when needed
            if raw_prob >= self._opts.activation_threshold:
                may_end_at_sample = -1

                if may_start_at_sample == -1:
                    may_start_at_sample = current_sample + min_speech_samples

                if may_start_at_sample <= current_sample and not pub_speaking:
                    pub_speaking = True
                    self._event_ch.send_nowait(
                        agents.vad.VADEvent(
                            type=agents.vad.VADEventType.START_OF_SPEECH,
                            silence_duration=pub_silence_duration,
                            speech_duration=0.0,
                            samples_index=current_sample,
                            speaking=True,
                        )
                    )

                    pub_silence_duration = 0.0
                    pub_speech_duration += self._opts.min_speech_duration

            if pub_speaking:
                pub_speech_duration += window_duration
            else:
                # accumulate, so START_OF_SPEECH reports the preceding silence
                pub_silence_duration += window_duration

            self._event_ch.send_nowait(
                agents.vad.VADEvent(
                    type=agents.vad.VADEventType.INFERENCE_DONE,
                    samples_index=current_sample,
                    silence_duration=0.0,
                    speech_duration=pub_speech_duration,
                    probability=raw_prob,
                    inference_duration=inference_duration,
                    speaking=pub_speaking,
                )
            )

            if raw_prob < self._opts.activation_threshold:
                may_start_at_sample = -1

                if may_end_at_sample == -1:
                    may_end_at_sample = current_sample + min_silence_samples

                if may_end_at_sample <= current_sample and pub_speaking:
                    pub_speaking = False

                    frame = rtc.AudioFrame(
                        sample_rate=self._original_sample_rate,
                        num_channels=1,
                        samples_per_channel=len(pub_speech_buf),
                        data=pub_speech_buf.tobytes(),
                    )

                    self._event_ch.send_nowait(
                        agents.vad.VADEvent(
                            type=agents.vad.VADEventType.END_OF_SPEECH,
                            samples_index=current_sample,
                            silence_duration=0.0,
                            speech_duration=pub_speech_duration,
                            frames=[frame],
                            speaking=False,
                        )
                    )

                    pub_speech_buf = np.array([], dtype=np.int16)
                    pub_speech_duration = 0.0
                    pub_silence_duration += self._opts.min_silence_duration

            current_sample += self._model.window_size_samples
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: livekit-plugins-silero
|
|
3
|
-
Version: 0.6.dev0
|
|
3
|
+
Version: 0.6.0.dev2
|
|
4
4
|
Summary: Agent Framework Plugin for Silero
|
|
5
5
|
Home-page: https://github.com/livekit/agents
|
|
6
6
|
License: Apache-2.0
|
|
@@ -14,18 +14,14 @@ Classifier: Topic :: Multimedia :: Sound/Audio
|
|
|
14
14
|
Classifier: Topic :: Multimedia :: Video
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
16
|
Classifier: Programming Language :: Python :: 3
|
|
17
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.9
|
|
19
18
|
Classifier: Programming Language :: Python :: 3.10
|
|
20
19
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
|
-
Requires-Python: >=3.
|
|
20
|
+
Requires-Python: >=3.9.0
|
|
22
21
|
Description-Content-Type: text/markdown
|
|
23
|
-
Requires-Dist: livekit~=0.
|
|
24
|
-
Requires-Dist:
|
|
25
|
-
Requires-Dist:
|
|
26
|
-
Requires-Dist: torchaudio>=2
|
|
27
|
-
Requires-Dist: numpy<2,>=1
|
|
28
|
-
Requires-Dist: onnxruntime~=1.17.0
|
|
22
|
+
Requires-Dist: livekit-agents~=0.7
|
|
23
|
+
Requires-Dist: onnxruntime>=1.18
|
|
24
|
+
Requires-Dist: numpy>=1.26
|
|
29
25
|
|
|
30
26
|
# LiveKit Plugins Silero
|
|
31
27
|
|
|
@@ -3,9 +3,12 @@ pyproject.toml
|
|
|
3
3
|
setup.py
|
|
4
4
|
livekit/plugins/silero/__init__.py
|
|
5
5
|
livekit/plugins/silero/log.py
|
|
6
|
+
livekit/plugins/silero/onnx_model.py
|
|
6
7
|
livekit/plugins/silero/py.typed
|
|
7
8
|
livekit/plugins/silero/vad.py
|
|
8
9
|
livekit/plugins/silero/version.py
|
|
10
|
+
livekit/plugins/silero/resources/__init__.py
|
|
11
|
+
livekit/plugins/silero/resources/silero_vad.onnx
|
|
9
12
|
livekit_plugins_silero.egg-info/PKG-INFO
|
|
10
13
|
livekit_plugins_silero.egg-info/SOURCES.txt
|
|
11
14
|
livekit_plugins_silero.egg-info/dependency_links.txt
|
|
@@ -39,7 +39,6 @@ setuptools.setup(
|
|
|
39
39
|
"Topic :: Multimedia :: Video",
|
|
40
40
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
41
41
|
"Programming Language :: Python :: 3",
|
|
42
|
-
"Programming Language :: Python :: 3.8",
|
|
43
42
|
"Programming Language :: Python :: 3.9",
|
|
44
43
|
"Programming Language :: Python :: 3.10",
|
|
45
44
|
"Programming Language :: Python :: 3 :: Only",
|
|
@@ -47,16 +46,12 @@ setuptools.setup(
|
|
|
47
46
|
keywords=["webrtc", "realtime", "audio", "video", "livekit"],
|
|
48
47
|
license="Apache-2.0",
|
|
49
48
|
packages=setuptools.find_namespace_packages(include=["livekit.*"]),
|
|
50
|
-
python_requires=">=3.
|
|
51
|
-
install_requires=[
|
|
52
|
-
|
|
53
|
-
"livekit
|
|
54
|
-
"
|
|
55
|
-
|
|
56
|
-
"numpy >= 1, < 2",
|
|
57
|
-
"onnxruntime~=1.17.0",
|
|
58
|
-
],
|
|
59
|
-
package_data={"livekit.plugins.silero": ["files/silero_vad.jit"]},
|
|
49
|
+
python_requires=">=3.9.0",
|
|
50
|
+
install_requires=["livekit-agents~=0.7", "onnxruntime>=1.18", "numpy>=1.26"],
|
|
51
|
+
package_data={
|
|
52
|
+
"livekit.plugins.silero.resources": ["silero_vad.onnx"],
|
|
53
|
+
"livekit.plugins.silero": ["py.typed"],
|
|
54
|
+
},
|
|
60
55
|
project_urls={
|
|
61
56
|
"Documentation": "https://docs.livekit.io",
|
|
62
57
|
"Website": "https://livekit.io/",
|
|
@@ -1,291 +0,0 @@
|
|
|
1
|
-
# Copyright 2023 LiveKit, Inc.
|
|
2
|
-
#
|
|
3
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
-
# you may not use this file except in compliance with the License.
|
|
5
|
-
# You may obtain a copy of the License at
|
|
6
|
-
#
|
|
7
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
-
#
|
|
9
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
-
# See the License for the specific language governing permissions and
|
|
13
|
-
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
from __future__ import annotations
|
|
16
|
-
|
|
17
|
-
import asyncio
|
|
18
|
-
import contextlib
|
|
19
|
-
import time
|
|
20
|
-
from collections import deque
|
|
21
|
-
from typing import List, Optional
|
|
22
|
-
|
|
23
|
-
import numpy as np
|
|
24
|
-
import torch
|
|
25
|
-
from livekit import agents, rtc
|
|
26
|
-
|
|
27
|
-
from .log import logger
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class VAD(agents.vad.VAD):
    """Silero VAD that loads its model through torch."""

    def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
        """Load the model from `model_path`, or from torch hub when it is unset.

        Args:
            model_path: path of a TorchScript file passed to `torch.jit.load`;
                when falsy, `snakers4/silero-vad` is loaded via `torch.hub.load`
            use_onnx: forwarded as `onnx` to `torch.hub.load`; unused when
                `model_path` is given
        """
        if not model_path:
            model, _ = torch.hub.load(
                repo_or_dir="snakers4/silero-vad",
                model="silero_vad",
                onnx=use_onnx,
            )
        else:
            model = torch.jit.load(model_path)
            model.eval()
        self._model = model

    def stream(
        self,
        *,
        min_speaking_duration: float = 0.2,
        min_silence_duration: float = 0.8,
        padding_duration: float = 0.1,
        sample_rate: int = 16000,
        max_buffered_speech: float = 45.0,
        threshold: float = 0.2,
    ) -> "VADStream":
        """Create a stream over the loaded model with the given settings."""
        stream_options = dict(
            min_speaking_duration=min_speaking_duration,
            min_silence_duration=min_silence_duration,
            padding_duration=padding_duration,
            sample_rate=sample_rate,
            max_buffered_speech=max_buffered_speech,
            threshold=threshold,
        )
        return VADStream(self._model, **stream_options)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
# Based on https://github.com/snakers4/silero-vad/blob/94504ece54c8caeebb808410b08ae55ee82dba82/utils_vad.py#L428
|
|
65
|
-
class VADStream(agents.vad.VADStream):
    """Streaming VAD that runs the model on groups of four queued frames.

    Frames pushed with `push_frame` are resampled and queued; a background
    task created in `__init__` runs inference and fills an event queue that is
    consumed through `__anext__`.
    """

    def __init__(
        self,
        model,
        *,
        min_speaking_duration: float,
        min_silence_duration: float,
        padding_duration: float,
        sample_rate: int,
        max_buffered_speech: float,
        threshold: float,
    ) -> None:
        """Store the settings and start the processing task.

        Raises:
            ValueError: if `sample_rate` is neither 8000 nor 16000.
        """
        self._min_speaking_duration = min_speaking_duration
        self._min_silence_duration = min_silence_duration
        self._padding_duration = padding_duration
        self._sample_rate = sample_rate
        self._max_buffered_speech = max_buffered_speech
        self._threshold = threshold

        if sample_rate not in [8000, 16000]:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # None is the end-of-stream sentinel on both queues
        self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
        self._event_queue = asyncio.Queue[Optional[agents.vad.VADEvent]]()
        self._model = model

        self._closed = False
        self._speaking = False
        self._waiting_start = False
        self._waiting_end = False
        self._current_sample = 0
        # sample indices of the pending speech start/end; defined up front so
        # they are never read before assignment
        self._start_speech = 0
        self._end_speech = 0
        self._filter = agents.utils.ExpFilter(0.8)
        self._min_speaking_samples = min_speaking_duration * sample_rate
        self._min_silence_samples = min_silence_duration * sample_rate
        self._padding_duration_samples = padding_duration * sample_rate
        self._max_buffered_samples = max_buffered_speech * sample_rate

        # resampled frames awaiting inference, and their originals, kept in step
        self._queued_frames: deque[rtc.AudioFrame] = deque()
        self._original_frames: deque[rtc.AudioFrame] = deque()
        self._buffered_frames: List[rtc.AudioFrame] = []
        self._main_task = asyncio.create_task(self._run())

    def push_frame(self, frame: rtc.AudioFrame) -> None:
        """Queue a frame for processing.

        Raises:
            ValueError: if the stream was already closed.
        """
        if self._closed:
            raise ValueError("cannot push frame to closed stream")

        self._queue.put_nowait(frame)

    async def aclose(self, *, wait: bool = True) -> None:
        """Close the stream; with `wait=False` the processing task is cancelled."""
        self._closed = True
        if not wait:
            self._main_task.cancel()

        self._queue.put_nowait(None)
        with contextlib.suppress(asyncio.CancelledError):
            await self._main_task

    async def _run(self):
        """Consume queued frames until the sentinel and run inference on them."""
        samples_40ms = self._sample_rate // 1000 * 40
        try:
            while True:
                frame = await self._queue.get()
                if frame is None:
                    break  # None is sent inside aclose

                self._queue.task_done()

                # resample to silero's sample rate
                resampled_frame = frame.remix_and_resample(
                    self._sample_rate, 1
                )  # TODO: This is technically wrong, fix when we have a better resampler
                self._original_frames.append(frame)
                self._queued_frames.append(resampled_frame)

                # Run inference by chunks of 40ms until we run out of data.
                # _run_inference consumes exactly four frames and returns
                # without consuming (or suspending) when fewer are queued, so
                # the frame count must be part of the loop condition;
                # otherwise this loop could spin forever without yielding.
                while len(self._queued_frames) >= 4:
                    available_length = sum(
                        f.samples_per_channel for f in self._queued_frames
                    )
                    if available_length < samples_40ms:
                        break

                    await asyncio.shield(self._run_inference())

        except Exception:
            logger.exception("silero stream failed")
        finally:
            self._event_queue.put_nowait(None)

    async def _run_inference(self) -> None:
        """Merge the next four queued frames, score them and emit events."""
        # merge the first 4 frames (assumed to be 10ms each)
        if len(self._queued_frames) < 4:
            return

        original_frames = [self._original_frames.popleft() for _ in range(4)]
        merged_frame = agents.utils.merge_frames(
            [self._queued_frames.popleft() for _ in range(4)]
        )

        # convert the merged int16 data to a float32 tensor
        tensor = torch.from_numpy(np.frombuffer(merged_frame.data, dtype=np.int16))
        tensor = tensor.to(torch.float32) / 32768.0

        # run inference in a worker thread
        start_time = time.time()
        raw_prob = await asyncio.to_thread(
            lambda: self._model(tensor, self._sample_rate).item()
        )
        probability = self._filter.apply(1.0, raw_prob)
        inference_duration = time.time() - start_time

        # inference done
        # NOTE(review): _dispatch_event below emits a second INFERENCE_DONE for
        # the same inference - confirm whether consumers rely on both.
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            probability=probability,
            raw_inference_prob=raw_prob,
            inference_duration=inference_duration,
        )
        self._event_queue.put_nowait(event)

        self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
        self._current_sample += merged_frame.samples_per_channel

    def _dispatch_event(
        self,
        original_frames: List[rtc.AudioFrame],
        probability: float,
        raw_inference_prob: float,
        inference_duration: float,
    ):
        """
        Dispatches VAD events based on the speech probability and the options

        Args:
            original_frames: original frames of the current inference
            probability: filtered speech probability, compared to the threshold
            raw_inference_prob: unfiltered model output, forwarded in the event
            inference_duration: time spent on the inference, in seconds
        """

        samples_10ms = self._sample_rate / 100
        padding_count = int(
            self._padding_duration_samples // samples_10ms
        )  # number of frames to keep for the padding (one side)

        self._buffered_frames.extend(original_frames)
        if (
            not self._speaking
            and not self._waiting_start
            and len(self._buffered_frames) > padding_count
        ):
            # idle: keep only the most recent padding frames
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - padding_count :
            ]

        max_buffer_len = padding_count + max(
            int(self._max_buffered_samples // samples_10ms),
            int(self._min_speaking_samples // samples_10ms),
        )
        if len(self._buffered_frames) > max_buffer_len:
            self._buffered_frames = self._buffered_frames[
                len(self._buffered_frames) - max_buffer_len :
            ]

        if probability >= self._threshold:
            # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
            self._waiting_end = False
            if not self._waiting_start and not self._speaking:
                self._waiting_start = True
                self._start_speech = self._current_sample

            if self._waiting_start and (
                self._current_sample - self._start_speech >= self._min_speaking_samples
            ):
                self._waiting_start = False
                self._speaking = True

                # since we're waiting for the min_speaking_duration to trigger
                # START_OF_SPEECH, put the speech that was used to trigger the
                # start here
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.START_OF_SPEECH,
                    samples_index=self._start_speech,
                    frames=self._buffered_frames[padding_count:],
                    speaking=True,
                )
                self._event_queue.put_nowait(event)

        # emitted regardless of the probability
        event = agents.vad.VADEvent(
            type=agents.vad.VADEventType.INFERENCE_DONE,
            samples_index=self._current_sample,
            frames=original_frames,
            probability=probability,
            raw_inference_prob=raw_inference_prob,
            inference_duration=inference_duration,
            speaking=self._speaking,
        )
        self._event_queue.put_nowait(event)

        if probability < self._threshold:
            # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH
            self._waiting_start = False
            if not self._waiting_end and self._speaking:
                self._waiting_end = True
                self._end_speech = self._current_sample

            if self._waiting_end and (
                self._current_sample - self._end_speech
                >= max(self._min_silence_samples, self._padding_duration_samples)
            ):
                self._waiting_end = False
                self._speaking = False
                event = agents.vad.VADEvent(
                    type=agents.vad.VADEventType.END_OF_SPEECH,
                    samples_index=self._end_speech,
                    duration=(self._end_speech - self._start_speech)
                    / self._sample_rate,
                    frames=self._buffered_frames,
                    speaking=False,
                )
                self._event_queue.put_nowait(event)

    async def __anext__(self) -> agents.vad.VADEvent:
        """Return the next event; stop iteration on the end-of-stream sentinel."""
        evt = await self._event_queue.get()
        if evt is None:
            raise StopAsyncIteration

        return evt
|
|
File without changes
|
{livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/log.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|