livekit-plugins-silero 0.6.dev0__tar.gz → 0.6.0.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/PKG-INFO +5 -9
  2. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/__init__.py +1 -4
  3. livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/onnx_model.py +86 -0
  4. livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/resources/__init__.py +1 -0
  5. livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/resources/silero_vad.onnx +3 -0
  6. livekit_plugins_silero-0.6.0.dev2/livekit/plugins/silero/vad.py +286 -0
  7. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/version.py +1 -1
  8. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/PKG-INFO +5 -9
  9. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/SOURCES.txt +3 -0
  10. livekit_plugins_silero-0.6.0.dev2/livekit_plugins_silero.egg-info/requires.txt +3 -0
  11. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/setup.py +6 -11
  12. livekit_plugins_silero-0.6.dev0/livekit/plugins/silero/vad.py +0 -291
  13. livekit_plugins_silero-0.6.dev0/livekit_plugins_silero.egg-info/requires.txt +0 -6
  14. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/README.md +0 -0
  15. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/log.py +0 -0
  16. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit/plugins/silero/py.typed +0 -0
  17. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/dependency_links.txt +0 -0
  18. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/livekit_plugins_silero.egg-info/top_level.txt +0 -0
  19. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/pyproject.toml +0 -0
  20. {livekit_plugins_silero-0.6.dev0 → livekit_plugins_silero-0.6.0.dev2}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-silero
3
- Version: 0.6.dev0
3
+ Version: 0.6.0.dev2
4
4
  Summary: Agent Framework Plugin for Silero
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -14,18 +14,14 @@ Classifier: Topic :: Multimedia :: Sound/Audio
14
14
  Classifier: Topic :: Multimedia :: Video
15
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.8
18
17
  Classifier: Programming Language :: Python :: 3.9
19
18
  Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3 :: Only
21
- Requires-Python: >=3.8.0
20
+ Requires-Python: >=3.9.0
22
21
  Description-Content-Type: text/markdown
23
- Requires-Dist: livekit~=0.11
24
- Requires-Dist: livekit-agents~=0.8.dev0
25
- Requires-Dist: torch<3,>=2
26
- Requires-Dist: torchaudio>=2
27
- Requires-Dist: numpy<2,>=1
28
- Requires-Dist: onnxruntime~=1.17.0
22
+ Requires-Dist: livekit-agents~=0.7
23
+ Requires-Dist: onnxruntime>=1.18
24
+ Requires-Dist: numpy>=1.26
29
25
 
30
26
  # LiveKit Plugins Silero
31
27
 
@@ -17,7 +17,6 @@ from .version import __version__
17
17
 
18
18
  __all__ = ["VAD", "VADStream", "__version__"]
19
19
 
20
- import torch
21
20
  from livekit.agents import Plugin
22
21
 
23
22
 
@@ -26,9 +25,7 @@ class SileroPlugin(Plugin):
26
25
  super().__init__(__name__, __version__, __package__)
27
26
 
28
27
  def download_files(self):
29
- _ = torch.hub.load(
30
- repo_or_dir="snakers4/silero-vad", model="silero_vad", onnx=True
31
- )
28
+ pass
32
29
 
33
30
 
34
31
  Plugin.register_plugin(SileroPlugin())
@@ -0,0 +1,86 @@
1
+ import atexit
2
+ import importlib.resources
3
+ from contextlib import ExitStack
4
+
5
+ import numpy as np
6
+ import onnxruntime # type: ignore
7
+
8
+ _resource_files = ExitStack()
9
+ atexit.register(_resource_files.close)
10
+
11
+
12
+ SUPPORTED_SAMPLE_RATES = [8000, 16000]
13
+
14
+
15
def new_inference_session(force_cpu: bool) -> onnxruntime.InferenceSession:
    """Create an ONNX Runtime session for the bundled silero_vad.onnx model.

    Args:
        force_cpu: prefer the CPU execution provider when it is available.

    Returns:
        a ready-to-use InferenceSession (single-threaded, full graph optimization).
    """
    res = (
        importlib.resources.files("livekit.plugins.silero.resources")
        / "silero_vad.onnx"
    )
    ctx = importlib.resources.as_file(res)
    # keep the extracted resource alive for the whole process; the ExitStack
    # is closed atexit, which deletes any temporary extraction
    path = _resource_files.enter_context(ctx)

    opts = onnxruntime.SessionOptions()
    opts.inter_op_num_threads = 1
    opts.intra_op_num_threads = 1
    opts.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL

    if force_cpu and "CPUExecutionProvider" in onnxruntime.get_available_providers():
        # bug fix: this previously passed `ess_options=opts` (typo); the
        # InferenceSession constructor swallowed it via **kwargs, so the
        # session options configured above were silently ignored on this path
        session = onnxruntime.InferenceSession(
            str(path), providers=["CPUExecutionProvider"], sess_options=opts
        )
    else:
        session = onnxruntime.InferenceSession(str(path), sess_options=opts)

    return session
36
+
37
+
38
class OnnxModel:
    """Stateful wrapper around a Silero VAD ONNX session.

    Holds the recurrent state and the trailing audio context between calls,
    so each stream needs its own instance (the session itself can be shared).
    """

    def __init__(
        self, *, onnx_session: "onnxruntime.InferenceSession", sample_rate: int
    ) -> None:
        self._sess = onnx_session
        self._sample_rate = sample_rate

        if sample_rate not in SUPPORTED_SAMPLE_RATES:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # window and context sizes required by the model for each rate
        if sample_rate == 8000:
            self._window_size_samples, self._context_size = 256, 32
        else:  # sample_rate == 16000
            self._window_size_samples, self._context_size = 512, 64

        self.reset_states()

    @property
    def window_size_samples(self) -> int:
        """Number of samples consumed per inference call."""
        return self._window_size_samples

    @property
    def context_size(self) -> int:
        """Number of trailing samples carried over between calls."""
        return self._context_size

    def reset_states(self) -> None:
        """Clear the recurrent state and the carried-over audio context."""
        self._state = np.zeros((2, 1, 128), dtype=np.float32)
        self._context = np.zeros((1, self._context_size), dtype=np.float32)

    def __call__(self, x: np.ndarray) -> float:
        """Run one inference step on a single window of float32 samples.

        Returns the model's speech probability for the window.
        """
        if x.ndim == 1:
            x = x[np.newaxis, :]

        if x.shape[1] != self._window_size_samples:
            raise ValueError(
                f"Input shape must be (N, {self._window_size_samples}), got {x.shape}"
            )

        with_context = np.concatenate([self._context, x], axis=1)
        out, self._state = self._sess.run(
            None,
            {
                "input": with_context,
                "state": self._state,
                "sr": np.array(self._sample_rate, dtype=np.int64),
            },
        )
        self._context = with_context[..., -self._context_size :]
        return out.item()
@@ -0,0 +1 @@
1
+ """Used by importlib.resources and setuptools"""
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b99cbfd39246b6706f98ec13c7c50c6b299181f2474fa05cbc8046acc274396
3
+ size 2313101
@@ -0,0 +1,286 @@
1
+ # Copyright 2023 LiveKit, Inc.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import time
19
+ from dataclasses import dataclass
20
+
21
+ import numpy as np
22
+ from livekit import agents, rtc
23
+ from livekit.agents import utils
24
+
25
+ from . import onnx_model
26
+ from .log import logger
27
+
28
+
29
+ @dataclass
30
+ class _VADOptions:
31
+ min_speech_duration: float
32
+ min_silence_duration: float
33
+ padding_duration: float
34
+ max_buffered_speech: float
35
+ activation_threshold: float
36
+ sample_rate: int
37
+
38
+
39
class VAD(agents.vad.VAD):
    def __init__(
        self,
        *,
        min_speech_duration: float = 0.05,
        min_silence_duration: float = 0.1,
        padding_duration: float = 0.1,
        max_buffered_speech: float = 60.0,
        activation_threshold: float = 0.25,
        sample_rate: int = 16000,
        force_cpu: bool = True,
    ) -> None:
        """
        Initialize the Silero VAD with the given options.
        The options are already set to strong defaults.

        Args:
            min_speech_duration: minimum duration of speech to start a new speech chunk
            min_silence_duration: In the end of each speech, wait min_silence_duration before ending the speech
            padding_duration: pad the chunks with this duration on both sides
            max_buffered_speech: maximum duration of speech to keep in the buffer (in seconds)
            activation_threshold: threshold to consider a frame as speech
            sample_rate: sample rate for the inference (only 8KHz and 16KHz are supported)
            force_cpu: force to use CPU for inference
        """
        if sample_rate not in onnx_model.SUPPORTED_SAMPLE_RATES:
            raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")

        # one ONNX session is shared by every stream; each stream gets its own
        # stateful OnnxModel wrapper around it (see stream()).
        # NOTE(review): agents.vad.VAD.__init__ is not called here — confirm the
        # base class does not require it.
        self._onnx_session = onnx_model.new_inference_session(force_cpu)
        self._opts = _VADOptions(
            min_speech_duration=min_speech_duration,
            min_silence_duration=min_silence_duration,
            padding_duration=padding_duration,
            max_buffered_speech=max_buffered_speech,
            activation_threshold=activation_threshold,
            sample_rate=sample_rate,
        )

    def stream(self) -> "VADStream":
        """Create a new VADStream bound to this VAD's session and options."""
        model = onnx_model.OnnxModel(
            onnx_session=self._onnx_session, sample_rate=self._opts.sample_rate
        )
        return VADStream(self._opts, model)
85
+
86
+
87
+ @dataclass
88
+ class _WindowData:
89
+ inference_data: np.ndarray
90
+ # data returned to the user are the original frames (int16)
91
+ original_data: np.ndarray
92
+
93
+
94
class VADStream(agents.vad.VADStream):
    """Streaming Silero VAD: buffers input frames into fixed-size inference
    windows, runs the model on each window and publishes VAD events."""

    def __init__(self, opts: _VADOptions, model: onnx_model.OnnxModel) -> None:
        super().__init__()
        self._opts, self._model = opts, model
        self._original_sample_rate: int | None = None
        self._window_data: _WindowData | None = None
        # samples still needed to complete the current inference window
        self._remaining_samples = model.window_size_samples

    @agents.utils.log_exceptions(logger=logger)
    async def _main_task(self):
        window_ch = utils.aio.Chan[_WindowData]()
        await asyncio.gather(
            self._run_inference(window_ch), self._forward_input(window_ch)
        )

    def _new_window(self, step: int) -> _WindowData:
        """Allocate an empty window; one inference sample maps to `step`
        original samples."""
        return _WindowData(
            inference_data=np.zeros(
                self._model.window_size_samples, dtype=np.float32
            ),
            original_data=np.zeros(
                self._model.window_size_samples * step, dtype=np.int16
            ),
        )

    async def _forward_input(self, window_tx: utils.aio.ChanSender[_WindowData]):
        """
        Push frame to the VAD stream for processing.
        The frames are split into chunks of the given window size and processed.
        (Buffered if the window size is not reached yet)
        """
        async for frame in self._input_ch:
            if frame.sample_rate != 8000 and frame.sample_rate % 16000 != 0:
                logger.error("only 8KHz and 16KHz*X sample rates are supported")
                continue

            if (
                self._original_sample_rate is not None
                and self._original_sample_rate != frame.sample_rate
            ):
                raise ValueError("a frame with another sample rate was already pushed")

            if frame.num_channels != 1:
                raise ValueError("vad currently only supports mono audio frames")

            self._original_sample_rate = frame.sample_rate
            # decimation factor from the input rate down to the inference rate.
            # bug fix: `frame.sample_rate // 16000` was 0 for 8KHz input, which
            # made the `[::step]` slice below raise ValueError
            step = max(1, frame.sample_rate // 16000)

            if self._window_data is None:
                self._window_data = self._new_window(step)

            og_frame = np.frombuffer(frame.data, dtype=np.int16)
            if_frame = og_frame[::step].astype(np.float32) / np.iinfo(np.int16).max

            offset = 0  # inference samples of this frame already consumed
            remaining_data = len(if_frame)
            while remaining_data > 0:
                i = self._model.window_size_samples - self._remaining_samples
                to_copy = min(remaining_data, self._remaining_samples)
                self._remaining_samples -= to_copy
                remaining_data -= to_copy

                # bug fix: the copies previously always read from the start of
                # the frame (`og_frame[: to_copy * step]`), so when one frame
                # spanned two windows the second window received the first
                # samples again — track a read offset into the frame instead
                self._window_data.original_data[
                    i * step : (i + to_copy) * step
                ] = og_frame[offset * step : (offset + to_copy) * step]
                self._window_data.inference_data[i : i + to_copy] = if_frame[
                    offset : offset + to_copy
                ]
                offset += to_copy

                if self._remaining_samples == 0:
                    window_tx.send_nowait(self._window_data)
                    self._window_data = self._new_window(step)
                    self._remaining_samples = self._model.window_size_samples

        window_tx.close()

    async def _run_inference(self, window_rx: utils.aio.ChanReceiver[_WindowData]):
        """Consume windows, run the model and emit START_OF_SPEECH,
        INFERENCE_DONE and END_OF_SPEECH events."""
        pub_speaking = False
        pub_speech_duration = 0.0
        pub_silence_duration = 0.0
        pub_speech_buf = np.array([], dtype=np.int16)

        may_start_at_sample = -1
        may_end_at_sample = -1

        min_speech_samples = int(
            self._opts.min_speech_duration * self._opts.sample_rate
        )
        min_silence_samples = int(
            self._opts.min_silence_duration * self._opts.sample_rate
        )

        current_sample = 0

        async for window_data in window_rx:
            inference_data = window_data.inference_data
            start_time = time.time()
            raw_prob = await asyncio.to_thread(lambda: self._model(inference_data))
            inference_duration = time.time() - start_time

            window_duration = self._model.window_size_samples / self._opts.sample_rate
            if inference_duration > window_duration:
                # slower than realtime
                logger.warning(
                    "vad inference took too long - slower than realtime: %f",
                    inference_duration,
                )

            # append new data to current speech buffer
            pub_speech_buf = np.append(pub_speech_buf, window_data.original_data)
            max_data_s = self._opts.padding_duration
            if not pub_speaking:
                max_data_s += self._opts.min_speech_duration
            else:
                max_data_s += self._opts.max_buffered_speech

            assert self._original_sample_rate is not None
            # bug fix: this was `int(max_data_s) * rate`, truncating the cap to
            # whole seconds; with the defaults (0.15s cap while silent) it
            # truncated to 0 and the buffer was emptied before speech started
            cl = int(max_data_s * self._original_sample_rate)
            if len(pub_speech_buf) > cl:
                pub_speech_buf = pub_speech_buf[-cl:]

            # dispatch start/end when needed
            if raw_prob >= self._opts.activation_threshold:
                may_end_at_sample = -1

                if may_start_at_sample == -1:
                    may_start_at_sample = current_sample + min_speech_samples

                if may_start_at_sample <= current_sample and not pub_speaking:
                    pub_speaking = True
                    self._event_ch.send_nowait(
                        agents.vad.VADEvent(
                            type=agents.vad.VADEventType.START_OF_SPEECH,
                            silence_duration=pub_silence_duration,
                            speech_duration=0.0,
                            samples_index=current_sample,
                            speaking=True,
                        )
                    )

                    pub_silence_duration = 0
                    # credit the speech that accumulated while waiting for
                    # min_speech_duration before the start was published
                    pub_speech_duration += self._opts.min_speech_duration

            if pub_speaking:
                pub_speech_duration += window_duration
            else:
                # bug fix: this was `pub_silence_duration = 0`, which kept the
                # silence duration pinned at zero while silent; accumulate so
                # START_OF_SPEECH reports how long the preceding silence was
                pub_silence_duration += window_duration

            self._event_ch.send_nowait(
                agents.vad.VADEvent(
                    type=agents.vad.VADEventType.INFERENCE_DONE,
                    samples_index=current_sample,
                    silence_duration=0.0,
                    speech_duration=pub_speech_duration,
                    probability=raw_prob,
                    inference_duration=inference_duration,
                    speaking=pub_speaking,
                )
            )

            if raw_prob < self._opts.activation_threshold:
                may_start_at_sample = -1

                if may_end_at_sample == -1:
                    may_end_at_sample = current_sample + min_silence_samples

                if may_end_at_sample <= current_sample and pub_speaking:
                    pub_speaking = False

                    frame = rtc.AudioFrame(
                        sample_rate=self._original_sample_rate,
                        num_channels=1,
                        samples_per_channel=len(pub_speech_buf),
                        data=pub_speech_buf.tobytes(),
                    )

                    self._event_ch.send_nowait(
                        agents.vad.VADEvent(
                            type=agents.vad.VADEventType.END_OF_SPEECH,
                            samples_index=current_sample,
                            silence_duration=0.0,
                            speech_duration=pub_speech_duration,
                            frames=[frame],
                            speaking=False,
                        )
                    )

                    pub_speech_buf = np.array([], dtype=np.int16)
                    pub_speech_duration = 0
                    pub_silence_duration += self._opts.min_silence_duration

            current_sample += self._model.window_size_samples
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.6.dev0"
15
# PEP 440 canonical form; keeps __version__ identical to the distribution
# metadata (Version: 0.6.0.dev2) instead of the unnormalized "0.6.0-dev.2"
__version__ = "0.6.0.dev2"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-silero
3
- Version: 0.6.dev0
3
+ Version: 0.6.0.dev2
4
4
  Summary: Agent Framework Plugin for Silero
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -14,18 +14,14 @@ Classifier: Topic :: Multimedia :: Sound/Audio
14
14
  Classifier: Topic :: Multimedia :: Video
15
15
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
16
  Classifier: Programming Language :: Python :: 3
17
- Classifier: Programming Language :: Python :: 3.8
18
17
  Classifier: Programming Language :: Python :: 3.9
19
18
  Classifier: Programming Language :: Python :: 3.10
20
19
  Classifier: Programming Language :: Python :: 3 :: Only
21
- Requires-Python: >=3.8.0
20
+ Requires-Python: >=3.9.0
22
21
  Description-Content-Type: text/markdown
23
- Requires-Dist: livekit~=0.11
24
- Requires-Dist: livekit-agents~=0.8.dev0
25
- Requires-Dist: torch<3,>=2
26
- Requires-Dist: torchaudio>=2
27
- Requires-Dist: numpy<2,>=1
28
- Requires-Dist: onnxruntime~=1.17.0
22
+ Requires-Dist: livekit-agents~=0.7
23
+ Requires-Dist: onnxruntime>=1.18
24
+ Requires-Dist: numpy>=1.26
29
25
 
30
26
  # LiveKit Plugins Silero
31
27
 
@@ -3,9 +3,12 @@ pyproject.toml
3
3
  setup.py
4
4
  livekit/plugins/silero/__init__.py
5
5
  livekit/plugins/silero/log.py
6
+ livekit/plugins/silero/onnx_model.py
6
7
  livekit/plugins/silero/py.typed
7
8
  livekit/plugins/silero/vad.py
8
9
  livekit/plugins/silero/version.py
10
+ livekit/plugins/silero/resources/__init__.py
11
+ livekit/plugins/silero/resources/silero_vad.onnx
9
12
  livekit_plugins_silero.egg-info/PKG-INFO
10
13
  livekit_plugins_silero.egg-info/SOURCES.txt
11
14
  livekit_plugins_silero.egg-info/dependency_links.txt
@@ -0,0 +1,3 @@
1
+ livekit-agents~=0.7
2
+ onnxruntime>=1.18
3
+ numpy>=1.26
@@ -39,7 +39,6 @@ setuptools.setup(
39
39
  "Topic :: Multimedia :: Video",
40
40
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
41
41
  "Programming Language :: Python :: 3",
42
- "Programming Language :: Python :: 3.8",
43
42
  "Programming Language :: Python :: 3.9",
44
43
  "Programming Language :: Python :: 3.10",
45
44
  "Programming Language :: Python :: 3 :: Only",
@@ -47,16 +46,12 @@ setuptools.setup(
47
46
  keywords=["webrtc", "realtime", "audio", "video", "livekit"],
48
47
  license="Apache-2.0",
49
48
  packages=setuptools.find_namespace_packages(include=["livekit.*"]),
50
- python_requires=">=3.8.0",
51
- install_requires=[
52
- "livekit ~= 0.11",
53
- "livekit-agents~=0.8.dev0",
54
- "torch >= 2, < 3",
55
- "torchaudio >= 2",
56
- "numpy >= 1, < 2",
57
- "onnxruntime~=1.17.0",
58
- ],
59
- package_data={"livekit.plugins.silero": ["files/silero_vad.jit"]},
49
+ python_requires=">=3.9.0",
50
+ install_requires=["livekit-agents~=0.7", "onnxruntime>=1.18", "numpy>=1.26"],
51
+ package_data={
52
+ "livekit.plugins.silero.resources": ["silero_vad.onnx"],
53
+ "livekit.plugins.silero": ["py.typed"],
54
+ },
60
55
  project_urls={
61
56
  "Documentation": "https://docs.livekit.io",
62
57
  "Website": "https://livekit.io/",
@@ -1,291 +0,0 @@
1
- # Copyright 2023 LiveKit, Inc.
2
- #
3
- # Licensed under the Apache License, Version 2.0 (the "License");
4
- # you may not use this file except in compliance with the License.
5
- # You may obtain a copy of the License at
6
- #
7
- # http://www.apache.org/licenses/LICENSE-2.0
8
- #
9
- # Unless required by applicable law or agreed to in writing, software
10
- # distributed under the License is distributed on an "AS IS" BASIS,
11
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- # See the License for the specific language governing permissions and
13
- # limitations under the License.
14
-
15
- from __future__ import annotations
16
-
17
- import asyncio
18
- import contextlib
19
- import time
20
- from collections import deque
21
- from typing import List, Optional
22
-
23
- import numpy as np
24
- import torch
25
- from livekit import agents, rtc
26
-
27
- from .log import logger
28
-
29
-
30
- class VAD(agents.vad.VAD):
31
- def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
32
- if model_path:
33
- model = torch.jit.load(model_path)
34
- model.eval()
35
- else:
36
- model, _ = torch.hub.load(
37
- repo_or_dir="snakers4/silero-vad",
38
- model="silero_vad",
39
- onnx=use_onnx,
40
- )
41
- self._model = model
42
-
43
- def stream(
44
- self,
45
- *,
46
- min_speaking_duration: float = 0.2,
47
- min_silence_duration: float = 0.8,
48
- padding_duration: float = 0.1,
49
- sample_rate: int = 16000,
50
- max_buffered_speech: float = 45.0,
51
- threshold: float = 0.2,
52
- ) -> "VADStream":
53
- return VADStream(
54
- self._model,
55
- min_speaking_duration=min_speaking_duration,
56
- min_silence_duration=min_silence_duration,
57
- padding_duration=padding_duration,
58
- sample_rate=sample_rate,
59
- max_buffered_speech=max_buffered_speech,
60
- threshold=threshold,
61
- )
62
-
63
-
64
- # Based on https://github.com/snakers4/silero-vad/blob/94504ece54c8caeebb808410b08ae55ee82dba82/utils_vad.py#L428
65
- class VADStream(agents.vad.VADStream):
66
- def __init__(
67
- self,
68
- model,
69
- *,
70
- min_speaking_duration: float,
71
- min_silence_duration: float,
72
- padding_duration: float,
73
- sample_rate: int,
74
- max_buffered_speech: float,
75
- threshold: float,
76
- ) -> None:
77
- self._min_speaking_duration = min_speaking_duration
78
- self._min_silence_duration = min_silence_duration
79
- self._padding_duration = padding_duration
80
- self._sample_rate = sample_rate
81
- self._max_buffered_speech = max_buffered_speech
82
- self._threshold = threshold
83
-
84
- if sample_rate not in [8000, 16000]:
85
- raise ValueError("Silero VAD only supports 8KHz and 16KHz sample rates")
86
-
87
- self._queue = asyncio.Queue[Optional[rtc.AudioFrame]]()
88
- self._event_queue = asyncio.Queue[Optional[agents.vad.VADEvent]]()
89
- self._model = model
90
-
91
- self._closed = False
92
- self._speaking = False
93
- self._waiting_start = False
94
- self._waiting_end = False
95
- self._current_sample = 0
96
- self._filter = agents.utils.ExpFilter(0.8)
97
- self._min_speaking_samples = min_speaking_duration * sample_rate
98
- self._min_silence_samples = min_silence_duration * sample_rate
99
- self._padding_duration_samples = padding_duration * sample_rate
100
- self._max_buffered_samples = max_buffered_speech * sample_rate
101
-
102
- self._queued_frames: deque[rtc.AudioFrame] = deque()
103
- self._original_frames: deque[rtc.AudioFrame] = deque()
104
- self._buffered_frames: List[rtc.AudioFrame] = []
105
- self._main_task = asyncio.create_task(self._run())
106
-
107
- def push_frame(self, frame: rtc.AudioFrame) -> None:
108
- if self._closed:
109
- raise ValueError("cannot push frame to closed stream")
110
-
111
- self._queue.put_nowait(frame)
112
-
113
- async def aclose(self, *, wait: bool = True) -> None:
114
- self._closed = True
115
- if not wait:
116
- self._main_task.cancel()
117
-
118
- self._queue.put_nowait(None)
119
- with contextlib.suppress(asyncio.CancelledError):
120
- await self._main_task
121
-
122
- async def _run(self):
123
- try:
124
- while True:
125
- frame = await self._queue.get()
126
- if frame is None:
127
- break # None is sent inside aclose
128
-
129
- self._queue.task_done()
130
-
131
- # resample to silero's sample rate
132
- resampled_frame = frame.remix_and_resample(
133
- self._sample_rate, 1
134
- ) # TODO: This is technically wrong, fix when we have a better resampler
135
- self._original_frames.append(frame)
136
- self._queued_frames.append(resampled_frame)
137
-
138
- # run inference by chunks of 40ms until we run out of data
139
- while True:
140
- available_length = sum(
141
- f.samples_per_channel for f in self._queued_frames
142
- )
143
-
144
- samples_40ms = self._sample_rate // 1000 * 40
145
- if available_length < samples_40ms:
146
- break
147
-
148
- await asyncio.shield(self._run_inference())
149
-
150
- except Exception:
151
- logger.exception("silero stream failed")
152
- finally:
153
- self._event_queue.put_nowait(None)
154
-
155
- async def _run_inference(self) -> None:
156
- # merge the first 4 frames (we know each is 10ms)
157
- if len(self._queued_frames) < 4:
158
- return
159
-
160
- original_frames = [self._original_frames.popleft() for _ in range(4)]
161
- merged_frame = agents.utils.merge_frames(
162
- [self._queued_frames.popleft() for _ in range(4)]
163
- )
164
-
165
- # convert data_40ms to tensor & f32
166
- tensor = torch.from_numpy(np.frombuffer(merged_frame.data, dtype=np.int16))
167
- tensor = tensor.to(torch.float32) / 32768.0
168
-
169
- # run inference
170
- start_time = time.time()
171
- raw_prob = await asyncio.to_thread(
172
- lambda: self._model(tensor, self._sample_rate).item()
173
- )
174
- probability = self._filter.apply(1.0, raw_prob)
175
- inference_duration = time.time() - start_time
176
-
177
- # inference done
178
- event = agents.vad.VADEvent(
179
- type=agents.vad.VADEventType.INFERENCE_DONE,
180
- samples_index=self._current_sample,
181
- probability=probability,
182
- raw_inference_prob=raw_prob,
183
- inference_duration=inference_duration,
184
- )
185
- self._event_queue.put_nowait(event)
186
-
187
- self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
188
- self._current_sample += merged_frame.samples_per_channel
189
-
190
- def _dispatch_event(
191
- self,
192
- original_frames: List[rtc.AudioFrame],
193
- probability: float,
194
- raw_inference_prob: float,
195
- inference_duration: float,
196
- ):
197
- """
198
- Dispatches a VAD event based on the speech probability and the options
199
- Args:
200
- speech_prob: speech probability of the current frame
201
- original_frames: original frames of the current inference
202
- """
203
-
204
- samples_10ms = self._sample_rate / 100
205
- padding_count = int(
206
- self._padding_duration_samples // samples_10ms
207
- ) # number of frames to keep for the padding (one side)
208
-
209
- self._buffered_frames.extend(original_frames)
210
- if (
211
- not self._speaking
212
- and not self._waiting_start
213
- and len(self._buffered_frames) > padding_count
214
- ):
215
- self._buffered_frames = self._buffered_frames[
216
- len(self._buffered_frames) - padding_count :
217
- ]
218
-
219
- max_buffer_len = padding_count + max(
220
- int(self._max_buffered_samples // samples_10ms),
221
- int(self._min_speaking_samples // samples_10ms),
222
- )
223
- if len(self._buffered_frames) > max_buffer_len:
224
- self._buffered_frames = self._buffered_frames[
225
- len(self._buffered_frames) - max_buffer_len :
226
- ]
227
-
228
- if probability >= self._threshold:
229
- # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
230
- self._waiting_end = False
231
- if not self._waiting_start and not self._speaking:
232
- self._waiting_start = True
233
- self._start_speech = self._current_sample
234
-
235
- if self._waiting_start and (
236
- self._current_sample - self._start_speech >= self._min_speaking_samples
237
- ):
238
- self._waiting_start = False
239
- self._speaking = True
240
-
241
- # since we're waiting for the min_spaking_duration to trigger START_OF_SPEECH,
242
- # put the speech that were used to trigger the start here
243
- event = agents.vad.VADEvent(
244
- type=agents.vad.VADEventType.START_OF_SPEECH,
245
- samples_index=self._start_speech,
246
- frames=self._buffered_frames[padding_count:],
247
- speaking=True,
248
- )
249
- self._event_queue.put_nowait(event)
250
-
251
- # we don't check the speech_prob here
252
- event = agents.vad.VADEvent(
253
- type=agents.vad.VADEventType.INFERENCE_DONE,
254
- samples_index=self._current_sample,
255
- frames=original_frames,
256
- probability=probability,
257
- raw_inference_prob=raw_inference_prob,
258
- inference_duration=inference_duration,
259
- speaking=self._speaking,
260
- )
261
- self._event_queue.put_nowait(event)
262
-
263
- if probability < self._threshold:
264
- # stopped speaking, s for min_silence_duration to trigger END_OF_SPEECH,
265
- self._waiting_start = False
266
- if not self._waiting_end and self._speaking:
267
- self._waiting_end = True
268
- self._end_speech = self._current_sample
269
-
270
- if self._waiting_end and (
271
- self._current_sample - self._end_speech
272
- >= max(self._min_silence_samples, self._padding_duration_samples)
273
- ):
274
- self._waiting_end = False
275
- self._speaking = False
276
- event = agents.vad.VADEvent(
277
- type=agents.vad.VADEventType.END_OF_SPEECH,
278
- samples_index=self._end_speech,
279
- duration=(self._end_speech - self._start_speech)
280
- / self._sample_rate,
281
- frames=self._buffered_frames,
282
- speaking=False,
283
- )
284
- self._event_queue.put_nowait(event)
285
-
286
- async def __anext__(self) -> agents.vad.VADEvent:
287
- evt = await self._event_queue.get()
288
- if evt is None:
289
- raise StopAsyncIteration
290
-
291
- return evt
@@ -1,6 +0,0 @@
1
- livekit~=0.11
2
- livekit-agents~=0.8.dev0
3
- torch<3,>=2
4
- torchaudio>=2
5
- numpy<2,>=1
6
- onnxruntime~=1.17.0