livekit-plugins-silero 0.3.dev0__tar.gz → 0.4.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (15)
  1. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/PKG-INFO +3 -3
  2. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/__init__.py +1 -0
  3. livekit_plugins_silero-0.4.dev0/livekit/plugins/silero/log.py +3 -0
  4. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/vad.py +57 -45
  5. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/version.py +1 -1
  6. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/PKG-INFO +3 -3
  7. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/SOURCES.txt +1 -0
  8. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/requires.txt +2 -2
  9. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/setup.py +2 -2
  10. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/README.md +0 -0
  11. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/py.typed +0 -0
  12. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/dependency_links.txt +0 -0
  13. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/top_level.txt +0 -0
  14. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/pyproject.toml +0 -0
  15. {livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-silero
3
- Version: 0.3.dev0
3
+ Version: 0.4.dev0
4
4
  Summary: Agent Framework Plugin for Silero
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,8 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: >=3.8.0
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: livekit~=0.9
24
- Requires-Dist: livekit-agents~=0.5.dev0
23
+ Requires-Dist: livekit~=0.11
24
+ Requires-Dist: livekit-agents~=0.6.dev0
25
25
  Requires-Dist: torch<3,>=2
26
26
  Requires-Dist: torchaudio>=2
27
27
  Requires-Dist: numpy<2,>=1
@@ -29,6 +29,7 @@ class SileroPlugin(Plugin):
29
29
  _ = torch.hub.load(
30
30
  repo_or_dir="snakers4/silero-vad",
31
31
  model="silero_vad",
32
+ use_onnx=True,
32
33
  )
33
34
 
34
35
 
@@ -0,0 +1,3 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("livekit.plugins.silero")
@@ -16,19 +16,19 @@ from __future__ import annotations
16
16
 
17
17
  import asyncio
18
18
  import contextlib
19
- import logging
19
+ import time
20
20
  from collections import deque
21
- from typing import List, Optional
21
+ from typing import List
22
22
 
23
23
  import numpy as np
24
24
  import torch
25
25
  from livekit import agents, rtc
26
26
 
27
+ from .log import logger
28
+
27
29
 
28
30
  class VAD(agents.vad.VAD):
29
- def __init__(
30
- self, *, model_path: Optional[str] = None, use_onnx: bool = True
31
- ) -> None:
31
+ def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
32
32
  if model_path:
33
33
  model = torch.jit.load(model_path)
34
34
  model.eval()
@@ -43,12 +43,12 @@ class VAD(agents.vad.VAD):
43
43
  def stream(
44
44
  self,
45
45
  *,
46
- min_speaking_duration: float = 0.5,
47
- min_silence_duration: float = 0.5,
46
+ min_speaking_duration: float = 0.2,
47
+ min_silence_duration: float = 0.8,
48
48
  padding_duration: float = 0.1,
49
49
  sample_rate: int = 16000,
50
50
  max_buffered_speech: float = 45.0,
51
- threshold: float = 0.5,
51
+ threshold: float = 0.2,
52
52
  ) -> "VADStream":
53
53
  return VADStream(
54
54
  self._model,
@@ -93,6 +93,7 @@ class VADStream(agents.vad.VADStream):
93
93
  self._waiting_start = False
94
94
  self._waiting_end = False
95
95
  self._current_sample = 0
96
+ self._filter = agents.utils.ExpFilter(0.8)
96
97
  self._min_speaking_samples = min_speaking_duration * sample_rate
97
98
  self._min_silence_samples = min_silence_duration * sample_rate
98
99
  self._padding_duration_samples = padding_duration * sample_rate
@@ -103,12 +104,6 @@ class VADStream(agents.vad.VADStream):
103
104
  self._buffered_frames: List[rtc.AudioFrame] = []
104
105
  self._main_task = asyncio.create_task(self._run())
105
106
 
106
- def log_exception(task: asyncio.Task) -> None:
107
- if not task.cancelled() and task.exception():
108
- logging.error(f"silero vad task failed: {task.exception()}")
109
-
110
- self._main_task.add_done_callback(log_exception)
111
-
112
107
  def push_frame(self, frame: rtc.AudioFrame) -> None:
113
108
  if self._closed:
114
109
  raise ValueError("cannot push frame to closed stream")
@@ -151,6 +146,9 @@ class VADStream(agents.vad.VADStream):
151
146
  break
152
147
 
153
148
  await asyncio.shield(self._run_inference())
149
+
150
+ except Exception:
151
+ logger.exception("silero stream failed")
154
152
  finally:
155
153
  self._event_queue.put_nowait(None)
156
154
 
@@ -169,13 +167,33 @@ class VADStream(agents.vad.VADStream):
169
167
  tensor = tensor.to(torch.float32) / 32768.0
170
168
 
171
169
  # run inference
172
- speech_prob = await asyncio.to_thread(
170
+ start_time = time.time()
171
+ raw_prob = await asyncio.to_thread(
173
172
  lambda: self._model(tensor, self._sample_rate).item()
174
173
  )
175
- self._dispatch_event(speech_prob, original_frames)
174
+ probability = self._filter.apply(1.0, raw_prob)
175
+ inference_duration = time.time() - start_time
176
+
177
+ # inference done
178
+ event = agents.vad.VADEvent(
179
+ type=agents.vad.VADEventType.INFERENCE_DONE,
180
+ samples_index=self._current_sample,
181
+ probability=probability,
182
+ raw_inference_prob=raw_prob,
183
+ inference_duration=inference_duration,
184
+ )
185
+ self._event_queue.put_nowait(event)
186
+
187
+ self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
176
188
  self._current_sample += merged_frame.samples_per_channel
177
189
 
178
- def _dispatch_event(self, speech_prob: int, original_frames: List[rtc.AudioFrame]):
190
+ def _dispatch_event(
191
+ self,
192
+ original_frames: List[rtc.AudioFrame],
193
+ probability: float,
194
+ raw_inference_prob: float,
195
+ inference_duration: float,
196
+ ):
179
197
  """
180
198
  Dispatches a VAD event based on the speech probability and the options
181
199
  Args:
@@ -203,15 +221,11 @@ class VADStream(agents.vad.VADStream):
203
221
  int(self._min_speaking_samples // samples_10ms),
204
222
  )
205
223
  if len(self._buffered_frames) > max_buffer_len:
206
- # if unaware of this, may be hard to debug, so logging seems ok here
207
- logging.warning(
208
- f"VAD buffer overflow, dropping {len(self._buffered_frames) - max_buffer_len} frames"
209
- )
210
224
  self._buffered_frames = self._buffered_frames[
211
225
  len(self._buffered_frames) - max_buffer_len :
212
226
  ]
213
227
 
214
- if speech_prob >= self._threshold:
228
+ if probability >= self._threshold:
215
229
  # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
216
230
  self._waiting_end = False
217
231
  if not self._waiting_start and not self._speaking:
@@ -223,34 +237,31 @@ class VADStream(agents.vad.VADStream):
223
237
  ):
224
238
  self._waiting_start = False
225
239
  self._speaking = True
226
- event = agents.vad.VADEvent(
227
- type=agents.vad.VADEventType.START_OF_SPEECH,
228
- samples_index=self._start_speech,
229
- )
230
- self._event_queue.put_nowait(event)
231
240
 
232
241
  # since we're waiting for the min_speaking_duration to trigger START_OF_SPEECH,
233
- # the SPEAKING data is missing the first few frames, trigger it here
234
- # TODO(theomonnom): Maybe it is better to put the data inside the START_OF_SPEECH event?
242
+ # put the speech frames that were used to trigger the start here
235
243
  event = agents.vad.VADEvent(
236
- type=agents.vad.VADEventType.SPEAKING,
244
+ type=agents.vad.VADEventType.START_OF_SPEECH,
237
245
  samples_index=self._start_speech,
238
- speech=self._buffered_frames[padding_count:],
246
+ frames=self._buffered_frames[padding_count:],
247
+ speaking=True,
239
248
  )
249
+ self._event_queue.put_nowait(event)
240
250
 
241
- return
242
-
243
- if self._speaking:
244
- # we don't check the speech_prob here
245
- event = agents.vad.VADEvent(
246
- type=agents.vad.VADEventType.SPEAKING,
247
- samples_index=self._current_sample,
248
- speech=original_frames,
249
- )
250
- self._event_queue.put_nowait(event)
251
+ # we don't check the speech_prob here
252
+ event = agents.vad.VADEvent(
253
+ type=agents.vad.VADEventType.INFERENCE_DONE,
254
+ samples_index=self._current_sample,
255
+ frames=original_frames,
256
+ probability=probability,
257
+ raw_inference_prob=raw_inference_prob,
258
+ inference_duration=inference_duration,
259
+ speaking=self._speaking,
260
+ )
261
+ self._event_queue.put_nowait(event)
251
262
 
252
- if speech_prob < self._threshold:
253
- # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH,
263
+ if probability < self._threshold:
264
+ # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH,
254
265
  self._waiting_start = False
255
266
  if not self._waiting_end and self._speaking:
256
267
  self._waiting_end = True
@@ -265,9 +276,10 @@ class VADStream(agents.vad.VADStream):
265
276
  event = agents.vad.VADEvent(
266
277
  type=agents.vad.VADEventType.END_OF_SPEECH,
267
278
  samples_index=self._end_speech,
268
- duration=(self._current_sample - self._start_speech)
279
+ duration=(self._end_speech - self._start_speech)
269
280
  / self._sample_rate,
270
- speech=self._buffered_frames,
281
+ frames=self._buffered_frames,
282
+ speaking=False,
271
283
  )
272
284
  self._event_queue.put_nowait(event)
273
285
 
@@ -12,4 +12,4 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- __version__ = "0.3.dev0"
15
+ __version__ = "0.4.dev0"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: livekit-plugins-silero
3
- Version: 0.3.dev0
3
+ Version: 0.4.dev0
4
4
  Summary: Agent Framework Plugin for Silero
5
5
  Home-page: https://github.com/livekit/agents
6
6
  License: Apache-2.0
@@ -20,8 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
20
20
  Classifier: Programming Language :: Python :: 3 :: Only
21
21
  Requires-Python: >=3.8.0
22
22
  Description-Content-Type: text/markdown
23
- Requires-Dist: livekit~=0.9
24
- Requires-Dist: livekit-agents~=0.5.dev0
23
+ Requires-Dist: livekit~=0.11
24
+ Requires-Dist: livekit-agents~=0.6.dev0
25
25
  Requires-Dist: torch<3,>=2
26
26
  Requires-Dist: torchaudio>=2
27
27
  Requires-Dist: numpy<2,>=1
@@ -2,6 +2,7 @@ README.md
2
2
  pyproject.toml
3
3
  setup.py
4
4
  livekit/plugins/silero/__init__.py
5
+ livekit/plugins/silero/log.py
5
6
  livekit/plugins/silero/py.typed
6
7
  livekit/plugins/silero/vad.py
7
8
  livekit/plugins/silero/version.py
@@ -1,5 +1,5 @@
1
- livekit~=0.9
2
- livekit-agents~=0.5.dev0
1
+ livekit~=0.11
2
+ livekit-agents~=0.6.dev0
3
3
  torch<3,>=2
4
4
  torchaudio>=2
5
5
  numpy<2,>=1
@@ -49,8 +49,8 @@ setuptools.setup(
49
49
  packages=setuptools.find_namespace_packages(include=["livekit.*"]),
50
50
  python_requires=">=3.8.0",
51
51
  install_requires=[
52
- "livekit ~= 0.9",
53
- "livekit-agents~=0.5.dev0",
52
+ "livekit ~= 0.11",
53
+ "livekit-agents~=0.6.dev0",
54
54
  "torch >= 2, < 3",
55
55
  "torchaudio >= 2",
56
56
  "numpy >= 1, < 2",