PyPI - livekit-plugins-silero - Versions diffs - 0.3.dev0__tar.gz → 0.4.dev0__tar.gz - Mend

livekit-plugins-silero 0.3.dev0__tar.gz → 0.4.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-silero
-Version: 0.3.dev0
+Version: 0.4.dev0
 Summary: Agent Framework Plugin for Silero
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -20,8 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit~=0.9
-Requires-Dist: livekit-agents~=0.5.dev0
+Requires-Dist: livekit~=0.11
+Requires-Dist: livekit-agents~=0.6.dev0
 Requires-Dist: torch<3,>=2
 Requires-Dist: torchaudio>=2
 Requires-Dist: numpy<2,>=1

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/__init__.py RENAMED Viewed

@@ -29,6 +29,7 @@ class SileroPlugin(Plugin):
         _ = torch.hub.load(
             repo_or_dir="snakers4/silero-vad",
             model="silero_vad",
+            use_onnx=True,
         )

livekit_plugins_silero-0.4.dev0/livekit/plugins/silero/log.py ADDED Viewed

@@ -0,0 +1,3 @@
+import logging
+logger = logging.getLogger("livekit.plugins.silero")

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/vad.py RENAMED Viewed

@@ -16,19 +16,19 @@ from __future__ import annotations
 import asyncio
 import contextlib
-import logging
+import time
 from collections import deque
-from typing import List, Optional
+from typing import List
 import numpy as np
 import torch
 from livekit import agents, rtc
+from .log import logger
 class VAD(agents.vad.VAD):
-    def __init__(
-        self, *, model_path: Optional[str] = None, use_onnx: bool = True
-    ) -> None:
+    def __init__(self, *, model_path: str | None = None, use_onnx: bool = True) -> None:
         if model_path:
             model = torch.jit.load(model_path)
             model.eval()
@@ -43,12 +43,12 @@ class VAD(agents.vad.VAD):
     def stream(
         self,
         *,
-        min_speaking_duration: float = 0.5,
-        min_silence_duration: float = 0.5,
+        min_speaking_duration: float = 0.2,
+        min_silence_duration: float = 0.8,
         padding_duration: float = 0.1,
         sample_rate: int = 16000,
         max_buffered_speech: float = 45.0,
-        threshold: float = 0.5,
+        threshold: float = 0.2,
     ) -> "VADStream":
         return VADStream(
             self._model,
@@ -93,6 +93,7 @@ class VADStream(agents.vad.VADStream):
         self._waiting_start = False
         self._waiting_end = False
         self._current_sample = 0
+        self._filter = agents.utils.ExpFilter(0.8)
         self._min_speaking_samples = min_speaking_duration * sample_rate
         self._min_silence_samples = min_silence_duration * sample_rate
         self._padding_duration_samples = padding_duration * sample_rate
@@ -103,12 +104,6 @@ class VADStream(agents.vad.VADStream):
         self._buffered_frames: List[rtc.AudioFrame] = []
         self._main_task = asyncio.create_task(self._run())
-        def log_exception(task: asyncio.Task) -> None:
-            if not task.cancelled() and task.exception():
-                logging.error(f"silero vad task failed: {task.exception()}")
-        self._main_task.add_done_callback(log_exception)
     def push_frame(self, frame: rtc.AudioFrame) -> None:
         if self._closed:
             raise ValueError("cannot push frame to closed stream")
@@ -151,6 +146,9 @@ class VADStream(agents.vad.VADStream):
                         break
                     await asyncio.shield(self._run_inference())
+        except Exception:
+            logger.exception("silero stream failed")
         finally:
             self._event_queue.put_nowait(None)
@@ -169,13 +167,33 @@ class VADStream(agents.vad.VADStream):
         tensor = tensor.to(torch.float32) / 32768.0
         # run inference
-        speech_prob = await asyncio.to_thread(
+        start_time = time.time()
+        raw_prob = await asyncio.to_thread(
             lambda: self._model(tensor, self._sample_rate).item()
         )
-        self._dispatch_event(speech_prob, original_frames)
+        probability = self._filter.apply(1.0, raw_prob)
+        inference_duration = time.time() - start_time
+        # inference done
+        event = agents.vad.VADEvent(
+            type=agents.vad.VADEventType.INFERENCE_DONE,
+            samples_index=self._current_sample,
+            probability=probability,
+            raw_inference_prob=raw_prob,
+            inference_duration=inference_duration,
+        )
+        self._event_queue.put_nowait(event)
+        self._dispatch_event(original_frames, probability, raw_prob, inference_duration)
         self._current_sample += merged_frame.samples_per_channel
-    def _dispatch_event(self, speech_prob: int, original_frames: List[rtc.AudioFrame]):
+    def _dispatch_event(
+        self,
+        original_frames: List[rtc.AudioFrame],
+        probability: float,
+        raw_inference_prob: float,
+        inference_duration: float,
+    ):
         """
         Dispatches a VAD event based on the speech probability and the options
         Args:
@@ -203,15 +221,11 @@ class VADStream(agents.vad.VADStream):
             int(self._min_speaking_samples // samples_10ms),
         )
         if len(self._buffered_frames) > max_buffer_len:
-            # if unaware of this, may be hard to debug, so logging seems ok here
-            logging.warning(
-                f"VAD buffer overflow, dropping {len(self._buffered_frames) - max_buffer_len} frames"
-            )
             self._buffered_frames = self._buffered_frames[
                 len(self._buffered_frames) - max_buffer_len :
             ]
-        if speech_prob >= self._threshold:
+        if probability >= self._threshold:
             # speaking, wait for min_speaking_duration to trigger START_OF_SPEECH
             self._waiting_end = False
             if not self._waiting_start and not self._speaking:
@@ -223,34 +237,31 @@ class VADStream(agents.vad.VADStream):
             ):
                 self._waiting_start = False
                 self._speaking = True
-                event = agents.vad.VADEvent(
-                    type=agents.vad.VADEventType.START_OF_SPEECH,
-                    samples_index=self._start_speech,
-                )
-                self._event_queue.put_nowait(event)
                 # since we're waiting for the min_spaking_duration to trigger START_OF_SPEECH,
-                # the SPEAKING data is missing the first few frames, trigger it here
-                # TODO(theomonnom): Maybe it is better to put the data inside the START_OF_SPEECH event?
+                # put the speech that were used to trigger the start here
                 event = agents.vad.VADEvent(
-                    type=agents.vad.VADEventType.SPEAKING,
+                    type=agents.vad.VADEventType.START_OF_SPEECH,
                     samples_index=self._start_speech,
-                    speech=self._buffered_frames[padding_count:],
+                    frames=self._buffered_frames[padding_count:],
+                    speaking=True,
                 )
+                self._event_queue.put_nowait(event)
-                return
-        if self._speaking:
-            # we don't check the speech_prob here
-            event = agents.vad.VADEvent(
-                type=agents.vad.VADEventType.SPEAKING,
-                samples_index=self._current_sample,
-                speech=original_frames,
-            )
-            self._event_queue.put_nowait(event)
+        # we don't check the speech_prob here
+        event = agents.vad.VADEvent(
+            type=agents.vad.VADEventType.INFERENCE_DONE,
+            samples_index=self._current_sample,
+            frames=original_frames,
+            probability=probability,
+            raw_inference_prob=raw_inference_prob,
+            inference_duration=inference_duration,
+            speaking=self._speaking,
+        )
+        self._event_queue.put_nowait(event)
-        if speech_prob < self._threshold:
-            # stopped speaking, wait for min_silence_duration to trigger END_OF_SPEECH,
+        if probability < self._threshold:
+            # stopped speaking, s for min_silence_duration to trigger END_OF_SPEECH,
             self._waiting_start = False
             if not self._waiting_end and self._speaking:
                 self._waiting_end = True
@@ -265,9 +276,10 @@ class VADStream(agents.vad.VADStream):
                 event = agents.vad.VADEvent(
                     type=agents.vad.VADEventType.END_OF_SPEECH,
                     samples_index=self._end_speech,
-                    duration=(self._current_sample - self._start_speech)
+                    duration=(self._end_speech - self._start_speech)
                     / self._sample_rate,
-                    speech=self._buffered_frames,
+                    frames=self._buffered_frames,
+                    speaking=False,
                 )
                 self._event_queue.put_nowait(event)

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/version.py RENAMED Viewed

@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "0.3.dev0"
+__version__ = "0.4.dev0"

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: livekit-plugins-silero
-Version: 0.3.dev0
+Version: 0.4.dev0
 Summary: Agent Framework Plugin for Silero
 Home-page: https://github.com/livekit/agents
 License: Apache-2.0
@@ -20,8 +20,8 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3 :: Only
 Requires-Python: >=3.8.0
 Description-Content-Type: text/markdown
-Requires-Dist: livekit~=0.9
-Requires-Dist: livekit-agents~=0.5.dev0
+Requires-Dist: livekit~=0.11
+Requires-Dist: livekit-agents~=0.6.dev0
 Requires-Dist: torch<3,>=2
 Requires-Dist: torchaudio>=2
 Requires-Dist: numpy<2,>=1

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/SOURCES.txt RENAMED Viewed

@@ -2,6 +2,7 @@ README.md
 pyproject.toml
 setup.py
 livekit/plugins/silero/__init__.py
+livekit/plugins/silero/log.py
 livekit/plugins/silero/py.typed
 livekit/plugins/silero/vad.py
 livekit/plugins/silero/version.py

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/requires.txt RENAMED Viewed

@@ -1,5 +1,5 @@
-livekit~=0.9
-livekit-agents~=0.5.dev0
+livekit~=0.11
+livekit-agents~=0.6.dev0
 torch<3,>=2
 torchaudio>=2
 numpy<2,>=1

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/setup.py RENAMED Viewed

@@ -49,8 +49,8 @@ setuptools.setup(
     packages=setuptools.find_namespace_packages(include=["livekit.*"]),
     python_requires=">=3.8.0",
     install_requires=[
-        "livekit ~= 0.9",
-        "livekit-agents~=0.5.dev0",
+        "livekit ~= 0.11",
+        "livekit-agents~=0.6.dev0",
         "torch >= 2, < 3",
         "torchaudio >= 2",
         "numpy >= 1, < 2",

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/README.md RENAMED Viewed

File without changes

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit/plugins/silero/py.typed RENAMED Viewed

File without changes

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/livekit_plugins_silero.egg-info/top_level.txt RENAMED Viewed

File without changes

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/pyproject.toml RENAMED Viewed

File without changes

{livekit-plugins-silero-0.3.dev0 → livekit_plugins_silero-0.4.dev0}/setup.cfg RENAMED Viewed

File without changes

livekit-plugins-silero 0.3.dev0__tar.gz → 0.4.dev0__tar.gz

livekit-plugins-silero 0.3.dev0__tar.gz → 0.4.dev0__tar.gz