npm - @jjhbw/silero-vad - Versions diffs - 1.0.0 → 1.0.2 - Mend

@jjhbw/silero-vad 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/README.md CHANGED Viewed

@@ -10,28 +10,6 @@ npm install @jjhbw/silero-vad
 Requires Node 18+ and `ffmpeg` available on `PATH` for decoding arbitrary audio formats.
-## CLI
-```bash
-npx silero-vad-cli --audio input.wav --audio other.mp3 [options]
-```
-Options:
-- `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
-- `--threshold <float>`: speech probability threshold (default `0.5`).
-- `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
-- `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
-- `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
-- `--time-resolution <n>`: decimal places for seconds output (default `3`).
-- `--neg-threshold <float>`: override the negative threshold (default `threshold - 0.15`).
-- `--seconds`: output timestamps in seconds (default on).
-- `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
-- `--strip-silence`: write a new WAV file with silences removed.
-- `--output-dir <path>`: output directory for strip-silence files (default: input dir).
-Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
-The sample rate is defined by the selected model (read from `vad.sampleRate`).
 ## Library usage
 ```js
@@ -39,17 +17,19 @@ const {
   loadSileroVad,
   getSpeechTimestamps,
   writeStrippedAudio,
-  WEIGHTS
-} = require('@jjhbw/silero-vad');
+  WEIGHTS,
+} = require("@jjhbw/silero-vad");
 (async () => {
-  const vad = await loadSileroVad('default'); // or WEIGHTS keys/custom path
+  const vad = await loadSileroVad("default"); // or WEIGHTS keys/custom path
   try {
-    if (!vad.sampleRate) throw new Error('Model sample rate is undefined');
-    const inputs = ['input.wav', 'other.mp3'];
+    if (!vad.sampleRate) throw new Error("Model sample rate is undefined");
+    const inputs = ["input.wav", "other.mp3"];
     for (const inputPath of inputs) {
       vad.resetStates(); // per file/stream
-      const ts = await getSpeechTimestamps(inputPath, vad, { returnSeconds: true });
+      const ts = await getSpeechTimestamps(inputPath, vad, {
+        returnSeconds: true,
+      });
       // Each entry includes both seconds (start/end) and samples (startSample/endSample).
       console.log(inputPath, ts);
       // Example return value:
@@ -63,7 +43,7 @@ const {
       // Note: encoding speed varies by container/codec; uncompressed PCM (e.g., .wav) is fastest,
       // lossless compression (e.g., .flac) is slower, and lossy codecs (e.g., .mp3/.aac/.opus)
       // are typically the slowest to encode.
-      const outPath = inputPath.replace(/\.[^.]+$/, '.stripped.wav');
+      const outPath = inputPath.replace(/\.[^.]+$/, ".stripped.wav");
       await writeStrippedAudio(inputPath, ts, vad.sampleRate, outPath);
     }
   } finally {
@@ -73,10 +53,34 @@ const {
 ```
 Guidelines:
 - Load once, reuse: keep one `SileroVad` per concurrent worker.
 - Call `resetStates()` before each new file/stream; the session and weights stay in memory.
 - Call `release()` when shutting down.
+## CLI
+```bash
+npx @jjhbw/silero-vad --audio input.wav --audio other.mp3 [options]
+```
+Options:
+- `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
+- `--threshold <float>`: speech probability threshold (default `0.5`).
+- `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
+- `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
+- `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
+- `--time-resolution <n>`: decimal places for seconds output (default `3`).
+- `--neg-threshold <float>`: override the negative threshold (default `max(threshold - 0.15, 0.01)`).
+- `--seconds`: output timestamps in seconds (default on).
+- `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
+- `--strip-silence`: write a new WAV file with silences removed.
+- `--output-dir <path>`: output directory for strip-silence files (default: input dir).
+Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
+The sample rate is defined by the selected model (read from `vad.sampleRate`).
 ## Development
 Clone the repo to run benchmarks and tests locally.
@@ -103,4 +107,4 @@ npm install
 npm test
 ```
-Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.***
+Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.

package/cli.js CHANGED Viewed

@@ -253,7 +253,7 @@ Options:
   --min-silence-ms <ms>  Minimum silence duration in ms (default: 100)
   --speech-pad-ms <ms>   Padding added to speech segments in ms (default: 30)
   --time-resolution <n>  Decimal places for seconds output (default: 3)
-  --neg-threshold <f>    Negative threshold override (default: threshold - 0.15)
+  --neg-threshold <f>    Negative threshold override (default: max(threshold - 0.15, 0.01))
   --seconds              Output timestamps in seconds (default: on)
   --cps <float>          Enable timeline visualization; chars per second (default: 4)
   --strip-silence         Write a new file with all silences removed

package/lib.js CHANGED Viewed

@@ -4,6 +4,10 @@ const path = require('path');
 const { spawn } = require('child_process');
 const ort = require('onnxruntime-node');
+/**
+ * Bundled model spec map keyed by CLI/library names.
+ * @type {Record<string, {path: string, sampleRate: number}>}
+ */
 const WEIGHTS = {
   default: { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
   '16k': { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
@@ -82,6 +86,13 @@ class SileroVad {
   }
 }
+/**
+ * Load a Silero VAD ONNX model and return a ready-to-run VAD instance.
+ * @param {string} [model='default'] Bundled model key or custom ONNX path.
+ * @param {Object} [opts]
+ * @param {Object} [opts.sessionOptions] onnxruntime-node session options override.
+ * @returns {Promise<SileroVad>}
+ */
 async function loadSileroVad(model = 'default', opts = {}) {
   const spec = WEIGHTS[model];
   const modelPath = spec ? spec.path : model || WEIGHTS.default.path;
@@ -95,6 +106,30 @@ async function loadSileroVad(model = 'default', opts = {}) {
   return vad;
 }
+/**
+ * Run VAD on an audio file and return speech segments.
+ * @param {string} inputPath
+ * @param {SileroVad} vad
+ * @param {Object} [options]
+ * @param {number} [options.threshold=0.5] Start speech when prob >= threshold.
+ *   Example: if probs hover at 0.45-0.6, threshold=0.6 will miss soft speech.
+ * @param {number} [options.minSpeechDurationMs=250] Drop segments shorter than this.
+ *   Example: a 120 ms burst above threshold is discarded at 250 ms.
+ * @param {number} [options.minSilenceDurationMs=100] End speech only after the
+ *   speech probability stays below negThreshold for this long.
+ *   Example: a 50 ms pause will not split a segment at 100 ms.
+ * @param {number} [options.speechPadMs=30] Pad each segment on both sides, clamped
+ *   to neighbors. Example: [1.000, 2.000] -> ~[0.970, 2.030].
+ * @param {boolean} [options.returnSeconds=false]
+ * @param {number} [options.timeResolution=3] Decimal places for seconds output.
+ *   Example: timeResolution=1 turns 1.23456 into 1.2.
+ * @param {number} [options.negThreshold=max(threshold-0.15, 0.01)] End speech when prob dips
+ *   below this; provides hysteresis vs threshold. Example: threshold=0.5,
+ *   negThreshold=0.35 keeps speech open during brief 0.4 dips.
+ *   Default clamps to >= 0.01 to avoid an always-on end condition.
+ * @param {number} [options.sampleRate]
+ * @param {boolean} [options.returnMetadata=false]
+ */
 async function getSpeechTimestamps(
   inputPath,
   vad,
@@ -104,7 +139,7 @@ async function getSpeechTimestamps(
     minSilenceDurationMs = 100,
     speechPadMs = 30,
     returnSeconds = false,
-    timeResolution = 1,
+    timeResolution = 3,
     negThreshold,
     sampleRate,
     returnMetadata = false,
@@ -178,7 +213,7 @@ async function getSpeechTimestamps(
       }
       currentSpeech.end = tempEnd;
-      if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
+      if (currentSpeech.end - currentSpeech.start >= minSpeechSamples) {
         speeches.push(currentSpeech);
       }
       currentSpeech = {};
@@ -191,7 +226,7 @@ async function getSpeechTimestamps(
     for await (const chunk of ffmpeg.stdout) {
       let data = chunk;
       if (leftoverBytes.length) {
-        const combined = Buffer.allocUnsafe(leftoverBytes.length + chunk.length);
+        const combined = Buffer.alloc(leftoverBytes.length + chunk.length);
         leftoverBytes.copy(combined, 0);
         chunk.copy(combined, leftoverBytes.length);
         data = combined;
@@ -300,7 +335,7 @@ async function getSpeechTimestamps(
   if (currentSpeech.start !== undefined) {
     currentSpeech.end = totalSamples;
-    if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
+    if (currentSpeech.end - currentSpeech.start >= minSpeechSamples) {
       speeches.push(currentSpeech);
     }
   }
@@ -337,6 +372,15 @@ async function getSpeechTimestamps(
   return result;
 }
+/**
+ * Write a new audio file containing only the provided speech segments.
+ * Uses ffmpeg; encoding is inferred from outputPath extension/container.
+ * @param {string} inputPath
+ * @param {Array<{start: number, end: number}>} segmentsSeconds Seconds-based ranges.
+ * @param {number} sampleRate Output sample rate (required by ffmpeg).
+ * @param {string} outputPath
+ * @returns {Promise<void>}
+ */
 async function writeStrippedAudio(inputPath, segmentsSeconds, sampleRate, outputPath) {
   if (!segmentsSeconds || !segmentsSeconds.length) {
     throw new Error('No valid speech segments to write');

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@jjhbw/silero-vad",
-  "version": "1.0.0",
+  "version": "1.0.2",
   "description": "Node.js bindings for Silero VAD",
   "main": "index.js",
   "exports": {