npm - @jjhbw/silero-vad - Versions diffs - 1.0.2 → 1.0.3 - Mend

@jjhbw/silero-vad 1.0.2 → 1.0.3

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (4)

package/README.md CHANGED Viewed

@@ -1,4 +1,4 @@
-# Silero VAD Node Fork
+# Silero VAD Node
 Minimal Node.js wrapper around the Silero VAD ONNX model, with a small CLI and parity tests against the Python implementation. The Node implementation runs VAD and silence stripping directly from ffmpeg streams to keep memory usage low on long files.
@@ -27,9 +27,7 @@ const {
     const inputs = ["input.wav", "other.mp3"];
     for (const inputPath of inputs) {
       vad.resetStates(); // per file/stream
-      const ts = await getSpeechTimestamps(inputPath, vad, {
-        returnSeconds: true,
-      });
+      const ts = await getSpeechTimestamps(inputPath, vad);
       // Each entry includes both seconds (start/end) and samples (startSample/endSample).
       console.log(inputPath, ts);
       // Example return value:
@@ -73,7 +71,6 @@ Options:
 - `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
 - `--time-resolution <n>`: decimal places for seconds output (default `3`).
 - `--neg-threshold <float>`: override the negative threshold (default `max(threshold - 0.15, 0.01)`).
-- `--seconds`: output timestamps in seconds (default on).
 - `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
 - `--strip-silence`: write a new WAV file with silences removed.
 - `--output-dir <path>`: output directory for strip-silence files (default: input dir).

package/cli.js CHANGED Viewed

@@ -3,6 +3,7 @@
 const fs = require('fs');
 const fsp = fs.promises;
 const path = require('path');
+const { spawn } = require('child_process');
 const {
   loadSileroVad,
   getSpeechTimestamps,
@@ -33,15 +34,13 @@ async function main() {
         // reuse session, reset stream state per file
         vad.resetStates();
         const t0 = performance.now();
-        const { timestamps, totalSamples } = await getSpeechTimestamps(audioPath, vad, {
+        const timestamps = await getSpeechTimestamps(audioPath, vad, {
           threshold: args.threshold,
           minSpeechDurationMs: args.minSpeechDurationMs,
           minSilenceDurationMs: args.minSilenceDurationMs,
           speechPadMs: args.speechPadMs,
-          returnSeconds: args.seconds,
           timeResolution: args.timeResolution,
           negThreshold: args.negThreshold,
-          returnMetadata: true,
         });
         const t1 = performance.now();
         results.push({ file: audioPath, timestamps });
@@ -49,10 +48,10 @@ async function main() {
         const mem = process.memoryUsage();
         const speechSeconds = getSpeechDurationSeconds(
           timestamps,
-          args.seconds,
+          true,
           effectiveSampleRate,
         );
-        const durationSeconds = totalSamples / effectiveSampleRate;
+        const durationSeconds = await getAudioDurationSeconds(audioPath);
         const silenceSeconds = Math.max(0, durationSeconds - speechSeconds);
         const totalForPct = durationSeconds > 0 ? durationSeconds : 1;
         const speechPct = (speechSeconds / totalForPct) * 100;
@@ -99,9 +98,9 @@ async function main() {
         }
         if (args.stripSilence) {
-          const segmentsSeconds = timestamps.map(({ start, end, startSeconds, endSeconds }) => ({
-            start: args.seconds ? start : startSeconds,
-            end: args.seconds ? end : endSeconds,
+          const segmentsSeconds = timestamps.map(({ start, end }) => ({
+            start,
+            end,
           }));
           if (!segmentsSeconds.length) {
             console.info(`strip_silence=skipped (no speech detected)`);
@@ -164,7 +163,6 @@ function parseArgs(argv) {
     speechPadMs: 30,
     timeResolution: 3,
     negThreshold: null,
-    seconds: true,
     charsPerSecond: 4,
     showTimeline: false,
     stripSilence: false,
@@ -220,8 +218,6 @@ function parseArgs(argv) {
         out.negThreshold = value;
       }
       i += 1;
-    } else if (arg === '--seconds') {
-      out.seconds = true;
     } else if (arg === '--cps') {
       const value = parseFloat(argv[i + 1]);
       out.showTimeline = true;
@@ -254,7 +250,6 @@ Options:
   --speech-pad-ms <ms>   Padding added to speech segments in ms (default: 30)
   --time-resolution <n>  Decimal places for seconds output (default: 3)
   --neg-threshold <f>    Negative threshold override (default: max(threshold - 0.15, 0.01))
-  --seconds              Output timestamps in seconds (default: on)
   --cps <float>          Enable timeline visualization; chars per second (default: 4)
   --strip-silence         Write a new file with all silences removed
   --output-dir <path>     Output directory for strip-silence files (default: input dir)
@@ -319,6 +314,44 @@ function getStripOutputPath(inputPath, outputDir) {
   return path.join(dir, `${base}_speech.wav`);
 }
+async function getAudioDurationSeconds(inputPath) {
+  const args = [
+    '-v',
+    'error',
+    // Use packet timestamps to handle containers where format duration is unreliable.
+    '-select_streams',
+    'a:0',
+    '-show_entries',
+    'packet=pts_time',
+    '-of',
+    'csv=p=0',
+    inputPath,
+  ];
+  return new Promise((resolve, reject) => {
+    const ffprobe = spawn('ffprobe', args, { stdio: ['ignore', 'pipe', 'inherit'] });
+    let output = '';
+    ffprobe.stdout.on('data', (chunk) => {
+      output += chunk.toString();
+    });
+    ffprobe.on('error', reject);
+    ffprobe.on('close', (code) => {
+      if (code !== 0) {
+        reject(new Error(`ffprobe exited with code ${code}`));
+        return;
+      }
+      const lines = output.trim().split('\n');
+      for (let i = lines.length - 1; i >= 0; i -= 1) {
+        const value = parseFloat(lines[i]);
+        if (Number.isFinite(value)) {
+          resolve(value);
+          return;
+        }
+      }
+      reject(new Error('Unable to read audio duration from ffprobe output'));
+    });
+  });
+}
 async function ensureUniquePath(outputPath) {
   try {
     await fsp.access(outputPath);

package/lib.js CHANGED Viewed

@@ -1,5 +1,3 @@
-const fs = require('fs');
-const fsp = fs.promises;
 const path = require('path');
 const { spawn } = require('child_process');
 const ort = require('onnxruntime-node');
@@ -120,7 +118,6 @@ async function loadSileroVad(model = 'default', opts = {}) {
  *   Example: a 50 ms pause will not split a segment at 100 ms.
  * @param {number} [options.speechPadMs=30] Pad each segment on both sides, clamped
  *   to neighbors. Example: [1.000, 2.000] -> ~[0.970, 2.030].
- * @param {boolean} [options.returnSeconds=false]
  * @param {number} [options.timeResolution=3] Decimal places for seconds output.
  *   Example: timeResolution=1 turns 1.23456 into 1.2.
  * @param {number} [options.negThreshold=threshold-0.15] End speech when prob dips
@@ -128,7 +125,8 @@ async function loadSileroVad(model = 'default', opts = {}) {
  *   negThreshold=0.35 keeps speech open during brief 0.4 dips.
  *   Default clamps to >= 0.01 to avoid an always-on end condition.
  * @param {number} [options.sampleRate]
- * @param {boolean} [options.returnMetadata=false]
+ * @returns {Promise<Array<{start: number, end: number, startSample: number, endSample: number}>>}
+ *   start/end are seconds; startSample/endSample are sample indices.
  */
 async function getSpeechTimestamps(
   inputPath,
@@ -138,11 +136,9 @@ async function getSpeechTimestamps(
     minSpeechDurationMs = 250,
     minSilenceDurationMs = 100,
     speechPadMs = 30,
-    returnSeconds = false,
     timeResolution = 3,
     negThreshold,
     sampleRate,
-    returnMetadata = false,
   } = {},
 ) {
   if (!vad) {
@@ -351,23 +347,12 @@ async function getSpeechTimestamps(
   }
   const convertSeconds = (samples) => +(samples / sr).toFixed(timeResolution);
-  const result = returnSeconds
-    ? speeches.map(({ start, end }) => ({
-      start: convertSeconds(start),
-      end: convertSeconds(end),
-      startSample: start,
-      endSample: end,
-    }))
-    : speeches.map(({ start, end }) => ({
-      start,
-      end,
-      startSeconds: convertSeconds(start),
-      endSeconds: convertSeconds(end),
-    }));
-  if (returnMetadata) {
-    return { timestamps: result, totalSamples };
-  }
+  const result = speeches.map(({ start, end }) => ({
+    start: convertSeconds(start),
+    end: convertSeconds(end),
+    startSample: start,
+    endSample: end,
+  }));
   return result;
 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@jjhbw/silero-vad",
-  "version": "1.0.2",
+  "version": "1.0.3",
   "description": "Node.js bindings for Silero VAD",
   "main": "index.js",
   "exports": {
@@ -17,7 +17,7 @@
   "license": "MIT",
   "repository": {
     "type": "git",
-    "url": "https://github.com/jjhbw/silero-vad.git"
+    "url": "git+https://github.com/jjhbw/silero-vad.git"
   },
   "bugs": {
     "url": "https://github.com/jjhbw/silero-vad/issues"