@jjhbw/silero-vad 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2020-present Silero Team
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,106 @@
+ # Silero VAD Node Fork
+
+ Minimal Node.js wrapper around the Silero VAD ONNX model, with a small CLI and parity tests against the Python implementation. The Node implementation runs VAD and silence stripping directly from ffmpeg streams to keep memory usage low on long files.
+
+ ## Install
+
+ ```bash
+ npm install @jjhbw/silero-vad
+ ```
+
+ Requires Node 18+ and `ffmpeg` available on `PATH` for decoding arbitrary audio formats.
+
+ ## CLI
+
+ ```bash
+ npx silero-vad-cli --audio input.wav --audio other.mp3 [options]
+ ```
+
+ Options:
+ - `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
+ - `--threshold <float>`: speech probability threshold (default `0.5`).
+ - `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
+ - `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
+ - `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
+ - `--time-resolution <n>`: decimal places for seconds output (default `3`).
+ - `--neg-threshold <float>`: override the negative threshold (default `threshold - 0.15`, floored at `0.01`).
+ - `--seconds`: output timestamps in seconds (default on).
+ - `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
+ - `--strip-silence`: write a new WAV file with silences removed.
+ - `--output-dir <path>`: output directory for strip-silence files (default: input dir).
+
+ Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
+ The sample rate is defined by the selected model (read from `vad.sampleRate`).
+
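+ For example, a stricter run with silence stripping (paths and all numbers are illustrative, not real output):
+
+ ```bash
+ npx silero-vad-cli --audio input.wav --threshold 0.6 --strip-silence
+ # Per-file diagnostics are logged first, then the JSON result, e.g.:
+ # [
+ #   {
+ #     "file": "input.wav",
+ #     "timestamps": [
+ #       { "start": 0.36, "end": 1.92, "startSample": 5760, "endSample": 30720 }
+ #     ]
+ #   }
+ # ]
+ ```
+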
+ ## Library usage
+
+ ```js
+ const {
+   loadSileroVad,
+   getSpeechTimestamps,
+   writeStrippedAudio,
+   WEIGHTS
+ } = require('@jjhbw/silero-vad');
+
+ (async () => {
+   const vad = await loadSileroVad('default'); // or WEIGHTS keys/custom path
+   try {
+     if (!vad.sampleRate) throw new Error('Model sample rate is undefined');
+     const inputs = ['input.wav', 'other.mp3'];
+     for (const inputPath of inputs) {
+       vad.resetStates(); // per file/stream
+       const ts = await getSpeechTimestamps(inputPath, vad, { returnSeconds: true });
+       // Each entry includes both seconds (start/end) and samples (startSample/endSample).
+       console.log(inputPath, ts);
+       // Example return value:
+       // [
+       //   { start: 0.36, end: 1.92, startSample: 5760, endSample: 30720 },
+       //   { start: 2.41, end: 3.05, startSample: 38560, endSample: 48800 }
+       // ]
+
+       // Strip silences from the original file using the timestamps.
+       // Pick any extension supported by ffmpeg (e.g., .wav, .flac).
+       // Note: encoding speed varies by container/codec; uncompressed PCM (e.g., .wav) is fastest,
+       // lossless compression (e.g., .flac) is slower, and lossy codecs (e.g., .mp3/.aac/.opus)
+       // are typically the slowest to encode.
+       const outPath = inputPath.replace(/\.[^.]+$/, '.stripped.wav');
+       await writeStrippedAudio(inputPath, ts, vad.sampleRate, outPath);
+     }
+   } finally {
+     await vad.session.release?.(); // once per process when shutting down
+   }
+ })();
+ ```
+
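+ When you pass a custom ONNX path instead of a bundled key, `vad.sampleRate` is `null`, so supply `sampleRate` yourself (8000 or 16000). A minimal sketch, with a hypothetical model path:
+
+ ```js
+ const vad = await loadSileroVad('/path/to/custom_vad.onnx', {
+   // Optional: session options are merged over the library defaults.
+   sessionOptions: { intraOpNumThreads: 2 },
+ });
+ const ts = await getSpeechTimestamps('input.wav', vad, {
+   sampleRate: 16000, // required for custom models
+   returnSeconds: true,
+ });
+ ```
+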
+ Guidelines:
+ - Load once, reuse: keep one `SileroVad` per concurrent worker (see the sketch below).
+ - Call `resetStates()` before each new file/stream; the session and weights stay in memory.
+ - Call `release()` when shutting down.
+
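+ A minimal sketch of that pattern for a long-lived process (the worker wrapper is illustrative, not part of the API):
+
+ ```js
+ // Stream state (RNN state, context) lives on the SileroVad instance, so one
+ // instance must not process two files concurrently; give each worker its own.
+ async function makeWorker() {
+   const vad = await loadSileroVad('default'); // load once per worker
+   return {
+     analyze(file) {
+       vad.resetStates(); // fresh stream state per file
+       return getSpeechTimestamps(file, vad, { returnSeconds: true });
+     },
+     close() {
+       return vad.session.release?.(); // on shutdown
+     },
+   };
+ }
+ ```
+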
+ ## Development
+
+ Clone the repo to run benchmarks and tests locally.
+
+ ### Benchmark
+
+ ```bash
+ git clone https://github.com/jjhbw/silero-vad
+ cd silero-vad/js-fork
+ npm install
+ node bench.js --audio data/test.mp3 --runs 5
+ ```
+
+ The benchmark reports timings per file for streaming VAD and silence stripping. Stripped-audio files are written to a temporary directory and removed after each run.
+
+ ### Tests
+
+ Snapshot tests compare Node outputs against Python ground truth (`tests/snapshots/onnx.json`):
+
+ ```bash
+ git clone https://github.com/jjhbw/silero-vad
+ cd silero-vad/js-fork
+ npm install
+ npm test
+ ```
+
+ Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.
package/cli.js ADDED
@@ -0,0 +1,339 @@
+ #!/usr/bin/env node
+
+ const fs = require('fs');
+ const fsp = fs.promises;
+ const path = require('path');
+ const {
+   loadSileroVad,
+   getSpeechTimestamps,
+   writeStrippedAudio,
+   WEIGHTS,
+ } = require('./lib');
+
+ const toMB = (b) => (b / (1024 * 1024)).toFixed(2);
+
+ async function main() {
+   try {
+     const args = parseArgs(process.argv.slice(2));
+     if (!args.audio.length) {
+       printUsage();
+       process.exit(1);
+     }
+
+     const modelSpecifier = args.model || 'default';
+     const vad = await loadSileroVad(modelSpecifier);
+     if (!vad.sampleRate) {
+       throw new Error('No sample rate available for selected model. Please use a bundled model key.');
+     }
+     const effectiveSampleRate = vad.sampleRate;
+
+     try {
+       const results = [];
+       for (const audioPath of args.audio) {
+         // reuse session, reset stream state per file
+         vad.resetStates();
+         const t0 = performance.now();
+         const { timestamps, totalSamples } = await getSpeechTimestamps(audioPath, vad, {
+           threshold: args.threshold,
+           minSpeechDurationMs: args.minSpeechDurationMs,
+           minSilenceDurationMs: args.minSilenceDurationMs,
+           speechPadMs: args.speechPadMs,
+           returnSeconds: args.seconds,
+           timeResolution: args.timeResolution,
+           negThreshold: args.negThreshold,
+           returnMetadata: true,
+         });
+         const t1 = performance.now();
+         results.push({ file: audioPath, timestamps });
+
+         const mem = process.memoryUsage();
+         const speechSeconds = getSpeechDurationSeconds(
+           timestamps,
+           args.seconds,
+           effectiveSampleRate,
+         );
+         const durationSeconds = totalSamples / effectiveSampleRate;
+         const silenceSeconds = Math.max(0, durationSeconds - speechSeconds);
+         const totalForPct = durationSeconds > 0 ? durationSeconds : 1;
+         const speechPct = (speechSeconds / totalForPct) * 100;
+         const silencePct = (silenceSeconds / totalForPct) * 100;
+         console.info(
+           [
+             `file=${audioPath}`,
+             `duration=${formatDuration(durationSeconds)}`,
+           ].join(' '),
+         );
+         console.info(
+           [
+             `speech=${speechSeconds.toFixed(2)}s (${speechPct.toFixed(1)}%)`,
+             `silence=${silenceSeconds.toFixed(2)}s (${silencePct.toFixed(1)}%)`,
+             `total=${durationSeconds.toFixed(2)}s`,
+           ].join(' '),
+         );
+         const totalMs = t1 - t0;
+         console.info(
+           `vad_took=${totalMs.toFixed(2)}ms`,
+         );
+         console.info(
+           [
+             `rss_mb=${toMB(mem.rss)}`,
+             `heapUsed_mb=${toMB(mem.heapUsed)}`,
+             `external_mb=${toMB(mem.external)}`,
+           ].join(' '),
+         );
+
+         if (args.showTimeline) {
+           const lines = renderTimelineLines(
+             timestamps,
+             durationSeconds,
+             args.charsPerSecond,
+             120,
+           );
+           const secondsPerChar = 1 / args.charsPerSecond;
+           console.info(
+             `legend: # speech . silence (1 char = ${secondsPerChar.toFixed(2)}s)`,
+           );
+           for (const line of lines) {
+             console.info(line);
+           }
+         }
+
+         if (args.stripSilence) {
+           const segmentsSeconds = timestamps.map(({ start, end, startSeconds, endSeconds }) => ({
+             start: args.seconds ? start : startSeconds,
+             end: args.seconds ? end : endSeconds,
+           }));
+           if (!segmentsSeconds.length) {
+             console.info(`strip_silence=skipped (no speech detected)`);
+           } else {
+             if (args.outputDir) {
+               await fsp.mkdir(args.outputDir, { recursive: true });
+             }
+             const outputPath = await ensureUniquePath(
+               getStripOutputPath(audioPath, args.outputDir),
+             );
+             const stripT0 = performance.now();
+             const memBefore = process.memoryUsage();
+             await writeStrippedAudio(
+               audioPath,
+               segmentsSeconds,
+               effectiveSampleRate,
+               outputPath,
+             );
+             const memAfter = process.memoryUsage();
+             const stripT1 = performance.now();
+             const strippedSeconds = segmentsSeconds.reduce(
+               (sum, seg) => sum + (seg.end - seg.start),
+               0,
+             );
+             console.info(
+               `strip_silence_output=${outputPath} duration=${strippedSeconds.toFixed(2)}s`,
+             );
+             console.info(`strip_silence_took=${(stripT1 - stripT0).toFixed(2)}ms`);
+             console.info(
+               [
+                 `strip_silence_mem_rss_delta_mb=${toMB(memAfter.rss - memBefore.rss)}`,
+                 `strip_silence_mem_heap_delta_mb=${toMB(memAfter.heapUsed - memBefore.heapUsed)}`,
+                 `strip_silence_mem_external_delta_mb=${toMB(memAfter.external - memBefore.external)}`,
+               ].join(' '),
+             );
+           }
+         }
+       }
+       // Emit the machine-readable result promised in the README.
+       console.log(JSON.stringify(results, null, 2));
+     } finally {
+       // Keep cleanup explicit so the pattern is clear for long-lived processes.
+       await vad.session.release?.();
+     }
+   } catch (err) {
+     console.error(err.message || err);
+     process.exit(1);
+   }
+ }
+
+ if (require.main === module) {
+   main();
+ }
+
+ function parseArgs(argv) {
+   const out = {
+     model: null,
+     audio: [],
+     threshold: 0.5,
+     minSpeechDurationMs: 250,
+     minSilenceDurationMs: 100,
+     speechPadMs: 30,
+     timeResolution: 3,
+     negThreshold: null,
+     seconds: true,
+     charsPerSecond: 4,
+     showTimeline: false,
+     stripSilence: false,
+     outputDir: null,
+   };
+
+   for (let i = 0; i < argv.length; i += 1) {
+     const arg = argv[i];
+     if (arg === '--model') {
+       out.model = argv[i + 1];
+       i += 1;
+     } else if (arg === '--audio') {
+       const value = argv[i + 1];
+       if (!value) {
+         throw new Error('Missing value for --audio');
+       }
+       out.audio.push(value);
+       i += 1;
+     } else if (arg === '--threshold') {
+       const value = parseFloat(argv[i + 1]);
+       if (!Number.isFinite(value)) {
+         throw new Error('Invalid value for --threshold');
+       }
+       out.threshold = value;
+       i += 1;
+     } else if (arg === '--min-speech-ms') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value) && value >= 0) {
+         out.minSpeechDurationMs = value;
+       }
+       i += 1;
+     } else if (arg === '--min-silence-ms') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value) && value >= 0) {
+         out.minSilenceDurationMs = value;
+       }
+       i += 1;
+     } else if (arg === '--speech-pad-ms') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value) && value >= 0) {
+         out.speechPadMs = value;
+       }
+       i += 1;
+     } else if (arg === '--time-resolution') {
+       const value = parseInt(argv[i + 1], 10);
+       if (Number.isFinite(value) && value >= 0) {
+         out.timeResolution = value;
+       }
+       i += 1;
+     } else if (arg === '--neg-threshold') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value)) {
+         out.negThreshold = value;
+       }
+       i += 1;
+     } else if (arg === '--seconds') {
+       out.seconds = true;
+     } else if (arg === '--cps') {
+       const value = parseFloat(argv[i + 1]);
+       out.showTimeline = true;
+       if (Number.isFinite(value) && value > 0) {
+         out.charsPerSecond = value;
+       }
+       i += 1;
+     } else if (arg === '--strip-silence') {
+       out.stripSilence = true;
+     } else if (arg === '--output-dir') {
+       out.outputDir = argv[i + 1];
+       i += 1;
+     } else if (arg === '--help' || arg === '-h') {
+       printUsage();
+       process.exit(0);
+     }
+   }
+
+   return out;
+ }
+
+ function printUsage() {
+   console.log(`Usage: silero-vad-cli --audio path/to/audio [options]
+
+ Options:
+   --model <key|path>     Model key (${Object.keys(WEIGHTS).join(', ')}) or custom path (default: default)
+   --threshold <float>    Speech probability threshold (default: 0.5)
+   --min-speech-ms <ms>   Minimum speech duration in ms (default: 250)
+   --min-silence-ms <ms>  Minimum silence duration in ms (default: 100)
+   --speech-pad-ms <ms>   Padding added to speech segments in ms (default: 30)
+   --time-resolution <n>  Decimal places for seconds output (default: 3)
+   --neg-threshold <f>    Negative threshold override (default: threshold - 0.15)
+   --seconds              Output timestamps in seconds (default: on)
+   --cps <float>          Enable timeline visualization; chars per second (default: 4)
+   --strip-silence        Write a new file with all silences removed
+   --output-dir <path>    Output directory for strip-silence files (default: input dir)
+   -h, --help             Show this message`);
+ }
+
+ function renderTimelineLines(timestamps, durationSeconds, charsPerSecond, maxLineWidth) {
+   if (!durationSeconds || durationSeconds <= 0 || charsPerSecond <= 0) {
+     return ['[no audio]'];
+   }
+
+   const width = Math.max(1, Math.ceil(durationSeconds * charsPerSecond));
+   const slots = new Array(width).fill('.');
+   for (const { start, end } of timestamps) {
+     const startIdx = Math.max(0, Math.floor((start / durationSeconds) * width));
+     const endIdx = Math.min(width, Math.ceil((end / durationSeconds) * width));
+     for (let i = startIdx; i < endIdx; i += 1) {
+       slots[i] = '#';
+     }
+   }
+
+   if (!maxLineWidth || maxLineWidth <= 0) {
+     return [`|${slots.join('')}|`];
+   }
+
+   const lines = [];
+   for (let i = 0; i < slots.length; i += maxLineWidth) {
+     lines.push(`|${slots.slice(i, i + maxLineWidth).join('')}|`);
+   }
+   return lines;
+ }
+
+ function getSpeechDurationSeconds(timestamps, timestampsInSeconds, sampleRate) {
+   if (!timestamps || !timestamps.length) {
+     return 0;
+   }
+   if (!timestampsInSeconds && !sampleRate) {
+     throw new Error('Need sampleRate');
+   }
+   if (timestampsInSeconds) {
+     return timestamps.reduce((sum, { start, end }) => sum + (end - start), 0);
+   }
+   return timestamps.reduce((sum, { start, end }) => sum + (end - start) / sampleRate, 0);
+ }
+
+ module.exports = {
+   getSpeechDurationSeconds,
+   parseArgs,
+ };
+
+ function formatDuration(seconds) {
+   const whole = Math.max(0, Math.round(seconds));
+   const mins = Math.floor(whole / 60);
+   const secs = String(whole % 60).padStart(2, '0');
+   return `${mins}:${secs}`;
+ }
+
+ function getStripOutputPath(inputPath, outputDir) {
+   const dir = outputDir || path.dirname(inputPath);
+   const ext = path.extname(inputPath);
+   const base = path.basename(inputPath, ext);
+   return path.join(dir, `${base}_speech.wav`);
+ }
+
+ async function ensureUniquePath(outputPath) {
+   try {
+     await fsp.access(outputPath);
+   } catch {
+     return outputPath;
+   }
+   const dir = path.dirname(outputPath);
+   const ext = path.extname(outputPath);
+   const base = path.basename(outputPath, ext);
+   for (let i = 1; ; i += 1) {
+     const candidate = path.join(dir, `${base}-${i}${ext}`);
+     try {
+       await fsp.access(candidate);
+     } catch {
+       return candidate;
+     }
+   }
+ }
package/index.js ADDED
@@ -0,0 +1 @@
+ module.exports = require('./lib');
package/lib.js ADDED
@@ -0,0 +1,385 @@
+ const fs = require('fs');
+ const fsp = fs.promises;
+ const path = require('path');
+ const { spawn } = require('child_process');
+ const ort = require('onnxruntime-node');
+
+ const WEIGHTS = {
+   default: { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
+   '16k': { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
+   '8k_16k': { path: path.join(__dirname, 'weights', 'silero_vad.onnx'), sampleRate: 16000 }, // decode to 16k by default
+   half: { path: path.join(__dirname, 'weights', 'silero_vad_half.onnx'), sampleRate: 16000 },
+   op18: { path: path.join(__dirname, 'weights', 'silero_vad_op18_ifless.onnx'), sampleRate: 16000 },
+ };
+
+ // Bench sweep on long-form audio showed this CPU-only config is the best general default.
+ const DEFAULT_SESSION_OPTIONS = {
+   intraOpNumThreads: 4,
+   interOpNumThreads: 1,
+   executionMode: 'sequential',
+   graphOptimizationLevel: 'all',
+   enableCpuMemArena: true,
+   enableMemPattern: true,
+ };
+
+ // Minimal get_speech_timestamps port that runs the Silero VAD ONNX model in Node.
+ class SileroVad {
+   constructor(session) {
+     this.session = session;
+     this.outputNames = session.outputNames;
+     this.resetStates();
+   }
+
+   resetStates() {
+     this.state = new Float32Array(2 * 1 * 128); // shape: [2, 1, 128]
+     this.context = null;
+     this.lastSr = null;
+     this.contextSize = null;
+     this.inputWithContext = null;
+     this.srTensor = null;
+   }
+
+   async processChunk(chunk, sampleRate) {
+     const sr = sampleRate;
+     const windowSize = sr === 16000 ? 512 : 256;
+     const contextSize = sr === 16000 ? 64 : 32;
+
+     if (chunk.length !== windowSize) {
+       throw new Error(`Expected chunk of ${windowSize} samples, got ${chunk.length}`);
+     }
+
+     // Reset state when sample rate changes.
+     if (this.lastSr && this.lastSr !== sr) {
+       this.resetStates();
+     }
+
+     if (!this.context || this.contextSize !== contextSize) {
+       this.contextSize = contextSize;
+       this.context = new Float32Array(contextSize); // zeros
+       this.inputWithContext = new Float32Array(contextSize + windowSize);
+       this.srTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(sr)]));
+     }
+
+     const inputWithContext = this.inputWithContext;
+     inputWithContext.set(this.context, 0);
+     inputWithContext.set(chunk, contextSize);
+
+     const feeds = {
+       input: new ort.Tensor('float32', inputWithContext, [1, inputWithContext.length]),
+       state: new ort.Tensor('float32', this.state, [2, 1, 128]),
+       sr: this.srTensor,
+     };
+
+     const results = await this.session.run(feeds);
+     const probTensor = results[this.outputNames[0]];
+     const newStateTensor = results[this.outputNames[1]];
+
+     this.state.set(newStateTensor.data);
+     this.context.set(inputWithContext.subarray(inputWithContext.length - contextSize));
+     this.lastSr = sr;
+
+     return probTensor.data[0];
+   }
+ }
+
+ async function loadSileroVad(model = 'default', opts = {}) {
+   const spec = WEIGHTS[model];
+   const modelPath = spec ? spec.path : model || WEIGHTS.default.path;
+   const sessionOptions = {
+     ...DEFAULT_SESSION_OPTIONS,
+     ...(opts.sessionOptions || {}),
+   };
+   const session = await ort.InferenceSession.create(modelPath, sessionOptions);
+   const vad = new SileroVad(session);
+   vad.sampleRate = spec ? spec.sampleRate : null;
+   return vad;
+ }
+
+ async function getSpeechTimestamps(
+   inputPath,
+   vad,
+   {
+     threshold = 0.5,
+     minSpeechDurationMs = 250,
+     minSilenceDurationMs = 100,
+     speechPadMs = 30,
+     returnSeconds = false,
+     timeResolution = 1,
+     negThreshold,
+     sampleRate,
+     returnMetadata = false,
+   } = {},
+ ) {
+   if (!vad) {
+     throw new Error('Pass a loaded SileroVad instance');
+   }
+
+   const sr = sampleRate || vad.sampleRate;
+   if (!sr) {
+     throw new Error('VAD sample rate is undefined. Use a bundled model key.');
+   }
+
+   if (sr !== 8000 && sr !== 16000) {
+     throw new Error('Supported sampling rates: 8000 or 16000.');
+   }
+
+   const windowSize = sr === 16000 ? 512 : 256;
+   const minSpeechSamples = (sr * minSpeechDurationMs) / 1000;
+   const minSilenceSamples = (sr * minSilenceDurationMs) / 1000;
+   const speechPadSamples = (sr * speechPadMs) / 1000;
+   const negThres = negThreshold ?? Math.max(threshold - 0.15, 0.01);
+
+   vad.resetStates();
+
+   let triggered = false;
+   let tempEnd = 0;
+   let currentSpeech = {};
+   const speeches = [];
+   let processedSamples = 0;
+   let totalSamples = 0;
+   let leftoverBytes = Buffer.alloc(0);
+   const frameScratch = new Float32Array(windowSize);
+   let pendingLen = 0;
+
+   const channels = 1;
+   const args = [
+     '-v',
+     'error',
+     '-i',
+     inputPath,
+     '-ac',
+     String(channels),
+     '-ar',
+     String(sr),
+     '-f',
+     'f32le',
+     'pipe:1',
+   ];
+   const ffmpeg = spawn('ffmpeg', args, { stdio: ['ignore', 'pipe', 'inherit'] });
+
+   const processFrame = async (frame, curSample) => {
+     const speechProb = await vad.processChunk(frame, sr);
+     if (speechProb >= threshold && tempEnd) {
+       tempEnd = 0;
+     }
+
+     if (speechProb >= threshold && !triggered) {
+       triggered = true;
+       currentSpeech.start = curSample;
+       return;
+     }
+
+     if (speechProb < negThres && triggered) {
+       if (!tempEnd) {
+         tempEnd = curSample;
+       }
+       if (curSample - tempEnd < minSilenceSamples) {
+         return;
+       }
+
+       currentSpeech.end = tempEnd;
+       if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
+         speeches.push(currentSpeech);
+       }
+       currentSpeech = {};
+       triggered = false;
+       tempEnd = 0;
+     }
+   };
+
+   const streamDone = (async () => {
+     for await (const chunk of ffmpeg.stdout) {
+       let data = chunk;
+       if (leftoverBytes.length) {
+         const combined = Buffer.allocUnsafe(leftoverBytes.length + chunk.length);
+         leftoverBytes.copy(combined, 0);
+         chunk.copy(combined, leftoverBytes.length);
+         data = combined;
+         leftoverBytes = Buffer.alloc(0);
+       }
+
+       const usableBytes = data.length - (data.length % 4);
+       if (usableBytes <= 0) {
+         leftoverBytes = data;
+         continue;
+       }
+
+       leftoverBytes = data.subarray(usableBytes);
+       const floatData = new Float32Array(
+         data.buffer,
+         data.byteOffset,
+         usableBytes / Float32Array.BYTES_PER_ELEMENT,
+       );
+       totalSamples += floatData.length;
+
+       let offset = 0;
+       if (pendingLen) {
+         const needed = windowSize - pendingLen;
+         if (floatData.length >= needed) {
+           frameScratch.set(frameScratch.subarray(0, pendingLen), 0);
+           frameScratch.set(floatData.subarray(0, needed), pendingLen);
+           const curSample = processedSamples;
+           processedSamples += windowSize;
+           await processFrame(frameScratch, curSample);
+           offset = needed;
+           pendingLen = 0;
+         } else {
+           frameScratch.set(floatData, pendingLen);
+           pendingLen += floatData.length;
+           continue;
+         }
+       }
+
+       while (offset + windowSize <= floatData.length) {
+         const frame = floatData.subarray(offset, offset + windowSize);
+         const curSample = processedSamples;
+         processedSamples += windowSize;
+         await processFrame(frame, curSample);
+         offset += windowSize;
+       }
+
+       const remainingSamples = floatData.length - offset;
+       if (remainingSamples > 0) {
+         frameScratch.set(floatData.subarray(offset), 0);
+         pendingLen = remainingSamples;
+       } else {
+         pendingLen = 0;
+       }
+     }
+   })();
+
+   await new Promise((resolve, reject) => {
+     let settled = false;
+     const finish = (fn) => (value) => {
+       if (settled) {
+         return;
+       }
+       settled = true;
+       fn(value);
+     };
+     const resolveOnce = finish(resolve);
+     const rejectOnce = finish(reject);
+
+     streamDone.then(resolveOnce, (err) => {
+       ffmpeg.kill('SIGKILL');
+       rejectOnce(err);
+     });
+
+     ffmpeg.on('error', rejectOnce);
+     ffmpeg.on('close', (code) => {
+       if (code !== 0) {
+         rejectOnce(new Error(`ffmpeg exited with code ${code}`));
+         return;
+       }
+       streamDone.then(resolveOnce, rejectOnce);
+     });
+   });
+
+   if (leftoverBytes.length) {
+     const usableBytes = leftoverBytes.length - (leftoverBytes.length % 4);
+     if (usableBytes > 0) {
+       const tailFloats = new Float32Array(
+         leftoverBytes.buffer,
+         leftoverBytes.byteOffset,
+         usableBytes / Float32Array.BYTES_PER_ELEMENT,
+       );
+       if (tailFloats.length) {
+         frameScratch.set(tailFloats, pendingLen);
+         pendingLen += tailFloats.length;
+       }
+     }
+   }
+
+   if (pendingLen) {
+     const padded = new Float32Array(windowSize);
+     padded.set(frameScratch.subarray(0, pendingLen));
+     const curSample = processedSamples;
+     await processFrame(padded, curSample);
+     processedSamples += windowSize;
+   }
+
+   if (currentSpeech.start !== undefined) {
+     currentSpeech.end = totalSamples;
+     if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
+       speeches.push(currentSpeech);
+     }
+   }
+
+   for (let idx = 0; idx < speeches.length; idx += 1) {
+     const speech = speeches[idx];
+     const prevEnd = idx === 0 ? 0 : speeches[idx - 1].end;
+     const nextStart = idx === speeches.length - 1 ? totalSamples : speeches[idx + 1].start;
+     const padStart = Math.max(speech.start - speechPadSamples, prevEnd);
+     const padEnd = Math.min(speech.end + speechPadSamples, nextStart);
+     speech.start = Math.max(0, Math.floor(padStart));
+     speech.end = Math.min(totalSamples, Math.floor(padEnd));
+   }
+
+   const convertSeconds = (samples) => +(samples / sr).toFixed(timeResolution);
+   const result = returnSeconds
+     ? speeches.map(({ start, end }) => ({
+         start: convertSeconds(start),
+         end: convertSeconds(end),
+         startSample: start,
+         endSample: end,
+       }))
+     : speeches.map(({ start, end }) => ({
+         start,
+         end,
+         startSeconds: convertSeconds(start),
+         endSeconds: convertSeconds(end),
+       }));
+
+   if (returnMetadata) {
+     return { timestamps: result, totalSamples };
+   }
+
+   return result;
+ }
+
+ async function writeStrippedAudio(inputPath, segmentsSeconds, sampleRate, outputPath) {
+   if (!segmentsSeconds || !segmentsSeconds.length) {
+     throw new Error('No valid speech segments to write');
+   }
+   if (!sampleRate) {
+     throw new Error('Sample rate is required to write WAV');
+   }
+   const expr = segmentsSeconds
+     .map(({ start, end }) => `between(t\\,${start.toFixed(6)}\\,${end.toFixed(6)})`)
+     .join('+');
+   const filter = `aselect='${expr}',asetpts=N/SR/TB`;
+
+   const args = [
+     '-y',
+     '-v',
+     'error',
+     '-i',
+     inputPath,
+     '-af',
+     filter,
+     '-ac',
+     '1',
+     '-ar',
+     String(sampleRate),
+     outputPath,
+   ];
+
+   await new Promise((resolve, reject) => {
+     const ffmpeg = spawn('ffmpeg', args, { stdio: ['ignore', 'ignore', 'inherit'] });
+     ffmpeg.on('error', reject);
+     ffmpeg.on('close', (code) => {
+       if (code !== 0) {
+         reject(new Error(`ffmpeg exited with code ${code}`));
+         return;
+       }
+       resolve();
+     });
+   });
+ }
+
+ module.exports = {
+   loadSileroVad,
+   getSpeechTimestamps,
+   writeStrippedAudio,
+   WEIGHTS,
+ };
package/package.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "name": "@jjhbw/silero-vad",
+   "version": "1.0.0",
+   "description": "Node.js bindings for Silero VAD",
+   "main": "index.js",
+   "exports": {
+     ".": "./index.js"
+   },
+   "bin": {
+     "silero-vad-cli": "cli.js"
+   },
+   "scripts": {
+     "bench": "node bench.js --audio data/long.mp3 --runs 3 --warmup 1",
+     "test": "node --test"
+   },
+   "author": "jjhbw",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/jjhbw/silero-vad.git"
+   },
+   "bugs": {
+     "url": "https://github.com/jjhbw/silero-vad/issues"
+   },
+   "homepage": "https://github.com/jjhbw/silero-vad#readme",
+   "files": [
+     "index.js",
+     "lib.js",
+     "cli.js",
+     "weights/",
+     "README.md",
+     "LICENSE"
+   ],
+   "dependencies": {
+     "onnxruntime-node": "^1.23.2"
+   }
+ }
3 binary files ADDED (the ONNX model weights under package/weights/; binary contents not shown)