@jjhbw/silero-vad 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/README.md +34 -30
  2. package/cli.js +1 -1
  3. package/lib.js +48 -4
  4. package/package.json +1 -1
package/README.md CHANGED
@@ -10,28 +10,6 @@ npm install @jjhbw/silero-vad
10
10
 
11
11
  Requires Node 18+ and `ffmpeg` available on `PATH` for decoding arbitrary audio formats.
12
12
 
13
- ## CLI
14
-
15
- ```bash
16
- npx silero-vad-cli --audio input.wav --audio other.mp3 [options]
17
- ```
18
-
19
- Options:
20
- - `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
21
- - `--threshold <float>`: speech probability threshold (default `0.5`).
22
- - `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
23
- - `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
24
- - `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
25
- - `--time-resolution <n>`: decimal places for seconds output (default `3`).
26
- - `--neg-threshold <float>`: override the negative threshold (default `threshold - 0.15`).
27
- - `--seconds`: output timestamps in seconds (default on).
28
- - `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
29
- - `--strip-silence`: write a new WAV file with silences removed.
30
- - `--output-dir <path>`: output directory for strip-silence files (default: input dir).
31
-
32
- Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
33
- The sample rate is defined by the selected model (read from `vad.sampleRate`).
34
-
35
13
  ## Library usage
36
14
 
37
15
  ```js
@@ -39,17 +17,19 @@ const {
39
17
  loadSileroVad,
40
18
  getSpeechTimestamps,
41
19
  writeStrippedAudio,
42
- WEIGHTS
43
- } = require('@jjhbw/silero-vad');
20
+ WEIGHTS,
21
+ } = require("@jjhbw/silero-vad");
44
22
 
45
23
  (async () => {
46
- const vad = await loadSileroVad('default'); // or WEIGHTS keys/custom path
24
+ const vad = await loadSileroVad("default"); // or WEIGHTS keys/custom path
47
25
  try {
48
- if (!vad.sampleRate) throw new Error('Model sample rate is undefined');
49
- const inputs = ['input.wav', 'other.mp3'];
26
+ if (!vad.sampleRate) throw new Error("Model sample rate is undefined");
27
+ const inputs = ["input.wav", "other.mp3"];
50
28
  for (const inputPath of inputs) {
51
29
  vad.resetStates(); // per file/stream
52
- const ts = await getSpeechTimestamps(inputPath, vad, { returnSeconds: true });
30
+ const ts = await getSpeechTimestamps(inputPath, vad, {
31
+ returnSeconds: true,
32
+ });
53
33
  // Each entry includes both seconds (start/end) and samples (startSample/endSample).
54
34
  console.log(inputPath, ts);
55
35
  // Example return value:
@@ -63,7 +43,7 @@ const {
63
43
  // Note: encoding speed varies by container/codec; uncompressed PCM (e.g., .wav) is fastest,
64
44
  // lossless compression (e.g., .flac) is slower, and lossy codecs (e.g., .mp3/.aac/.opus)
65
45
  // are typically the slowest to encode.
66
- const outPath = inputPath.replace(/\.[^.]+$/, '.stripped.wav');
46
+ const outPath = inputPath.replace(/\.[^.]+$/, ".stripped.wav");
67
47
  await writeStrippedAudio(inputPath, ts, vad.sampleRate, outPath);
68
48
  }
69
49
  } finally {
@@ -73,10 +53,34 @@ const {
73
53
  ```
74
54
 
75
55
  Guidelines:
56
+
76
57
  - Load once, reuse: keep one `SileroVad` per concurrent worker.
77
58
  - Call `resetStates()` before each new file/stream; the session and weights stay in memory.
78
59
  - Call `release()` when shutting down.
79
60
 
61
+ ## CLI
62
+
63
+ ```bash
64
+ npx @jjhbw/silero-vad --audio input.wav --audio other.mp3 [options]
65
+ ```
66
+
67
+ Options:
68
+
69
+ - `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
70
+ - `--threshold <float>`: speech probability threshold (default `0.5`).
71
+ - `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
72
+ - `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
73
+ - `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
74
+ - `--time-resolution <n>`: decimal places for seconds output (default `3`).
75
+ - `--neg-threshold <float>`: override the negative threshold (default `max(threshold - 0.15, 0.01)`).
76
+ - `--seconds`: output timestamps in seconds (default on).
77
+ - `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
78
+ - `--strip-silence`: write a new WAV file with silences removed.
79
+ - `--output-dir <path>`: output directory for strip-silence files (default: input dir).
80
+
81
+ Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
82
+ The sample rate is defined by the selected model (read from `vad.sampleRate`).
83
+
80
84
  ## Development
81
85
 
82
86
  Clone the repo to run benchmarks and tests locally.
@@ -103,4 +107,4 @@ npm install
103
107
  npm test
104
108
  ```
105
109
 
106
- Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.***
110
+ Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.
package/cli.js CHANGED
@@ -253,7 +253,7 @@ Options:
253
253
  --min-silence-ms <ms> Minimum silence duration in ms (default: 100)
254
254
  --speech-pad-ms <ms> Padding added to speech segments in ms (default: 30)
255
255
  --time-resolution <n> Decimal places for seconds output (default: 3)
256
- --neg-threshold <f> Negative threshold override (default: threshold - 0.15)
256
+ --neg-threshold <f> Negative threshold override (default: max(threshold - 0.15, 0.01))
257
257
  --seconds Output timestamps in seconds (default: on)
258
258
  --cps <float> Enable timeline visualization; chars per second (default: 4)
259
259
  --strip-silence Write a new file with all silences removed
package/lib.js CHANGED
@@ -4,6 +4,10 @@ const path = require('path');
4
4
  const { spawn } = require('child_process');
5
5
  const ort = require('onnxruntime-node');
6
6
 
7
+ /**
8
+ * Bundled model spec map keyed by CLI/library names.
9
+ * @type {Record<string, {path: string, sampleRate: number}>}
10
+ */
7
11
  const WEIGHTS = {
8
12
  default: { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
9
13
  '16k': { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
@@ -82,6 +86,13 @@ class SileroVad {
82
86
  }
83
87
  }
84
88
 
89
+ /**
90
+ * Load a Silero VAD ONNX model and return a ready-to-run VAD instance.
91
+ * @param {string} [model='default'] Bundled model key or custom ONNX path.
92
+ * @param {Object} [opts]
93
+ * @param {Object} [opts.sessionOptions] onnxruntime-node session options override.
94
+ * @returns {Promise<SileroVad>}
95
+ */
85
96
  async function loadSileroVad(model = 'default', opts = {}) {
86
97
  const spec = WEIGHTS[model];
87
98
  const modelPath = spec ? spec.path : model || WEIGHTS.default.path;
@@ -95,6 +106,30 @@ async function loadSileroVad(model = 'default', opts = {}) {
95
106
  return vad;
96
107
  }
97
108
 
109
+ /**
110
+ * Run VAD on an audio file and return speech segments.
111
+ * @param {string} inputPath
112
+ * @param {SileroVad} vad
113
+ * @param {Object} [options]
114
+ * @param {number} [options.threshold=0.5] Start speech when prob >= threshold.
115
+ * Example: if probs hover at 0.45-0.6, threshold=0.6 will miss soft speech.
116
+ * @param {number} [options.minSpeechDurationMs=250] Drop segments shorter than this.
117
+ * Example: a 120 ms burst above threshold is discarded at 250 ms.
118
+ * @param {number} [options.minSilenceDurationMs=100] End speech only after silence
119
+ * stays below negThreshold for this long.
120
+ * Example: a 50 ms pause will not split a segment at 100 ms.
121
+ * @param {number} [options.speechPadMs=30] Pad each segment on both sides, clamped
122
+ * to neighbors. Example: [1.000, 2.000] -> ~[0.970, 2.030].
123
+ * @param {boolean} [options.returnSeconds=false]
124
+ * @param {number} [options.timeResolution=3] Decimal places for seconds output.
125
+ * Example: timeResolution=1 turns 1.23456 into 1.2.
126
+ * @param {number} [options.negThreshold=threshold-0.15] End speech when prob dips
127
+ * below this; provides hysteresis vs threshold. Example: threshold=0.5,
128
+ * negThreshold=0.35 keeps speech open during brief 0.4 dips.
129
+ * Default clamps to >= 0.01 to avoid an always-on end condition.
130
+ * @param {number} [options.sampleRate]
131
+ * @param {boolean} [options.returnMetadata=false]
132
+ */
98
133
  async function getSpeechTimestamps(
99
134
  inputPath,
100
135
  vad,
@@ -104,7 +139,7 @@ async function getSpeechTimestamps(
104
139
  minSilenceDurationMs = 100,
105
140
  speechPadMs = 30,
106
141
  returnSeconds = false,
107
- timeResolution = 1,
142
+ timeResolution = 3,
108
143
  negThreshold,
109
144
  sampleRate,
110
145
  returnMetadata = false,
@@ -178,7 +213,7 @@ async function getSpeechTimestamps(
178
213
  }
179
214
 
180
215
  currentSpeech.end = tempEnd;
181
- if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
216
+ if (currentSpeech.end - currentSpeech.start >= minSpeechSamples) {
182
217
  speeches.push(currentSpeech);
183
218
  }
184
219
  currentSpeech = {};
@@ -191,7 +226,7 @@ async function getSpeechTimestamps(
191
226
  for await (const chunk of ffmpeg.stdout) {
192
227
  let data = chunk;
193
228
  if (leftoverBytes.length) {
194
- const combined = Buffer.allocUnsafe(leftoverBytes.length + chunk.length);
229
+ const combined = Buffer.alloc(leftoverBytes.length + chunk.length);
195
230
  leftoverBytes.copy(combined, 0);
196
231
  chunk.copy(combined, leftoverBytes.length);
197
232
  data = combined;
@@ -300,7 +335,7 @@ async function getSpeechTimestamps(
300
335
 
301
336
  if (currentSpeech.start !== undefined) {
302
337
  currentSpeech.end = totalSamples;
303
- if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
338
+ if (currentSpeech.end - currentSpeech.start >= minSpeechSamples) {
304
339
  speeches.push(currentSpeech);
305
340
  }
306
341
  }
@@ -337,6 +372,15 @@ async function getSpeechTimestamps(
337
372
  return result;
338
373
  }
339
374
 
375
+ /**
376
+ * Write a new audio file containing only the provided speech segments.
377
+ * Uses ffmpeg; encoding is inferred from outputPath extension/container.
378
+ * @param {string} inputPath
379
+ * @param {Array<{start: number, end: number}>} segmentsSeconds Seconds-based ranges.
380
+ * @param {number} sampleRate Output sample rate (required by ffmpeg).
381
+ * @param {string} outputPath
382
+ * @returns {Promise<void>}
383
+ */
340
384
  async function writeStrippedAudio(inputPath, segmentsSeconds, sampleRate, outputPath) {
341
385
  if (!segmentsSeconds || !segmentsSeconds.length) {
342
386
  throw new Error('No valid speech segments to write');
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jjhbw/silero-vad",
3
- "version": "1.0.0",
3
+ "version": "1.0.2",
4
4
  "description": "Node.js bindings for Silero VAD",
5
5
  "main": "index.js",
6
6
  "exports": {