@jjhbw/silero-vad 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -30
- package/cli.js +1 -1
- package/lib.js +48 -4
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -10,28 +10,6 @@ npm install @jjhbw/silero-vad
|
|
|
10
10
|
|
|
11
11
|
Requires Node 18+ and `ffmpeg` available on `PATH` for decoding arbitrary audio formats.
|
|
12
12
|
|
|
13
|
-
## CLI
|
|
14
|
-
|
|
15
|
-
```bash
|
|
16
|
-
npx silero-vad-cli --audio input.wav --audio other.mp3 [options]
|
|
17
|
-
```
|
|
18
|
-
|
|
19
|
-
Options:
|
|
20
|
-
- `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
|
|
21
|
-
- `--threshold <float>`: speech probability threshold (default `0.5`).
|
|
22
|
-
- `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
|
|
23
|
-
- `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
|
|
24
|
-
- `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
|
|
25
|
-
- `--time-resolution <n>`: decimal places for seconds output (default `3`).
|
|
26
|
-
- `--neg-threshold <float>`: override the negative threshold (default `threshold - 0.15`).
|
|
27
|
-
- `--seconds`: output timestamps in seconds (default on).
|
|
28
|
-
- `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
|
|
29
|
-
- `--strip-silence`: write a new WAV file with silences removed.
|
|
30
|
-
- `--output-dir <path>`: output directory for strip-silence files (default: input dir).
|
|
31
|
-
|
|
32
|
-
Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
|
|
33
|
-
The sample rate is defined by the selected model (read from `vad.sampleRate`).
|
|
34
|
-
|
|
35
13
|
## Library usage
|
|
36
14
|
|
|
37
15
|
```js
|
|
@@ -39,17 +17,19 @@ const {
|
|
|
39
17
|
loadSileroVad,
|
|
40
18
|
getSpeechTimestamps,
|
|
41
19
|
writeStrippedAudio,
|
|
42
|
-
WEIGHTS
|
|
43
|
-
} = require(
|
|
20
|
+
WEIGHTS,
|
|
21
|
+
} = require("@jjhbw/silero-vad");
|
|
44
22
|
|
|
45
23
|
(async () => {
|
|
46
|
-
const vad = await loadSileroVad(
|
|
24
|
+
const vad = await loadSileroVad("default"); // or WEIGHTS keys/custom path
|
|
47
25
|
try {
|
|
48
|
-
if (!vad.sampleRate) throw new Error(
|
|
49
|
-
const inputs = [
|
|
26
|
+
if (!vad.sampleRate) throw new Error("Model sample rate is undefined");
|
|
27
|
+
const inputs = ["input.wav", "other.mp3"];
|
|
50
28
|
for (const inputPath of inputs) {
|
|
51
29
|
vad.resetStates(); // per file/stream
|
|
52
|
-
const ts = await getSpeechTimestamps(inputPath, vad, {
|
|
30
|
+
const ts = await getSpeechTimestamps(inputPath, vad, {
|
|
31
|
+
returnSeconds: true,
|
|
32
|
+
});
|
|
53
33
|
// Each entry includes both seconds (start/end) and samples (startSample/endSample).
|
|
54
34
|
console.log(inputPath, ts);
|
|
55
35
|
// Example return value:
|
|
@@ -63,7 +43,7 @@ const {
|
|
|
63
43
|
// Note: encoding speed varies by container/codec; uncompressed PCM (e.g., .wav) is fastest,
|
|
64
44
|
// lossless compression (e.g., .flac) is slower, and lossy codecs (e.g., .mp3/.aac/.opus)
|
|
65
45
|
// are typically the slowest to encode.
|
|
66
|
-
const outPath = inputPath.replace(/\.[^.]+$/,
|
|
46
|
+
const outPath = inputPath.replace(/\.[^.]+$/, ".stripped.wav");
|
|
67
47
|
await writeStrippedAudio(inputPath, ts, vad.sampleRate, outPath);
|
|
68
48
|
}
|
|
69
49
|
} finally {
|
|
@@ -73,10 +53,34 @@ const {
|
|
|
73
53
|
```
|
|
74
54
|
|
|
75
55
|
Guidelines:
|
|
56
|
+
|
|
76
57
|
- Load once, reuse: keep one `SileroVad` per concurrent worker.
|
|
77
58
|
- Call `resetStates()` before each new file/stream; the session and weights stay in memory.
|
|
78
59
|
- Call `release()` when shutting down.
|
|
79
60
|
|
|
61
|
+
## CLI
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
npx @jjhbw/silero-vad --audio input.wav --audio other.mp3 [options]
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Options:
|
|
68
|
+
|
|
69
|
+
- `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
|
|
70
|
+
- `--threshold <float>`: speech probability threshold (default `0.5`).
|
|
71
|
+
- `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
|
|
72
|
+
- `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
|
|
73
|
+
- `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
|
|
74
|
+
- `--time-resolution <n>`: decimal places for seconds output (default `3`).
|
|
75
|
+
- `--neg-threshold <float>`: override the negative threshold (default `max(threshold - 0.15, 0.01)`).
|
|
76
|
+
- `--seconds`: output timestamps in seconds (default on).
|
|
77
|
+
- `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
|
|
78
|
+
- `--strip-silence`: write a new WAV file with silences removed.
|
|
79
|
+
- `--output-dir <path>`: output directory for strip-silence files (default: input dir).
|
|
80
|
+
|
|
81
|
+
Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
|
|
82
|
+
The sample rate is defined by the selected model (read from `vad.sampleRate`).
|
|
83
|
+
|
|
80
84
|
## Development
|
|
81
85
|
|
|
82
86
|
Clone the repo to run benchmarks and tests locally.
|
|
@@ -103,4 +107,4 @@ npm install
|
|
|
103
107
|
npm test
|
|
104
108
|
```
|
|
105
109
|
|
|
106
|
-
Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed
|
|
110
|
+
Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.
|
package/cli.js
CHANGED
|
@@ -253,7 +253,7 @@ Options:
|
|
|
253
253
|
--min-silence-ms <ms> Minimum silence duration in ms (default: 100)
|
|
254
254
|
--speech-pad-ms <ms> Padding added to speech segments in ms (default: 30)
|
|
255
255
|
--time-resolution <n> Decimal places for seconds output (default: 3)
|
|
256
|
-
--neg-threshold <f> Negative threshold override (default: threshold - 0.15)
|
|
256
|
+
--neg-threshold <f> Negative threshold override (default: max(threshold - 0.15, 0.01))
|
|
257
257
|
--seconds Output timestamps in seconds (default: on)
|
|
258
258
|
--cps <float> Enable timeline visualization; chars per second (default: 4)
|
|
259
259
|
--strip-silence Write a new file with all silences removed
|
package/lib.js
CHANGED
|
@@ -4,6 +4,10 @@ const path = require('path');
|
|
|
4
4
|
const { spawn } = require('child_process');
|
|
5
5
|
const ort = require('onnxruntime-node');
|
|
6
6
|
|
|
7
|
+
/**
|
|
8
|
+
* Bundled model spec map keyed by CLI/library names.
|
|
9
|
+
* @type {Record<string, {path: string, sampleRate: number}>}
|
|
10
|
+
*/
|
|
7
11
|
const WEIGHTS = {
|
|
8
12
|
default: { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
|
|
9
13
|
'16k': { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
|
|
@@ -82,6 +86,13 @@ class SileroVad {
|
|
|
82
86
|
}
|
|
83
87
|
}
|
|
84
88
|
|
|
89
|
+
/**
|
|
90
|
+
* Load a Silero VAD ONNX model and return a ready-to-run VAD instance.
|
|
91
|
+
* @param {string} [model='default'] Bundled model key or custom ONNX path.
|
|
92
|
+
* @param {Object} [opts]
|
|
93
|
+
* @param {Object} [opts.sessionOptions] onnxruntime-node session options override.
|
|
94
|
+
* @returns {Promise<SileroVad>}
|
|
95
|
+
*/
|
|
85
96
|
async function loadSileroVad(model = 'default', opts = {}) {
|
|
86
97
|
const spec = WEIGHTS[model];
|
|
87
98
|
const modelPath = spec ? spec.path : model || WEIGHTS.default.path;
|
|
@@ -95,6 +106,30 @@ async function loadSileroVad(model = 'default', opts = {}) {
|
|
|
95
106
|
return vad;
|
|
96
107
|
}
|
|
97
108
|
|
|
109
|
+
/**
|
|
110
|
+
* Run VAD on an audio file and return speech segments.
|
|
111
|
+
* @param {string} inputPath
|
|
112
|
+
* @param {SileroVad} vad
|
|
113
|
+
* @param {Object} [options]
|
|
114
|
+
* @param {number} [options.threshold=0.5] Start speech when prob >= threshold.
|
|
115
|
+
* Example: if probs hover at 0.45-0.6, threshold=0.6 will miss soft speech.
|
|
116
|
+
* @param {number} [options.minSpeechDurationMs=250] Drop segments shorter than this.
|
|
117
|
+
* Example: a 120 ms burst above threshold is discarded at 250 ms.
|
|
118
|
+
* @param {number} [options.minSilenceDurationMs=100] End speech only after silence
|
|
119
|
+
* stays below negThreshold for this long.
|
|
120
|
+
* Example: a 50 ms pause will not split a segment at 100 ms.
|
|
121
|
+
* @param {number} [options.speechPadMs=30] Pad each segment on both sides, clamped
|
|
122
|
+
* to neighbors. Example: [1.000, 2.000] -> ~[0.970, 2.030].
|
|
123
|
+
* @param {boolean} [options.returnSeconds=false]
|
|
124
|
+
* @param {number} [options.timeResolution=3] Decimal places for seconds output.
|
|
125
|
+
* Example: timeResolution=1 turns 1.23456 into 1.2.
|
|
126
|
+
* @param {number} [options.negThreshold=threshold-0.15] End speech when prob dips
|
|
127
|
+
* below this; provides hysteresis vs threshold. Example: threshold=0.5,
|
|
128
|
+
* negThreshold=0.35 keeps speech open during brief 0.4 dips.
|
|
129
|
+
* Default is clamped to >= 0.01 so that, at low thresholds, the end condition can still fire.
|
|
130
|
+
* @param {number} [options.sampleRate]
|
|
131
|
+
* @param {boolean} [options.returnMetadata=false]
|
|
132
|
+
*/
|
|
98
133
|
async function getSpeechTimestamps(
|
|
99
134
|
inputPath,
|
|
100
135
|
vad,
|
|
@@ -104,7 +139,7 @@ async function getSpeechTimestamps(
|
|
|
104
139
|
minSilenceDurationMs = 100,
|
|
105
140
|
speechPadMs = 30,
|
|
106
141
|
returnSeconds = false,
|
|
107
|
-
timeResolution =
|
|
142
|
+
timeResolution = 3,
|
|
108
143
|
negThreshold,
|
|
109
144
|
sampleRate,
|
|
110
145
|
returnMetadata = false,
|
|
@@ -178,7 +213,7 @@ async function getSpeechTimestamps(
|
|
|
178
213
|
}
|
|
179
214
|
|
|
180
215
|
currentSpeech.end = tempEnd;
|
|
181
|
-
if (currentSpeech.end - currentSpeech.start
|
|
216
|
+
if (currentSpeech.end - currentSpeech.start >= minSpeechSamples) {
|
|
182
217
|
speeches.push(currentSpeech);
|
|
183
218
|
}
|
|
184
219
|
currentSpeech = {};
|
|
@@ -191,7 +226,7 @@ async function getSpeechTimestamps(
|
|
|
191
226
|
for await (const chunk of ffmpeg.stdout) {
|
|
192
227
|
let data = chunk;
|
|
193
228
|
if (leftoverBytes.length) {
|
|
194
|
-
const combined = Buffer.
|
|
229
|
+
const combined = Buffer.alloc(leftoverBytes.length + chunk.length);
|
|
195
230
|
leftoverBytes.copy(combined, 0);
|
|
196
231
|
chunk.copy(combined, leftoverBytes.length);
|
|
197
232
|
data = combined;
|
|
@@ -300,7 +335,7 @@ async function getSpeechTimestamps(
|
|
|
300
335
|
|
|
301
336
|
if (currentSpeech.start !== undefined) {
|
|
302
337
|
currentSpeech.end = totalSamples;
|
|
303
|
-
if (currentSpeech.end - currentSpeech.start
|
|
338
|
+
if (currentSpeech.end - currentSpeech.start >= minSpeechSamples) {
|
|
304
339
|
speeches.push(currentSpeech);
|
|
305
340
|
}
|
|
306
341
|
}
|
|
@@ -337,6 +372,15 @@ async function getSpeechTimestamps(
|
|
|
337
372
|
return result;
|
|
338
373
|
}
|
|
339
374
|
|
|
375
|
+
/**
|
|
376
|
+
* Write a new audio file containing only the provided speech segments.
|
|
377
|
+
* Uses ffmpeg; encoding is inferred from outputPath extension/container.
|
|
378
|
+
* @param {string} inputPath
|
|
379
|
+
* @param {Array<{start: number, end: number}>} segmentsSeconds Seconds-based ranges.
|
|
380
|
+
* @param {number} sampleRate Output sample rate (required by ffmpeg).
|
|
381
|
+
* @param {string} outputPath
|
|
382
|
+
* @returns {Promise<void>}
|
|
383
|
+
*/
|
|
340
384
|
async function writeStrippedAudio(inputPath, segmentsSeconds, sampleRate, outputPath) {
|
|
341
385
|
if (!segmentsSeconds || !segmentsSeconds.length) {
|
|
342
386
|
throw new Error('No valid speech segments to write');
|