@jjhbw/silero-vad 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2020-present Silero Team
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,106 @@
+ # Silero VAD Node Fork
+
+ Minimal Node.js wrapper around the Silero VAD ONNX model, with a small CLI and parity tests against the Python implementation. The Node implementation runs VAD and silence stripping directly from ffmpeg streams to keep memory usage low on long files.
+
+ ## Install
+
+ ```bash
+ npm install @jjhbw/silero-vad
+ ```
+
+ Requires Node 18+ and `ffmpeg` available on `PATH` for decoding arbitrary audio formats.
+
+ ## CLI
+
+ ```bash
+ npx silero-vad-cli --audio input.wav --audio other.mp3 [options]
+ ```
+
+ Options:
+ - `--model <key|path>`: model key (`default`, `16k`, `8k_16k`, `half`, `op18`) or custom ONNX path (default: `default`, i.e., bundled 16k op15).
+ - `--threshold <float>`: speech probability threshold (default `0.5`).
+ - `--min-speech-ms <ms>`: minimum speech duration in ms (default `250`).
+ - `--min-silence-ms <ms>`: minimum silence duration in ms (default `100`).
+ - `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
+ - `--time-resolution <n>`: decimal places for seconds output (default `3`).
+ - `--neg-threshold <float>`: override the negative threshold (default `threshold - 0.15`, floored at `0.01`).
+ - `--seconds`: output timestamps in seconds (default on).
+ - `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
+ - `--strip-silence`: write a new WAV file with silences removed.
+ - `--output-dir <path>`: output directory for strip-silence files (default: input dir).
+
+ Outputs an array of `{ file, timestamps }` to stdout as JSON. The CLI reuses a single ONNX session and resets state per file.
+ The sample rate is defined by the selected model (read from `vad.sampleRate`).
+
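+ For example, a stricter run with silence stripping (paths and all numbers are illustrative, not real output):
+
+ ```bash
+ npx silero-vad-cli --audio input.wav --threshold 0.6 --strip-silence
+ # Per-file diagnostics are logged first, then the JSON result, e.g.:
+ # [
+ #   {
+ #     "file": "input.wav",
+ #     "timestamps": [
+ #       { "start": 0.36, "end": 1.92, "startSample": 5760, "endSample": 30720 }
+ #     ]
+ #   }
+ # ]
+ ```
+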
+ ## Library usage
+
+ ```js
+ const {
+   loadSileroVad,
+   getSpeechTimestamps,
+   writeStrippedAudio,
+   WEIGHTS
+ } = require('@jjhbw/silero-vad');
+
+ (async () => {
+   const vad = await loadSileroVad('default'); // or WEIGHTS keys/custom path
+   try {
+     if (!vad.sampleRate) throw new Error('Model sample rate is undefined');
+     const inputs = ['input.wav', 'other.mp3'];
+     for (const inputPath of inputs) {
+       vad.resetStates(); // per file/stream
+       const ts = await getSpeechTimestamps(inputPath, vad, { returnSeconds: true });
+       // Each entry includes both seconds (start/end) and samples (startSample/endSample).
+       console.log(inputPath, ts);
+       // Example return value:
+       // [
+       //   { start: 0.36, end: 1.92, startSample: 5760, endSample: 30720 },
+       //   { start: 2.41, end: 3.05, startSample: 38560, endSample: 48800 }
+       // ]
+
+       // Strip silences from the original file using the timestamps.
+       // Pick any extension supported by ffmpeg (e.g., .wav, .flac).
+       // Note: encoding speed varies by container/codec; uncompressed PCM (e.g., .wav) is fastest,
+       // lossless compression (e.g., .flac) is slower, and lossy codecs (e.g., .mp3/.aac/.opus)
+       // are typically the slowest to encode.
+       const outPath = inputPath.replace(/\.[^.]+$/, '.stripped.wav');
+       await writeStrippedAudio(inputPath, ts, vad.sampleRate, outPath);
+     }
+   } finally {
+     await vad.session.release?.(); // once per process when shutting down
+   }
+ })();
+ ```
+
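+ When you pass a custom ONNX path instead of a bundled key, `vad.sampleRate` is `null`, so supply `sampleRate` yourself (8000 or 16000). A minimal sketch, with a hypothetical model path:
+
+ ```js
+ const vad = await loadSileroVad('/path/to/custom_vad.onnx', {
+   // Optional: session options are merged over the library defaults.
+   sessionOptions: { intraOpNumThreads: 2 },
+ });
+ const ts = await getSpeechTimestamps('input.wav', vad, {
+   sampleRate: 16000, // required for custom models
+   returnSeconds: true,
+ });
+ ```
+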
+ Guidelines:
+ - Load once, reuse: keep one `SileroVad` per concurrent worker (see the sketch below).
+ - Call `resetStates()` before each new file/stream; the session and weights stay in memory.
+ - Call `release()` when shutting down.
+
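+ A minimal sketch of that pattern for a long-lived process (the worker wrapper is illustrative, not part of the API):
+
+ ```js
+ // Stream state (RNN state, context) lives on the SileroVad instance, so one
+ // instance must not process two files concurrently; give each worker its own.
+ async function makeWorker() {
+   const vad = await loadSileroVad('default'); // load once per worker
+   return {
+     analyze(file) {
+       vad.resetStates(); // fresh stream state per file
+       return getSpeechTimestamps(file, vad, { returnSeconds: true });
+     },
+     close() {
+       return vad.session.release?.(); // on shutdown
+     },
+   };
+ }
+ ```
+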
+ ## Development
+
+ Clone the repo to run benchmarks and tests locally.
+
+ ### Benchmark
+
+ ```bash
+ git clone https://github.com/jjhbw/silero-vad
+ cd silero-vad/js-fork
+ npm install
+ node bench.js --audio data/test.mp3 --runs 5
+ ```
+
+ The benchmark reports timings per file for streaming VAD and silence stripping. Stripped-audio files are written to a temporary directory and removed after each run.
+
+ ### Tests
+
+ Snapshot tests compare Node outputs against Python ground truth (`tests/snapshots/onnx.json`):
+
+ ```bash
+ git clone https://github.com/jjhbw/silero-vad
+ cd silero-vad/js-fork
+ npm install
+ npm test
+ ```
+
+ Ensure Python snapshots are generated (run `pytest tests/test_snapshots.py` in the repo root) and `ffmpeg` is installed.
package/cli.js ADDED
@@ -0,0 +1,339 @@
+ #!/usr/bin/env node
+
+ const fs = require('fs');
+ const fsp = fs.promises;
+ const path = require('path');
+ const {
+   loadSileroVad,
+   getSpeechTimestamps,
+   writeStrippedAudio,
+   WEIGHTS,
+ } = require('./lib');
+
+ const toMB = (b) => (b / (1024 * 1024)).toFixed(2);
+
+ async function main() {
+   try {
+     const args = parseArgs(process.argv.slice(2));
+     if (!args.audio.length) {
+       printUsage();
+       process.exit(1);
+     }
+
+     const modelSpecifier = args.model || 'default';
+     const vad = await loadSileroVad(modelSpecifier);
+     if (!vad.sampleRate) {
+       throw new Error('No sample rate available for selected model. Please use a bundled model key.');
+     }
+     const effectiveSampleRate = vad.sampleRate;
+
+     try {
+       const results = [];
+       for (const audioPath of args.audio) {
+         // reuse session, reset stream state per file
+         vad.resetStates();
+         const t0 = performance.now();
+         const { timestamps, totalSamples } = await getSpeechTimestamps(audioPath, vad, {
+           threshold: args.threshold,
+           minSpeechDurationMs: args.minSpeechDurationMs,
+           minSilenceDurationMs: args.minSilenceDurationMs,
+           speechPadMs: args.speechPadMs,
+           returnSeconds: args.seconds,
+           timeResolution: args.timeResolution,
+           negThreshold: args.negThreshold,
+           returnMetadata: true,
+         });
+         const t1 = performance.now();
+         results.push({ file: audioPath, timestamps });
+
+         const mem = process.memoryUsage();
+         const speechSeconds = getSpeechDurationSeconds(
+           timestamps,
+           args.seconds,
+           effectiveSampleRate,
+         );
+         const durationSeconds = totalSamples / effectiveSampleRate;
+         const silenceSeconds = Math.max(0, durationSeconds - speechSeconds);
+         const totalForPct = durationSeconds > 0 ? durationSeconds : 1;
+         const speechPct = (speechSeconds / totalForPct) * 100;
+         const silencePct = (silenceSeconds / totalForPct) * 100;
+         console.info(
+           [
+             `file=${audioPath}`,
+             `duration=${formatDuration(durationSeconds)}`,
+           ].join(' '),
+         );
+         console.info(
+           [
+             `speech=${speechSeconds.toFixed(2)}s (${speechPct.toFixed(1)}%)`,
+             `silence=${silenceSeconds.toFixed(2)}s (${silencePct.toFixed(1)}%)`,
+             `total=${durationSeconds.toFixed(2)}s`,
+           ].join(' '),
+         );
+         const totalMs = t1 - t0;
+         console.info(
+           `vad_took=${totalMs.toFixed(2)}ms`,
+         );
+         console.info(
+           [
+             `rss_mb=${toMB(mem.rss)}`,
+             `heapUsed_mb=${toMB(mem.heapUsed)}`,
+             `external_mb=${toMB(mem.external)}`,
+           ].join(' '),
+         );
+
+         if (args.showTimeline) {
+           const lines = renderTimelineLines(
+             timestamps,
+             durationSeconds,
+             args.charsPerSecond,
+             120,
+           );
+           const secondsPerChar = 1 / args.charsPerSecond;
+           console.info(
+             `legend: # speech . silence (1 char = ${secondsPerChar.toFixed(2)}s)`,
+           );
+           for (const line of lines) {
+             console.info(line);
+           }
+         }
+
+         if (args.stripSilence) {
+           const segmentsSeconds = timestamps.map(({ start, end, startSeconds, endSeconds }) => ({
+             start: args.seconds ? start : startSeconds,
+             end: args.seconds ? end : endSeconds,
+           }));
+           if (!segmentsSeconds.length) {
+             console.info(`strip_silence=skipped (no speech detected)`);
+           } else {
+             if (args.outputDir) {
+               await fsp.mkdir(args.outputDir, { recursive: true });
+             }
+             const outputPath = await ensureUniquePath(
+               getStripOutputPath(audioPath, args.outputDir),
+             );
+             const stripT0 = performance.now();
+             const memBefore = process.memoryUsage();
+             await writeStrippedAudio(
+               audioPath,
+               segmentsSeconds,
+               effectiveSampleRate,
+               outputPath,
+             );
+             const memAfter = process.memoryUsage();
+             const stripT1 = performance.now();
+             const strippedSeconds = segmentsSeconds.reduce(
+               (sum, seg) => sum + (seg.end - seg.start),
+               0,
+             );
+             console.info(
+               `strip_silence_output=${outputPath} duration=${strippedSeconds.toFixed(2)}s`,
+             );
+             console.info(`strip_silence_took=${(stripT1 - stripT0).toFixed(2)}ms`);
+             console.info(
+               [
+                 `strip_silence_mem_rss_delta_mb=${toMB(memAfter.rss - memBefore.rss)}`,
+                 `strip_silence_mem_heap_delta_mb=${toMB(memAfter.heapUsed - memBefore.heapUsed)}`,
+                 `strip_silence_mem_external_delta_mb=${toMB(memAfter.external - memBefore.external)}`,
+               ].join(' '),
+             );
+           }
+         }
+       }
+       // Emit the machine-readable result promised in the README.
+       console.log(JSON.stringify(results, null, 2));
+     } finally {
+       // Keep cleanup explicit so the pattern is clear for long-lived processes.
+       await vad.session.release?.();
+     }
+   } catch (err) {
+     console.error(err.message || err);
+     process.exit(1);
+   }
+ }
+
+ if (require.main === module) {
+   main();
+ }
+
+ function parseArgs(argv) {
+   const out = {
+     model: null,
+     audio: [],
+     threshold: 0.5,
+     minSpeechDurationMs: 250,
+     minSilenceDurationMs: 100,
+     speechPadMs: 30,
+     timeResolution: 3,
+     negThreshold: null,
+     seconds: true,
+     charsPerSecond: 4,
+     showTimeline: false,
+     stripSilence: false,
+     outputDir: null,
+   };
+
+   for (let i = 0; i < argv.length; i += 1) {
+     const arg = argv[i];
+     if (arg === '--model') {
+       out.model = argv[i + 1];
+       i += 1;
+     } else if (arg === '--audio') {
+       const value = argv[i + 1];
+       if (!value) {
+         throw new Error('Missing value for --audio');
+       }
+       out.audio.push(value);
+       i += 1;
+     } else if (arg === '--threshold') {
+       const value = parseFloat(argv[i + 1]);
+       if (!Number.isFinite(value)) {
+         throw new Error('Invalid value for --threshold');
+       }
+       out.threshold = value;
+       i += 1;
+     } else if (arg === '--min-speech-ms') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value) && value >= 0) {
+         out.minSpeechDurationMs = value;
+       }
+       i += 1;
+     } else if (arg === '--min-silence-ms') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value) && value >= 0) {
+         out.minSilenceDurationMs = value;
+       }
+       i += 1;
+     } else if (arg === '--speech-pad-ms') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value) && value >= 0) {
+         out.speechPadMs = value;
+       }
+       i += 1;
+     } else if (arg === '--time-resolution') {
+       const value = parseInt(argv[i + 1], 10);
+       if (Number.isFinite(value) && value >= 0) {
+         out.timeResolution = value;
+       }
+       i += 1;
+     } else if (arg === '--neg-threshold') {
+       const value = parseFloat(argv[i + 1]);
+       if (Number.isFinite(value)) {
+         out.negThreshold = value;
+       }
+       i += 1;
+     } else if (arg === '--seconds') {
+       out.seconds = true;
+     } else if (arg === '--cps') {
+       const value = parseFloat(argv[i + 1]);
+       out.showTimeline = true;
+       if (Number.isFinite(value) && value > 0) {
+         out.charsPerSecond = value;
+       }
+       i += 1;
+     } else if (arg === '--strip-silence') {
+       out.stripSilence = true;
+     } else if (arg === '--output-dir') {
+       out.outputDir = argv[i + 1];
+       i += 1;
+     } else if (arg === '--help' || arg === '-h') {
+       printUsage();
+       process.exit(0);
+     }
+   }
+
+   return out;
+ }
+
+ function printUsage() {
+   console.log(`Usage: silero-vad-cli --audio path/to/audio [options]
+
+ Options:
+   --model <key|path>     Model key (${Object.keys(WEIGHTS).join(', ')}) or custom path (default: default)
+   --threshold <float>    Speech probability threshold (default: 0.5)
+   --min-speech-ms <ms>   Minimum speech duration in ms (default: 250)
+   --min-silence-ms <ms>  Minimum silence duration in ms (default: 100)
+   --speech-pad-ms <ms>   Padding added to speech segments in ms (default: 30)
+   --time-resolution <n>  Decimal places for seconds output (default: 3)
+   --neg-threshold <f>    Negative threshold override (default: threshold - 0.15)
+   --seconds              Output timestamps in seconds (default: on)
+   --cps <float>          Enable timeline visualization; chars per second (default: 4)
+   --strip-silence        Write a new file with all silences removed
+   --output-dir <path>    Output directory for strip-silence files (default: input dir)
+   -h, --help             Show this message`);
+ }
+
+ function renderTimelineLines(timestamps, durationSeconds, charsPerSecond, maxLineWidth) {
+   if (!durationSeconds || durationSeconds <= 0 || charsPerSecond <= 0) {
+     return ['[no audio]'];
+   }
+
+   const width = Math.max(1, Math.ceil(durationSeconds * charsPerSecond));
+   const slots = new Array(width).fill('.');
+   for (const { start, end } of timestamps) {
+     const startIdx = Math.max(0, Math.floor((start / durationSeconds) * width));
+     const endIdx = Math.min(width, Math.ceil((end / durationSeconds) * width));
+     for (let i = startIdx; i < endIdx; i += 1) {
+       slots[i] = '#';
+     }
+   }
+
+   if (!maxLineWidth || maxLineWidth <= 0) {
+     return [`|${slots.join('')}|`];
+   }
+
+   const lines = [];
+   for (let i = 0; i < slots.length; i += maxLineWidth) {
+     lines.push(`|${slots.slice(i, i + maxLineWidth).join('')}|`);
+   }
+   return lines;
+ }
+
+ function getSpeechDurationSeconds(timestamps, timestampsInSeconds, sampleRate) {
+   if (!timestamps || !timestamps.length) {
+     return 0;
+   }
+   if (!timestampsInSeconds && !sampleRate) {
+     throw new Error('Need sampleRate');
+   }
+   if (timestampsInSeconds) {
+     return timestamps.reduce((sum, { start, end }) => sum + (end - start), 0);
+   }
+   return timestamps.reduce((sum, { start, end }) => sum + (end - start) / sampleRate, 0);
+ }
+
+ module.exports = {
+   getSpeechDurationSeconds,
+   parseArgs,
+ };
+
+ function formatDuration(seconds) {
+   const whole = Math.max(0, Math.round(seconds));
+   const mins = Math.floor(whole / 60);
+   const secs = String(whole % 60).padStart(2, '0');
+   return `${mins}:${secs}`;
+ }
+
+ function getStripOutputPath(inputPath, outputDir) {
+   const dir = outputDir || path.dirname(inputPath);
+   const ext = path.extname(inputPath);
+   const base = path.basename(inputPath, ext);
+   return path.join(dir, `${base}_speech.wav`);
+ }
+
+ async function ensureUniquePath(outputPath) {
+   try {
+     await fsp.access(outputPath);
+   } catch {
+     return outputPath;
+   }
+   const dir = path.dirname(outputPath);
+   const ext = path.extname(outputPath);
+   const base = path.basename(outputPath, ext);
+   for (let i = 1; ; i += 1) {
+     const candidate = path.join(dir, `${base}-${i}${ext}`);
+     try {
+       await fsp.access(candidate);
+     } catch {
+       return candidate;
+     }
+   }
+ }
package/index.js ADDED
@@ -0,0 +1 @@
+ module.exports = require('./lib');
package/lib.js ADDED
@@ -0,0 +1,385 @@
+ const fs = require('fs');
+ const fsp = fs.promises;
+ const path = require('path');
+ const { spawn } = require('child_process');
+ const ort = require('onnxruntime-node');
+
+ const WEIGHTS = {
+   default: { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
+   '16k': { path: path.join(__dirname, 'weights', 'silero_vad_16k_op15.onnx'), sampleRate: 16000 },
+   '8k_16k': { path: path.join(__dirname, 'weights', 'silero_vad.onnx'), sampleRate: 16000 }, // decode to 16k by default
+   half: { path: path.join(__dirname, 'weights', 'silero_vad_half.onnx'), sampleRate: 16000 },
+   op18: { path: path.join(__dirname, 'weights', 'silero_vad_op18_ifless.onnx'), sampleRate: 16000 },
+ };
+
+ // Bench sweep on long-form audio showed this CPU-only config is the best general default.
+ const DEFAULT_SESSION_OPTIONS = {
+   intraOpNumThreads: 4,
+   interOpNumThreads: 1,
+   executionMode: 'sequential',
+   graphOptimizationLevel: 'all',
+   enableCpuMemArena: true,
+   enableMemPattern: true,
+ };
+
+ // Minimal get_speech_timestamps port that runs the Silero VAD ONNX model in Node.
+ class SileroVad {
+   constructor(session) {
+     this.session = session;
+     this.outputNames = session.outputNames;
+     this.resetStates();
+   }
+
+   resetStates() {
+     this.state = new Float32Array(2 * 1 * 128); // shape: [2, 1, 128]
+     this.context = null;
+     this.lastSr = null;
+     this.contextSize = null;
+     this.inputWithContext = null;
+     this.srTensor = null;
+   }
+
+   async processChunk(chunk, sampleRate) {
+     const sr = sampleRate;
+     const windowSize = sr === 16000 ? 512 : 256;
+     const contextSize = sr === 16000 ? 64 : 32;
+
+     if (chunk.length !== windowSize) {
+       throw new Error(`Expected chunk of ${windowSize} samples, got ${chunk.length}`);
+     }
+
+     // Reset state when sample rate changes.
+     if (this.lastSr && this.lastSr !== sr) {
+       this.resetStates();
+     }
+
+     if (!this.context || this.contextSize !== contextSize) {
+       this.contextSize = contextSize;
+       this.context = new Float32Array(contextSize); // zeros
+       this.inputWithContext = new Float32Array(contextSize + windowSize);
+       this.srTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(sr)]));
+     }
+
+     const inputWithContext = this.inputWithContext;
+     inputWithContext.set(this.context, 0);
+     inputWithContext.set(chunk, contextSize);
+
+     const feeds = {
+       input: new ort.Tensor('float32', inputWithContext, [1, inputWithContext.length]),
+       state: new ort.Tensor('float32', this.state, [2, 1, 128]),
+       sr: this.srTensor,
+     };
+
+     const results = await this.session.run(feeds);
+     const probTensor = results[this.outputNames[0]];
+     const newStateTensor = results[this.outputNames[1]];
+
+     this.state.set(newStateTensor.data);
+     this.context.set(inputWithContext.subarray(inputWithContext.length - contextSize));
+     this.lastSr = sr;
+
+     return probTensor.data[0];
+   }
+ }
+
+ async function loadSileroVad(model = 'default', opts = {}) {
+   const spec = WEIGHTS[model];
+   const modelPath = spec ? spec.path : model || WEIGHTS.default.path;
+   const sessionOptions = {
+     ...DEFAULT_SESSION_OPTIONS,
+     ...(opts.sessionOptions || {}),
+   };
+   const session = await ort.InferenceSession.create(modelPath, sessionOptions);
+   const vad = new SileroVad(session);
+   vad.sampleRate = spec ? spec.sampleRate : null;
+   return vad;
+ }
+
+ async function getSpeechTimestamps(
+   inputPath,
+   vad,
+   {
+     threshold = 0.5,
+     minSpeechDurationMs = 250,
+     minSilenceDurationMs = 100,
+     speechPadMs = 30,
+     returnSeconds = false,
+     timeResolution = 1,
+     negThreshold,
+     sampleRate,
+     returnMetadata = false,
+   } = {},
+ ) {
+   if (!vad) {
+     throw new Error('Pass a loaded SileroVad instance');
+   }
+
+   const sr = sampleRate || vad.sampleRate;
+   if (!sr) {
+     throw new Error('VAD sample rate is undefined. Use a bundled model key.');
+   }
+
+   if (sr !== 8000 && sr !== 16000) {
+     throw new Error('Supported sampling rates: 8000 or 16000.');
+   }
+
+   const windowSize = sr === 16000 ? 512 : 256;
+   const minSpeechSamples = (sr * minSpeechDurationMs) / 1000;
+   const minSilenceSamples = (sr * minSilenceDurationMs) / 1000;
+   const speechPadSamples = (sr * speechPadMs) / 1000;
+   const negThres = negThreshold ?? Math.max(threshold - 0.15, 0.01);
+
+   vad.resetStates();
+
+   let triggered = false;
+   let tempEnd = 0;
+   let currentSpeech = {};
+   const speeches = [];
+   let processedSamples = 0;
+   let totalSamples = 0;
+   let leftoverBytes = Buffer.alloc(0);
+   const frameScratch = new Float32Array(windowSize);
+   let pendingLen = 0;
+
+   const channels = 1;
+   const args = [
+     '-v',
+     'error',
+     '-i',
+     inputPath,
+     '-ac',
+     String(channels),
+     '-ar',
+     String(sr),
+     '-f',
+     'f32le',
+     'pipe:1',
+   ];
+   const ffmpeg = spawn('ffmpeg', args, { stdio: ['ignore', 'pipe', 'inherit'] });
+
+   const processFrame = async (frame, curSample) => {
+     const speechProb = await vad.processChunk(frame, sr);
+     if (speechProb >= threshold && tempEnd) {
+       tempEnd = 0;
+     }
+
+     if (speechProb >= threshold && !triggered) {
+       triggered = true;
+       currentSpeech.start = curSample;
+       return;
+     }
+
+     if (speechProb < negThres && triggered) {
+       if (!tempEnd) {
+         tempEnd = curSample;
+       }
+       if (curSample - tempEnd < minSilenceSamples) {
+         return;
+       }
+
+       currentSpeech.end = tempEnd;
+       if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
+         speeches.push(currentSpeech);
+       }
+       currentSpeech = {};
+       triggered = false;
+       tempEnd = 0;
+     }
+   };
+
+   const streamDone = (async () => {
+     for await (const chunk of ffmpeg.stdout) {
+       let data = chunk;
+       if (leftoverBytes.length) {
+         const combined = Buffer.allocUnsafe(leftoverBytes.length + chunk.length);
+         leftoverBytes.copy(combined, 0);
+         chunk.copy(combined, leftoverBytes.length);
+         data = combined;
+         leftoverBytes = Buffer.alloc(0);
+       }
+
+       const usableBytes = data.length - (data.length % 4);
+       if (usableBytes <= 0) {
+         leftoverBytes = data;
+         continue;
+       }
+
+       leftoverBytes = data.subarray(usableBytes);
+       const floatData = new Float32Array(
+         data.buffer,
+         data.byteOffset,
+         usableBytes / Float32Array.BYTES_PER_ELEMENT,
+       );
+       totalSamples += floatData.length;
+
+       let offset = 0;
+       if (pendingLen) {
+         const needed = windowSize - pendingLen;
+         if (floatData.length >= needed) {
+           frameScratch.set(frameScratch.subarray(0, pendingLen), 0);
+           frameScratch.set(floatData.subarray(0, needed), pendingLen);
+           const curSample = processedSamples;
+           processedSamples += windowSize;
+           await processFrame(frameScratch, curSample);
+           offset = needed;
+           pendingLen = 0;
+         } else {
+           frameScratch.set(floatData, pendingLen);
+           pendingLen += floatData.length;
+           continue;
+         }
+       }
+
+       while (offset + windowSize <= floatData.length) {
+         const frame = floatData.subarray(offset, offset + windowSize);
+         const curSample = processedSamples;
+         processedSamples += windowSize;
+         await processFrame(frame, curSample);
+         offset += windowSize;
+       }
+
+       const remainingSamples = floatData.length - offset;
+       if (remainingSamples > 0) {
+         frameScratch.set(floatData.subarray(offset), 0);
+         pendingLen = remainingSamples;
+       } else {
+         pendingLen = 0;
+       }
+     }
+   })();
+
+   await new Promise((resolve, reject) => {
+     let settled = false;
+     const finish = (fn) => (value) => {
+       if (settled) {
+         return;
+       }
+       settled = true;
+       fn(value);
+     };
+     const resolveOnce = finish(resolve);
+     const rejectOnce = finish(reject);
+
+     streamDone.then(resolveOnce, (err) => {
+       ffmpeg.kill('SIGKILL');
+       rejectOnce(err);
+     });
+
+     ffmpeg.on('error', rejectOnce);
+     ffmpeg.on('close', (code) => {
+       if (code !== 0) {
+         rejectOnce(new Error(`ffmpeg exited with code ${code}`));
+         return;
+       }
+       streamDone.then(resolveOnce, rejectOnce);
+     });
+   });
+
+   if (leftoverBytes.length) {
+     const usableBytes = leftoverBytes.length - (leftoverBytes.length % 4);
+     if (usableBytes > 0) {
+       const tailFloats = new Float32Array(
+         leftoverBytes.buffer,
+         leftoverBytes.byteOffset,
+         usableBytes / Float32Array.BYTES_PER_ELEMENT,
+       );
+       if (tailFloats.length) {
+         frameScratch.set(tailFloats, pendingLen);
+         pendingLen += tailFloats.length;
+       }
+     }
+   }
+
+   if (pendingLen) {
+     const padded = new Float32Array(windowSize);
+     padded.set(frameScratch.subarray(0, pendingLen));
+     const curSample = processedSamples;
+     await processFrame(padded, curSample);
+     processedSamples += windowSize;
+   }
+
+   if (currentSpeech.start !== undefined) {
+     currentSpeech.end = totalSamples;
+     if (currentSpeech.end - currentSpeech.start > minSpeechSamples) {
+       speeches.push(currentSpeech);
+     }
+   }
+
+   for (let idx = 0; idx < speeches.length; idx += 1) {
+     const speech = speeches[idx];
+     const prevEnd = idx === 0 ? 0 : speeches[idx - 1].end;
+     const nextStart = idx === speeches.length - 1 ? totalSamples : speeches[idx + 1].start;
+     const padStart = Math.max(speech.start - speechPadSamples, prevEnd);
+     const padEnd = Math.min(speech.end + speechPadSamples, nextStart);
+     speech.start = Math.max(0, Math.floor(padStart));
+     speech.end = Math.min(totalSamples, Math.floor(padEnd));
+   }
+
+   const convertSeconds = (samples) => +(samples / sr).toFixed(timeResolution);
+   const result = returnSeconds
+     ? speeches.map(({ start, end }) => ({
+         start: convertSeconds(start),
+         end: convertSeconds(end),
+         startSample: start,
+         endSample: end,
+       }))
+     : speeches.map(({ start, end }) => ({
+         start,
+         end,
+         startSeconds: convertSeconds(start),
+         endSeconds: convertSeconds(end),
+       }));
+
+   if (returnMetadata) {
+     return { timestamps: result, totalSamples };
+   }
+
+   return result;
+ }
+
+ async function writeStrippedAudio(inputPath, segmentsSeconds, sampleRate, outputPath) {
+   if (!segmentsSeconds || !segmentsSeconds.length) {
+     throw new Error('No valid speech segments to write');
+   }
+   if (!sampleRate) {
+     throw new Error('Sample rate is required to write WAV');
+   }
+   const expr = segmentsSeconds
+     .map(({ start, end }) => `between(t\\,${start.toFixed(6)}\\,${end.toFixed(6)})`)
+     .join('+');
+   const filter = `aselect='${expr}',asetpts=N/SR/TB`;
+
+   const args = [
+     '-y',
+     '-v',
+     'error',
+     '-i',
+     inputPath,
+     '-af',
+     filter,
+     '-ac',
+     '1',
+     '-ar',
+     String(sampleRate),
+     outputPath,
+   ];
+
+   await new Promise((resolve, reject) => {
+     const ffmpeg = spawn('ffmpeg', args, { stdio: ['ignore', 'ignore', 'inherit'] });
+     ffmpeg.on('error', reject);
+     ffmpeg.on('close', (code) => {
+       if (code !== 0) {
+         reject(new Error(`ffmpeg exited with code ${code}`));
+         return;
+       }
+       resolve();
+     });
+   });
+ }
+
+ module.exports = {
+   loadSileroVad,
+   getSpeechTimestamps,
+   writeStrippedAudio,
+   WEIGHTS,
+ };
package/package.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "name": "@jjhbw/silero-vad",
+   "version": "1.0.0",
+   "description": "Node.js bindings for Silero VAD",
+   "main": "index.js",
+   "exports": {
+     ".": "./index.js"
+   },
+   "bin": {
+     "silero-vad-cli": "cli.js"
+   },
+   "scripts": {
+     "bench": "node bench.js --audio data/long.mp3 --runs 3 --warmup 1",
+     "test": "node --test"
+   },
+   "author": "jjhbw",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/jjhbw/silero-vad.git"
+   },
+   "bugs": {
+     "url": "https://github.com/jjhbw/silero-vad/issues"
+   },
+   "homepage": "https://github.com/jjhbw/silero-vad#readme",
+   "files": [
+     "index.js",
+     "lib.js",
+     "cli.js",
+     "weights/",
+     "README.md",
+     "LICENSE"
+   ],
+   "dependencies": {
+     "onnxruntime-node": "^1.23.2"
+   }
+ }
3 binary files ADDED (the ONNX model weights under package/weights/; binary contents not shown)