@jjhbw/silero-vad 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -5
- package/cli.js +45 -12
- package/lib.js +8 -23
- package/package.json +2 -2
package/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
# Silero VAD Node
|
|
1
|
+
# Silero VAD Node
|
|
2
2
|
|
|
3
3
|
Minimal Node.js wrapper around the Silero VAD ONNX model, with a small CLI and parity tests against the Python implementation. The Node implementation runs VAD and silence stripping directly from ffmpeg streams to keep memory usage low on long files.
|
|
4
4
|
|
|
@@ -27,9 +27,7 @@ const {
|
|
|
27
27
|
const inputs = ["input.wav", "other.mp3"];
|
|
28
28
|
for (const inputPath of inputs) {
|
|
29
29
|
vad.resetStates(); // per file/stream
|
|
30
|
-
const ts = await getSpeechTimestamps(inputPath, vad, {
|
|
31
|
-
returnSeconds: true,
|
|
32
|
-
});
|
|
30
|
+
const ts = await getSpeechTimestamps(inputPath, vad);
|
|
33
31
|
// Each entry includes both seconds (start/end) and samples (startSample/endSample).
|
|
34
32
|
console.log(inputPath, ts);
|
|
35
33
|
// Example return value:
|
|
@@ -73,7 +71,6 @@ Options:
|
|
|
73
71
|
- `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
|
|
74
72
|
- `--time-resolution <n>`: decimal places for seconds output (default `3`).
|
|
75
73
|
- `--neg-threshold <float>`: override the negative threshold (default `max(threshold - 0.15, 0.01)`).
|
|
76
|
-
- `--seconds`: output timestamps in seconds (default on).
|
|
77
74
|
- `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
|
|
78
75
|
- `--strip-silence`: write a new WAV file with silences removed.
|
|
79
76
|
- `--output-dir <path>`: output directory for strip-silence files (default: input dir).
|
package/cli.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
const fs = require('fs');
|
|
4
4
|
const fsp = fs.promises;
|
|
5
5
|
const path = require('path');
|
|
6
|
+
const { spawn } = require('child_process');
|
|
6
7
|
const {
|
|
7
8
|
loadSileroVad,
|
|
8
9
|
getSpeechTimestamps,
|
|
@@ -33,15 +34,13 @@ async function main() {
|
|
|
33
34
|
// reuse session, reset stream state per file
|
|
34
35
|
vad.resetStates();
|
|
35
36
|
const t0 = performance.now();
|
|
36
|
-
const { timestamps, totalSamples } = await getSpeechTimestamps(audioPath, vad, {
|
|
37
|
+
const timestamps = await getSpeechTimestamps(audioPath, vad, {
|
|
37
38
|
threshold: args.threshold,
|
|
38
39
|
minSpeechDurationMs: args.minSpeechDurationMs,
|
|
39
40
|
minSilenceDurationMs: args.minSilenceDurationMs,
|
|
40
41
|
speechPadMs: args.speechPadMs,
|
|
41
|
-
returnSeconds: args.seconds,
|
|
42
42
|
timeResolution: args.timeResolution,
|
|
43
43
|
negThreshold: args.negThreshold,
|
|
44
|
-
returnMetadata: true,
|
|
45
44
|
});
|
|
46
45
|
const t1 = performance.now();
|
|
47
46
|
results.push({ file: audioPath, timestamps });
|
|
@@ -49,10 +48,10 @@ async function main() {
|
|
|
49
48
|
const mem = process.memoryUsage();
|
|
50
49
|
const speechSeconds = getSpeechDurationSeconds(
|
|
51
50
|
timestamps,
|
|
52
|
-
|
|
51
|
+
true,
|
|
53
52
|
effectiveSampleRate,
|
|
54
53
|
);
|
|
55
|
-
const durationSeconds = totalSamples / effectiveSampleRate;
|
|
54
|
+
const durationSeconds = await getAudioDurationSeconds(audioPath);
|
|
56
55
|
const silenceSeconds = Math.max(0, durationSeconds - speechSeconds);
|
|
57
56
|
const totalForPct = durationSeconds > 0 ? durationSeconds : 1;
|
|
58
57
|
const speechPct = (speechSeconds / totalForPct) * 100;
|
|
@@ -99,9 +98,9 @@ async function main() {
|
|
|
99
98
|
}
|
|
100
99
|
|
|
101
100
|
if (args.stripSilence) {
|
|
102
|
-
const segmentsSeconds = timestamps.map(({ start, end
|
|
103
|
-
start
|
|
104
|
-
end
|
|
101
|
+
const segmentsSeconds = timestamps.map(({ start, end }) => ({
|
|
102
|
+
start,
|
|
103
|
+
end,
|
|
105
104
|
}));
|
|
106
105
|
if (!segmentsSeconds.length) {
|
|
107
106
|
console.info(`strip_silence=skipped (no speech detected)`);
|
|
@@ -164,7 +163,6 @@ function parseArgs(argv) {
|
|
|
164
163
|
speechPadMs: 30,
|
|
165
164
|
timeResolution: 3,
|
|
166
165
|
negThreshold: null,
|
|
167
|
-
seconds: true,
|
|
168
166
|
charsPerSecond: 4,
|
|
169
167
|
showTimeline: false,
|
|
170
168
|
stripSilence: false,
|
|
@@ -220,8 +218,6 @@ function parseArgs(argv) {
|
|
|
220
218
|
out.negThreshold = value;
|
|
221
219
|
}
|
|
222
220
|
i += 1;
|
|
223
|
-
} else if (arg === '--seconds') {
|
|
224
|
-
out.seconds = true;
|
|
225
221
|
} else if (arg === '--cps') {
|
|
226
222
|
const value = parseFloat(argv[i + 1]);
|
|
227
223
|
out.showTimeline = true;
|
|
@@ -254,7 +250,6 @@ Options:
|
|
|
254
250
|
--speech-pad-ms <ms> Padding added to speech segments in ms (default: 30)
|
|
255
251
|
--time-resolution <n> Decimal places for seconds output (default: 3)
|
|
256
252
|
--neg-threshold <f> Negative threshold override (default: max(threshold - 0.15, 0.01))
|
|
257
|
-
--seconds Output timestamps in seconds (default: on)
|
|
258
253
|
--cps <float> Enable timeline visualization; chars per second (default: 4)
|
|
259
254
|
--strip-silence Write a new file with all silences removed
|
|
260
255
|
--output-dir <path> Output directory for strip-silence files (default: input dir)
|
|
@@ -319,6 +314,44 @@ function getStripOutputPath(inputPath, outputDir) {
|
|
|
319
314
|
return path.join(dir, `${base}_speech.wav`);
|
|
320
315
|
}
|
|
321
316
|
|
|
317
|
+
async function getAudioDurationSeconds(inputPath) {
|
|
318
|
+
const args = [
|
|
319
|
+
'-v',
|
|
320
|
+
'error',
|
|
321
|
+
// Use packet timestamps to handle containers where format duration is unreliable.
|
|
322
|
+
'-select_streams',
|
|
323
|
+
'a:0',
|
|
324
|
+
'-show_entries',
|
|
325
|
+
'packet=pts_time',
|
|
326
|
+
'-of',
|
|
327
|
+
'csv=p=0',
|
|
328
|
+
inputPath,
|
|
329
|
+
];
|
|
330
|
+
return new Promise((resolve, reject) => {
|
|
331
|
+
const ffprobe = spawn('ffprobe', args, { stdio: ['ignore', 'pipe', 'inherit'] });
|
|
332
|
+
let output = '';
|
|
333
|
+
ffprobe.stdout.on('data', (chunk) => {
|
|
334
|
+
output += chunk.toString();
|
|
335
|
+
});
|
|
336
|
+
ffprobe.on('error', reject);
|
|
337
|
+
ffprobe.on('close', (code) => {
|
|
338
|
+
if (code !== 0) {
|
|
339
|
+
reject(new Error(`ffprobe exited with code ${code}`));
|
|
340
|
+
return;
|
|
341
|
+
}
|
|
342
|
+
const lines = output.trim().split('\n');
|
|
343
|
+
for (let i = lines.length - 1; i >= 0; i -= 1) {
|
|
344
|
+
const value = parseFloat(lines[i]);
|
|
345
|
+
if (Number.isFinite(value)) {
|
|
346
|
+
resolve(value);
|
|
347
|
+
return;
|
|
348
|
+
}
|
|
349
|
+
}
|
|
350
|
+
reject(new Error('Unable to read audio duration from ffprobe output'));
|
|
351
|
+
});
|
|
352
|
+
});
|
|
353
|
+
}
|
|
354
|
+
|
|
322
355
|
async function ensureUniquePath(outputPath) {
|
|
323
356
|
try {
|
|
324
357
|
await fsp.access(outputPath);
|
package/lib.js
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
const fs = require('fs');
|
|
2
|
-
const fsp = fs.promises;
|
|
3
1
|
const path = require('path');
|
|
4
2
|
const { spawn } = require('child_process');
|
|
5
3
|
const ort = require('onnxruntime-node');
|
|
@@ -120,7 +118,6 @@ async function loadSileroVad(model = 'default', opts = {}) {
|
|
|
120
118
|
* Example: a 50 ms pause will not split a segment at 100 ms.
|
|
121
119
|
* @param {number} [options.speechPadMs=30] Pad each segment on both sides, clamped
|
|
122
120
|
* to neighbors. Example: [1.000, 2.000] -> ~[0.970, 2.030].
|
|
123
|
-
* @param {boolean} [options.returnSeconds=false]
|
|
124
121
|
* @param {number} [options.timeResolution=3] Decimal places for seconds output.
|
|
125
122
|
* Example: timeResolution=1 turns 1.23456 into 1.2.
|
|
126
123
|
* @param {number} [options.negThreshold=threshold-0.15] End speech when prob dips
|
|
@@ -128,7 +125,8 @@ async function loadSileroVad(model = 'default', opts = {}) {
|
|
|
128
125
|
* negThreshold=0.35 keeps speech open during brief 0.4 dips.
|
|
129
126
|
* Default clamps to >= 0.01 to avoid an always-on end condition.
|
|
130
127
|
* @param {number} [options.sampleRate]
|
|
131
|
-
* @
|
|
128
|
+
* @returns {Promise<Array<{start: number, end: number, startSample: number, endSample: number}>>}
|
|
129
|
+
* start/end are seconds; startSample/endSample are sample indices.
|
|
132
130
|
*/
|
|
133
131
|
async function getSpeechTimestamps(
|
|
134
132
|
inputPath,
|
|
@@ -138,11 +136,9 @@ async function getSpeechTimestamps(
|
|
|
138
136
|
minSpeechDurationMs = 250,
|
|
139
137
|
minSilenceDurationMs = 100,
|
|
140
138
|
speechPadMs = 30,
|
|
141
|
-
returnSeconds = false,
|
|
142
139
|
timeResolution = 3,
|
|
143
140
|
negThreshold,
|
|
144
141
|
sampleRate,
|
|
145
|
-
returnMetadata = false,
|
|
146
142
|
} = {},
|
|
147
143
|
) {
|
|
148
144
|
if (!vad) {
|
|
@@ -351,23 +347,12 @@ async function getSpeechTimestamps(
|
|
|
351
347
|
}
|
|
352
348
|
|
|
353
349
|
const convertSeconds = (samples) => +(samples / sr).toFixed(timeResolution);
|
|
354
|
-
const result =
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
}))
|
|
361
|
-
: speeches.map(({ start, end }) => ({
|
|
362
|
-
start,
|
|
363
|
-
end,
|
|
364
|
-
startSeconds: convertSeconds(start),
|
|
365
|
-
endSeconds: convertSeconds(end),
|
|
366
|
-
}));
|
|
367
|
-
|
|
368
|
-
if (returnMetadata) {
|
|
369
|
-
return { timestamps: result, totalSamples };
|
|
370
|
-
}
|
|
350
|
+
const result = speeches.map(({ start, end }) => ({
|
|
351
|
+
start: convertSeconds(start),
|
|
352
|
+
end: convertSeconds(end),
|
|
353
|
+
startSample: start,
|
|
354
|
+
endSample: end,
|
|
355
|
+
}));
|
|
371
356
|
|
|
372
357
|
return result;
|
|
373
358
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@jjhbw/silero-vad",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.0.3",
|
|
4
4
|
"description": "Node.js bindings for Silero VAD",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"exports": {
|
|
@@ -17,7 +17,7 @@
|
|
|
17
17
|
"license": "MIT",
|
|
18
18
|
"repository": {
|
|
19
19
|
"type": "git",
|
|
20
|
-
"url": "https://github.com/jjhbw/silero-vad.git"
|
|
20
|
+
"url": "git+https://github.com/jjhbw/silero-vad.git"
|
|
21
21
|
},
|
|
22
22
|
"bugs": {
|
|
23
23
|
"url": "https://github.com/jjhbw/silero-vad/issues"
|