@jjhbw/silero-vad 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. package/README.md +2 -5
  2. package/cli.js +45 -12
  3. package/lib.js +8 -23
  4. package/package.json +2 -2
package/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Silero VAD Node Fork
1
+ # Silero VAD Node
2
2
 
3
3
  Minimal Node.js wrapper around the Silero VAD ONNX model, with a small CLI and parity tests against the Python implementation. The Node implementation runs VAD and silence stripping directly from ffmpeg streams to keep memory usage low on long files.
4
4
 
@@ -27,9 +27,7 @@ const {
27
27
  const inputs = ["input.wav", "other.mp3"];
28
28
  for (const inputPath of inputs) {
29
29
  vad.resetStates(); // per file/stream
30
- const ts = await getSpeechTimestamps(inputPath, vad, {
31
- returnSeconds: true,
32
- });
30
+ const ts = await getSpeechTimestamps(inputPath, vad);
33
31
  // Each entry includes both seconds (start/end) and samples (startSample/endSample).
34
32
  console.log(inputPath, ts);
35
33
  // Example return value:
@@ -73,7 +71,6 @@ Options:
73
71
  - `--speech-pad-ms <ms>`: padding added to each speech segment in ms (default `30`).
74
72
  - `--time-resolution <n>`: decimal places for seconds output (default `3`).
75
73
  - `--neg-threshold <float>`: override the negative threshold (default `max(threshold - 0.15, 0.01)`).
76
- - `--seconds`: output timestamps in seconds (default on).
77
74
  - `--cps <float>`: enable the timeline visualization and set chars per second (default `4`).
78
75
  - `--strip-silence`: write a new WAV file with silences removed.
79
76
  - `--output-dir <path>`: output directory for strip-silence files (default: input dir).
package/cli.js CHANGED
@@ -3,6 +3,7 @@
3
3
  const fs = require('fs');
4
4
  const fsp = fs.promises;
5
5
  const path = require('path');
6
+ const { spawn } = require('child_process');
6
7
  const {
7
8
  loadSileroVad,
8
9
  getSpeechTimestamps,
@@ -33,15 +34,13 @@ async function main() {
33
34
  // reuse session, reset stream state per file
34
35
  vad.resetStates();
35
36
  const t0 = performance.now();
36
- const { timestamps, totalSamples } = await getSpeechTimestamps(audioPath, vad, {
37
+ const timestamps = await getSpeechTimestamps(audioPath, vad, {
37
38
  threshold: args.threshold,
38
39
  minSpeechDurationMs: args.minSpeechDurationMs,
39
40
  minSilenceDurationMs: args.minSilenceDurationMs,
40
41
  speechPadMs: args.speechPadMs,
41
- returnSeconds: args.seconds,
42
42
  timeResolution: args.timeResolution,
43
43
  negThreshold: args.negThreshold,
44
- returnMetadata: true,
45
44
  });
46
45
  const t1 = performance.now();
47
46
  results.push({ file: audioPath, timestamps });
@@ -49,10 +48,10 @@ async function main() {
49
48
  const mem = process.memoryUsage();
50
49
  const speechSeconds = getSpeechDurationSeconds(
51
50
  timestamps,
52
- args.seconds,
51
+ true,
53
52
  effectiveSampleRate,
54
53
  );
55
- const durationSeconds = totalSamples / effectiveSampleRate;
54
+ const durationSeconds = await getAudioDurationSeconds(audioPath);
56
55
  const silenceSeconds = Math.max(0, durationSeconds - speechSeconds);
57
56
  const totalForPct = durationSeconds > 0 ? durationSeconds : 1;
58
57
  const speechPct = (speechSeconds / totalForPct) * 100;
@@ -99,9 +98,9 @@ async function main() {
99
98
  }
100
99
 
101
100
  if (args.stripSilence) {
102
- const segmentsSeconds = timestamps.map(({ start, end, startSeconds, endSeconds }) => ({
103
- start: args.seconds ? start : startSeconds,
104
- end: args.seconds ? end : endSeconds,
101
+ const segmentsSeconds = timestamps.map(({ start, end }) => ({
102
+ start,
103
+ end,
105
104
  }));
106
105
  if (!segmentsSeconds.length) {
107
106
  console.info(`strip_silence=skipped (no speech detected)`);
@@ -164,7 +163,6 @@ function parseArgs(argv) {
164
163
  speechPadMs: 30,
165
164
  timeResolution: 3,
166
165
  negThreshold: null,
167
- seconds: true,
168
166
  charsPerSecond: 4,
169
167
  showTimeline: false,
170
168
  stripSilence: false,
@@ -220,8 +218,6 @@ function parseArgs(argv) {
220
218
  out.negThreshold = value;
221
219
  }
222
220
  i += 1;
223
- } else if (arg === '--seconds') {
224
- out.seconds = true;
225
221
  } else if (arg === '--cps') {
226
222
  const value = parseFloat(argv[i + 1]);
227
223
  out.showTimeline = true;
@@ -254,7 +250,6 @@ Options:
254
250
  --speech-pad-ms <ms> Padding added to speech segments in ms (default: 30)
255
251
  --time-resolution <n> Decimal places for seconds output (default: 3)
256
252
  --neg-threshold <f> Negative threshold override (default: max(threshold - 0.15, 0.01))
257
- --seconds Output timestamps in seconds (default: on)
258
253
  --cps <float> Enable timeline visualization; chars per second (default: 4)
259
254
  --strip-silence Write a new file with all silences removed
260
255
  --output-dir <path> Output directory for strip-silence files (default: input dir)
@@ -319,6 +314,44 @@ function getStripOutputPath(inputPath, outputDir) {
319
314
  return path.join(dir, `${base}_speech.wav`);
320
315
  }
321
316
 
317
+ async function getAudioDurationSeconds(inputPath) {
318
+ const args = [
319
+ '-v',
320
+ 'error',
321
+ // Use packet timestamps to handle containers where format duration is unreliable.
322
+ '-select_streams',
323
+ 'a:0',
324
+ '-show_entries',
325
+ 'packet=pts_time',
326
+ '-of',
327
+ 'csv=p=0',
328
+ inputPath,
329
+ ];
330
+ return new Promise((resolve, reject) => {
331
+ const ffprobe = spawn('ffprobe', args, { stdio: ['ignore', 'pipe', 'inherit'] });
332
+ let output = '';
333
+ ffprobe.stdout.on('data', (chunk) => {
334
+ output += chunk.toString();
335
+ });
336
+ ffprobe.on('error', reject);
337
+ ffprobe.on('close', (code) => {
338
+ if (code !== 0) {
339
+ reject(new Error(`ffprobe exited with code ${code}`));
340
+ return;
341
+ }
342
+ const lines = output.trim().split('\n');
343
+ for (let i = lines.length - 1; i >= 0; i -= 1) {
344
+ const value = parseFloat(lines[i]);
345
+ if (Number.isFinite(value)) {
346
+ resolve(value);
347
+ return;
348
+ }
349
+ }
350
+ reject(new Error('Unable to read audio duration from ffprobe output'));
351
+ });
352
+ });
353
+ }
354
+
322
355
  async function ensureUniquePath(outputPath) {
323
356
  try {
324
357
  await fsp.access(outputPath);
package/lib.js CHANGED
@@ -1,5 +1,3 @@
1
- const fs = require('fs');
2
- const fsp = fs.promises;
3
1
  const path = require('path');
4
2
  const { spawn } = require('child_process');
5
3
  const ort = require('onnxruntime-node');
@@ -120,7 +118,6 @@ async function loadSileroVad(model = 'default', opts = {}) {
120
118
  * Example: a 50 ms pause will not split a segment at 100 ms.
121
119
  * @param {number} [options.speechPadMs=30] Pad each segment on both sides, clamped
122
120
  * to neighbors. Example: [1.000, 2.000] -> ~[0.970, 2.030].
123
- * @param {boolean} [options.returnSeconds=false]
124
121
  * @param {number} [options.timeResolution=3] Decimal places for seconds output.
125
122
  * Example: timeResolution=1 turns 1.23456 into 1.2.
126
123
  * @param {number} [options.negThreshold=threshold-0.15] End speech when prob dips
@@ -128,7 +125,8 @@ async function loadSileroVad(model = 'default', opts = {}) {
128
125
  * negThreshold=0.35 keeps speech open during brief 0.4 dips.
129
126
  * Default clamps to >= 0.01 to avoid an always-on end condition.
130
127
  * @param {number} [options.sampleRate]
131
- * @param {boolean} [options.returnMetadata=false]
128
+ * @returns {Promise<Array<{start: number, end: number, startSample: number, endSample: number}>>}
129
+ * start/end are seconds; startSample/endSample are sample indices.
132
130
  */
133
131
  async function getSpeechTimestamps(
134
132
  inputPath,
@@ -138,11 +136,9 @@ async function getSpeechTimestamps(
138
136
  minSpeechDurationMs = 250,
139
137
  minSilenceDurationMs = 100,
140
138
  speechPadMs = 30,
141
- returnSeconds = false,
142
139
  timeResolution = 3,
143
140
  negThreshold,
144
141
  sampleRate,
145
- returnMetadata = false,
146
142
  } = {},
147
143
  ) {
148
144
  if (!vad) {
@@ -351,23 +347,12 @@ async function getSpeechTimestamps(
351
347
  }
352
348
 
353
349
  const convertSeconds = (samples) => +(samples / sr).toFixed(timeResolution);
354
- const result = returnSeconds
355
- ? speeches.map(({ start, end }) => ({
356
- start: convertSeconds(start),
357
- end: convertSeconds(end),
358
- startSample: start,
359
- endSample: end,
360
- }))
361
- : speeches.map(({ start, end }) => ({
362
- start,
363
- end,
364
- startSeconds: convertSeconds(start),
365
- endSeconds: convertSeconds(end),
366
- }));
367
-
368
- if (returnMetadata) {
369
- return { timestamps: result, totalSamples };
370
- }
350
+ const result = speeches.map(({ start, end }) => ({
351
+ start: convertSeconds(start),
352
+ end: convertSeconds(end),
353
+ startSample: start,
354
+ endSample: end,
355
+ }));
371
356
 
372
357
  return result;
373
358
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jjhbw/silero-vad",
3
- "version": "1.0.2",
3
+ "version": "1.0.3",
4
4
  "description": "Node.js bindings for Silero VAD",
5
5
  "main": "index.js",
6
6
  "exports": {
@@ -17,7 +17,7 @@
17
17
  "license": "MIT",
18
18
  "repository": {
19
19
  "type": "git",
20
- "url": "https://github.com/jjhbw/silero-vad.git"
20
+ "url": "git+https://github.com/jjhbw/silero-vad.git"
21
21
  },
22
22
  "bugs": {
23
23
  "url": "https://github.com/jjhbw/silero-vad/issues"