@tensamin/audio 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -121,13 +121,32 @@ To disable noise suppression, set noiseSuppression.enabled to false.`
121
121
  };
122
122
 
123
123
  // src/vad/vad-node.ts
124
- var energyVadWorkletCode = `
124
+ var createEnergyVadWorkletCode = (vadConfig) => {
125
+ const energyParams = vadConfig?.energyVad || {};
126
+ const smoothing = energyParams.smoothing ?? 0.95;
127
+ const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
128
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
129
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-3;
130
+ const minSNR = energyParams.minSNR ?? 2;
131
+ const snrRange = energyParams.snrRange ?? 8;
132
+ return `
125
133
  class EnergyVadProcessor extends AudioWorkletProcessor {
126
134
  constructor() {
127
135
  super();
128
- this.smoothing = 0.95;
136
+ this.smoothing = ${smoothing};
129
137
  this.energy = 0;
130
- this.noiseFloor = 0.001;
138
+ this.noiseFloor = ${initialNoiseFloor};
139
+ this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
140
+ this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
141
+ this.minSNR = ${minSNR};
142
+ this.snrRange = ${snrRange};
143
+ this.isSpeaking = false;
144
+
145
+ this.port.onmessage = (event) => {
146
+ if (event.data && event.data.isSpeaking !== undefined) {
147
+ this.isSpeaking = event.data.isSpeaking;
148
+ }
149
+ };
131
150
  }
132
151
 
133
152
  process(inputs, outputs, parameters) {
@@ -135,41 +154,54 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
135
154
  if (!input || !input.length) return true;
136
155
  const channel = input[0];
137
156
 
138
- // Calculate RMS
157
+ // Calculate RMS (Root Mean Square) energy
139
158
  let sum = 0;
140
159
  for (let i = 0; i < channel.length; i++) {
141
160
  sum += channel[i] * channel[i];
142
161
  }
143
162
  const rms = Math.sqrt(sum / channel.length);
144
163
 
145
- // Simple adaptive noise floor (very basic)
146
- if (rms < this.noiseFloor) {
147
- this.noiseFloor = this.noiseFloor * 0.99 + rms * 0.01;
148
- } else {
149
- this.noiseFloor = this.noiseFloor * 0.999 + rms * 0.001;
164
+ // Adaptive noise floor estimation - ONLY during silence
165
+ // This prevents the noise floor from rising during speech
166
+ if (!this.isSpeaking) {
167
+ if (rms < this.noiseFloor) {
168
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
169
+ } else {
170
+ // Even during silence, if we detect a loud signal, adapt very slowly
171
+ // This could be brief noise we haven't classified as speech yet
172
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
173
+ }
150
174
  }
175
+ // During speech, freeze the noise floor to maintain consistent detection
151
176
 
152
- // Calculate "probability" based on SNR
153
- // This is a heuristic mapping from energy to 0-1
177
+ // Calculate Signal-to-Noise Ratio (SNR)
154
178
  const snr = rms / (this.noiseFloor + 1e-6);
155
- const probability = Math.min(1, Math.max(0, (snr - 1.5) / 10)); // Arbitrary scaling
179
+
180
+ // Map SNR to probability (0-1)
181
+ // Probability is 0 when SNR <= minSNR
182
+ // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
183
+ // Probability is 1 when SNR >= (minSNR + snrRange)
184
+ const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
156
185
 
157
- this.port.postMessage({ probability });
186
+ this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
158
187
 
159
188
  return true;
160
189
  }
161
190
  }
162
191
  registerProcessor('energy-vad-processor', EnergyVadProcessor);
163
192
  `;
193
+ };
164
194
  var EnergyVADPlugin = class {
165
195
  name = "energy-vad";
196
+ workletNode = null;
166
197
  async createNode(context, config, onDecision) {
167
198
  if (!config?.enabled) {
168
199
  console.log("VAD disabled, using passthrough node");
169
200
  const pass = context.createGain();
170
201
  return pass;
171
202
  }
172
- const blob = new Blob([energyVadWorkletCode], {
203
+ const workletCode = createEnergyVadWorkletCode(config);
204
+ const blob = new Blob([workletCode], {
173
205
  type: "application/javascript"
174
206
  });
175
207
  const url = URL.createObjectURL(blob);
@@ -188,6 +220,7 @@ var EnergyVADPlugin = class {
188
220
  let node;
189
221
  try {
190
222
  node = new AudioWorkletNode(context, "energy-vad-processor");
223
+ this.workletNode = node;
191
224
  console.log("Energy VAD node created successfully");
192
225
  } catch (e) {
193
226
  const error = new Error(
@@ -213,6 +246,11 @@ var EnergyVADPlugin = class {
213
246
  };
214
247
  return node;
215
248
  }
249
+ updateSpeakingState(isSpeaking) {
250
+ if (this.workletNode) {
251
+ this.workletNode.port.postMessage({ isSpeaking });
252
+ }
253
+ }
216
254
  };
217
255
 
218
256
  // src/extensibility/plugins.ts
@@ -249,31 +287,60 @@ var VADStateMachine = class {
249
287
  currentState = "silent";
250
288
  lastSpeechTime = 0;
251
289
  speechStartTime = 0;
290
+ lastSilenceTime = 0;
252
291
  frameDurationMs = 20;
253
292
  // Assumed frame duration, updated by calls
254
293
  constructor(config) {
255
294
  this.config = {
256
295
  enabled: config?.enabled ?? true,
257
296
  pluginName: config?.pluginName ?? "energy-vad",
258
- startThreshold: config?.startThreshold ?? 0.5,
259
- stopThreshold: config?.stopThreshold ?? 0.4,
260
- hangoverMs: config?.hangoverMs ?? 300,
261
- preRollMs: config?.preRollMs ?? 200
297
+ // Voice-optimized defaults
298
+ startThreshold: config?.startThreshold ?? 0.6,
299
+ // Higher threshold to avoid noise
300
+ stopThreshold: config?.stopThreshold ?? 0.45,
301
+ // Balanced for voice
302
+ hangoverMs: config?.hangoverMs ?? 400,
303
+ // Smooth for natural speech
304
+ preRollMs: config?.preRollMs ?? 250,
305
+ // Generous pre-roll
306
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
307
+ minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
308
+ energyVad: {
309
+ smoothing: config?.energyVad?.smoothing ?? 0.95,
310
+ initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
311
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
312
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
313
+ minSNR: config?.energyVad?.minSNR ?? 2,
314
+ snrRange: config?.energyVad?.snrRange ?? 8
315
+ }
262
316
  };
317
+ this.lastSilenceTime = Date.now();
263
318
  }
264
319
  updateConfig(config) {
265
320
  this.config = { ...this.config, ...config };
266
321
  }
267
322
  processFrame(probability, timestamp) {
268
- const { startThreshold, stopThreshold, hangoverMs } = this.config;
323
+ const {
324
+ startThreshold,
325
+ stopThreshold,
326
+ hangoverMs,
327
+ minSpeechDurationMs,
328
+ minSilenceDurationMs
329
+ } = this.config;
269
330
  let newState = this.currentState;
270
331
  if (this.currentState === "silent" || this.currentState === "speech_ending") {
271
332
  if (probability >= startThreshold) {
272
- newState = "speech_starting";
273
- this.speechStartTime = timestamp;
274
- this.lastSpeechTime = timestamp;
333
+ const silenceDuration = timestamp - this.lastSilenceTime;
334
+ if (silenceDuration >= minSilenceDurationMs) {
335
+ newState = "speech_starting";
336
+ this.speechStartTime = timestamp;
337
+ this.lastSpeechTime = timestamp;
338
+ } else {
339
+ newState = "silent";
340
+ }
275
341
  } else {
276
342
  newState = "silent";
343
+ this.lastSilenceTime = timestamp;
277
344
  }
278
345
  } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
279
346
  if (probability >= stopThreshold) {
@@ -281,10 +348,15 @@ var VADStateMachine = class {
281
348
  this.lastSpeechTime = timestamp;
282
349
  } else {
283
350
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
351
+ const speechDuration = timestamp - this.speechStartTime;
284
352
  if (timeSinceSpeech < hangoverMs) {
285
353
  newState = "speaking";
354
+ } else if (speechDuration < minSpeechDurationMs) {
355
+ newState = "silent";
356
+ this.lastSilenceTime = timestamp;
286
357
  } else {
287
358
  newState = "speech_ending";
359
+ this.lastSilenceTime = timestamp;
288
360
  }
289
361
  }
290
362
  }
@@ -303,7 +375,9 @@ var VADStateMachine = class {
303
375
  async function createAudioPipeline(sourceTrack, config = {}) {
304
376
  const context = getAudioContext();
305
377
  registerPipeline();
306
- const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl);
378
+ const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
379
+ config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
380
+ );
307
381
  const vadEnabled = config.vad?.enabled !== false;
308
382
  const fullConfig = {
309
383
  noiseSuppression: {
@@ -312,13 +386,38 @@ async function createAudioPipeline(sourceTrack, config = {}) {
312
386
  },
313
387
  vad: {
314
388
  enabled: vadEnabled,
389
+ // Voice-optimized defaults (will be overridden by config)
390
+ startThreshold: 0.6,
391
+ stopThreshold: 0.45,
392
+ hangoverMs: 400,
393
+ preRollMs: 250,
394
+ minSpeechDurationMs: 100,
395
+ minSilenceDurationMs: 150,
396
+ energyVad: {
397
+ smoothing: 0.95,
398
+ initialNoiseFloor: 1e-3,
399
+ noiseFloorAdaptRateQuiet: 0.01,
400
+ noiseFloorAdaptRateLoud: 1e-3,
401
+ minSNR: 2,
402
+ snrRange: 8
403
+ },
315
404
  ...config.vad
316
405
  },
317
406
  output: {
318
407
  speechGain: 1,
319
- silenceGain: vadEnabled ? 0 : 1,
320
- // If no VAD, always output audio
321
- gainRampTime: 0.02,
408
+ silenceGain: 0,
409
+ // Full mute for voice-only
410
+ gainRampTime: 0.015,
411
+ // Fast but smooth transitions
412
+ smoothTransitions: true,
413
+ maxGainDb: 6,
414
+ enableCompression: false,
415
+ compression: {
416
+ threshold: -24,
417
+ ratio: 3,
418
+ attack: 3e-3,
419
+ release: 0.05
420
+ },
322
421
  ...config.output
323
422
  },
324
423
  livekit: { manageTrackMute: false, ...config.livekit }
@@ -329,7 +428,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
329
428
  output: fullConfig.output
330
429
  });
331
430
  if (!sourceTrack || sourceTrack.kind !== "audio") {
332
- throw new Error("createAudioPipeline requires a valid audio MediaStreamTrack");
431
+ throw new Error(
432
+ "createAudioPipeline requires a valid audio MediaStreamTrack"
433
+ );
333
434
  }
334
435
  if (sourceTrack.readyState === "ended") {
335
436
  throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
@@ -343,10 +444,7 @@ async function createAudioPipeline(sourceTrack, config = {}) {
343
444
  const nsPlugin = getNoiseSuppressionPlugin(
344
445
  fullConfig.noiseSuppression?.pluginName
345
446
  );
346
- nsNode = await nsPlugin.createNode(
347
- context,
348
- fullConfig.noiseSuppression
349
- );
447
+ nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
350
448
  } catch (error) {
351
449
  const err = error instanceof Error ? error : new Error(String(error));
352
450
  console.error("Failed to create noise suppression node:", err);
@@ -354,27 +452,27 @@ async function createAudioPipeline(sourceTrack, config = {}) {
354
452
  throw err;
355
453
  }
356
454
  const vadStateMachine = new VADStateMachine(fullConfig.vad);
455
+ let vadPlugin;
357
456
  try {
358
- const vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
359
- vadNode = await vadPlugin.createNode(
360
- context,
361
- fullConfig.vad,
362
- (prob) => {
363
- try {
364
- const timestamp = context.currentTime * 1e3;
365
- const newState = vadStateMachine.processFrame(prob, timestamp);
366
- if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
367
- emitter.emit("vadChange", newState);
368
- lastVadState = newState;
369
- updateGain(newState);
370
- }
371
- } catch (vadError) {
372
- const err = vadError instanceof Error ? vadError : new Error(String(vadError));
373
- console.error("Error in VAD callback:", err);
374
- emitter.emit("error", err);
457
+ vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
458
+ vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
459
+ try {
460
+ const timestamp = context.currentTime * 1e3;
461
+ const newState = vadStateMachine.processFrame(prob, timestamp);
462
+ if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
463
+ vadPlugin.updateSpeakingState(newState.isSpeaking);
375
464
  }
465
+ if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
466
+ emitter.emit("vadChange", newState);
467
+ lastVadState = newState;
468
+ updateGain(newState);
469
+ }
470
+ } catch (vadError) {
471
+ const err = vadError instanceof Error ? vadError : new Error(String(vadError));
472
+ console.error("Error in VAD callback:", err);
473
+ emitter.emit("error", err);
376
474
  }
377
- );
475
+ });
378
476
  } catch (error) {
379
477
  const err = error instanceof Error ? error : new Error(String(error));
380
478
  console.error("Failed to create VAD node:", err);
@@ -391,15 +489,31 @@ async function createAudioPipeline(sourceTrack, config = {}) {
391
489
  nsNode.connect(splitter);
392
490
  splitter.connect(vadNode);
393
491
  const delayNode = context.createDelay(1);
394
- const preRollSeconds = (fullConfig.vad?.preRollMs ?? 200) / 1e3;
492
+ const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
395
493
  delayNode.delayTime.value = preRollSeconds;
396
494
  const gainNode = context.createGain();
397
495
  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
496
+ let compressor = null;
497
+ if (fullConfig.output?.enableCompression) {
498
+ compressor = context.createDynamicsCompressor();
499
+ const comp = fullConfig.output.compression;
500
+ compressor.threshold.value = comp.threshold ?? -24;
501
+ compressor.ratio.value = comp.ratio ?? 3;
502
+ compressor.attack.value = comp.attack ?? 3e-3;
503
+ compressor.release.value = comp.release ?? 0.05;
504
+ compressor.knee.value = 10;
505
+ }
398
506
  const destination = context.createMediaStreamDestination();
399
507
  try {
400
508
  splitter.connect(delayNode);
401
509
  delayNode.connect(gainNode);
402
- gainNode.connect(destination);
510
+ if (compressor) {
511
+ gainNode.connect(compressor);
512
+ compressor.connect(destination);
513
+ console.log("Compression enabled:", fullConfig.output?.compression);
514
+ } else {
515
+ gainNode.connect(destination);
516
+ }
403
517
  } catch (error) {
404
518
  const err = error instanceof Error ? error : new Error(String(error));
405
519
  console.error("Failed to wire audio pipeline:", err);
@@ -408,10 +522,24 @@ async function createAudioPipeline(sourceTrack, config = {}) {
408
522
  }
409
523
  function updateGain(state) {
410
524
  try {
411
- const { speechGain, silenceGain, gainRampTime } = fullConfig.output;
412
- const targetGain = state.isSpeaking ? speechGain ?? 1 : silenceGain ?? 0;
525
+ const {
526
+ speechGain = 1,
527
+ silenceGain = 0,
528
+ gainRampTime = 0.015,
529
+ smoothTransitions = true,
530
+ maxGainDb = 6
531
+ } = fullConfig.output;
532
+ const maxGainLinear = Math.pow(10, maxGainDb / 20);
533
+ const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
534
+ const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
413
535
  const now = context.currentTime;
414
- gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime ?? 0.02);
536
+ if (smoothTransitions) {
537
+ gainNode.gain.cancelScheduledValues(now);
538
+ gainNode.gain.setValueAtTime(gainNode.gain.value, now);
539
+ gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
540
+ } else {
541
+ gainNode.gain.setValueAtTime(targetGain, now);
542
+ }
415
543
  } catch (error) {
416
544
  const err = error instanceof Error ? error : new Error(String(error));
417
545
  console.error("Failed to update gain:", err);
@@ -467,6 +595,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
467
595
  vadNode.disconnect();
468
596
  delayNode.disconnect();
469
597
  gainNode.disconnect();
598
+ if (compressor) {
599
+ compressor.disconnect();
600
+ }
470
601
  destination.stream.getTracks().forEach((t) => t.stop());
471
602
  unregisterPipeline();
472
603
  } catch (error) {
@@ -483,7 +614,47 @@ async function createAudioPipeline(sourceTrack, config = {}) {
483
614
  try {
484
615
  if (newConfig.vad) {
485
616
  vadStateMachine.updateConfig(newConfig.vad);
617
+ Object.assign(fullConfig.vad, newConfig.vad);
618
+ if (newConfig.vad.preRollMs !== void 0) {
619
+ const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
620
+ delayNode.delayTime.setValueAtTime(
621
+ preRollSeconds2,
622
+ context.currentTime
623
+ );
624
+ }
625
+ }
626
+ if (newConfig.output) {
627
+ Object.assign(fullConfig.output, newConfig.output);
628
+ updateGain(lastVadState);
629
+ if (compressor && newConfig.output.compression) {
630
+ const comp = newConfig.output.compression;
631
+ if (comp.threshold !== void 0) {
632
+ compressor.threshold.setValueAtTime(
633
+ comp.threshold,
634
+ context.currentTime
635
+ );
636
+ }
637
+ if (comp.ratio !== void 0) {
638
+ compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
639
+ }
640
+ if (comp.attack !== void 0) {
641
+ compressor.attack.setValueAtTime(
642
+ comp.attack,
643
+ context.currentTime
644
+ );
645
+ }
646
+ if (comp.release !== void 0) {
647
+ compressor.release.setValueAtTime(
648
+ comp.release,
649
+ context.currentTime
650
+ );
651
+ }
652
+ }
653
+ }
654
+ if (newConfig.livekit) {
655
+ Object.assign(fullConfig.livekit, newConfig.livekit);
486
656
  }
657
+ console.log("Pipeline config updated:", newConfig);
487
658
  } catch (error) {
488
659
  const err = error instanceof Error ? error : new Error(String(error));
489
660
  console.error("Failed to update config:", err);
@@ -1,11 +1,11 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "../chunk-EXH2PNUE.mjs";
4
- import "../chunk-JJASCVEW.mjs";
3
+ } from "../chunk-XXTNAUYX.mjs";
4
+ import "../chunk-N553RHTI.mjs";
5
5
  import "../chunk-OZ7KMC4S.mjs";
6
- import "../chunk-6P2RDBW5.mjs";
6
+ import "../chunk-H5UKZU2Y.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-R5JVHKWA.mjs";
8
+ import "../chunk-VEJXAEMM.mjs";
9
9
  export {
10
10
  createAudioPipeline
11
11
  };
package/dist/types.d.mts CHANGED
@@ -35,46 +35,154 @@ interface AudioProcessingConfig {
35
35
  vad?: {
36
36
  enabled: boolean;
37
37
  /**
38
- * Plugin name to use. Defaults to 'rnnoise-vad' or 'energy-vad'.
38
+ * Plugin name to use. Defaults to 'energy-vad'.
39
39
  */
40
40
  pluginName?: string;
41
41
  /**
42
42
  * Probability threshold for speech onset (0-1).
43
- * Default: 0.5
43
+ * When VAD probability rises above this, audio is unmuted.
44
+ * Lower = more sensitive (catches quiet speech, may include noise)
45
+ * Higher = less sensitive (only confident speech, may clip quiet parts)
46
+ * Default: 0.6 (optimized for voice-only)
44
47
  */
45
48
  startThreshold?: number;
46
49
  /**
47
50
  * Probability threshold for speech offset (0-1).
48
- * Default: 0.4
51
+ * When VAD probability drops below this (after hangover), audio is muted.
52
+ * Lower = keeps audio on longer (less aggressive gating)
53
+ * Higher = mutes faster (more aggressive noise suppression)
54
+ * Default: 0.45 (balanced voice detection)
49
55
  */
50
56
  stopThreshold?: number;
51
57
  /**
52
- * Time in ms to wait after speech stops before considering it silent.
53
- * Default: 300ms
58
+ * Time in ms to wait after speech stops before muting.
59
+ * Prevents rapid on/off toggling during pauses.
60
+ * Lower = more aggressive gating, may clip between words
61
+ * Higher = smoother but may let trailing noise through
62
+ * Default: 400ms (optimized for natural speech)
54
63
  */
55
64
  hangoverMs?: number;
56
65
  /**
57
- * Time in ms of audio to buffer before speech onset to avoid cutting the start.
58
- * Default: 200ms
66
+ * Time in ms of audio to buffer before speech onset.
67
+ * Prevents cutting off the beginning of speech.
68
+ * Default: 250ms (generous pre-roll for voice)
59
69
  */
60
70
  preRollMs?: number;
71
+ /**
72
+ * Minimum speech duration in ms to consider it valid speech.
73
+ * Filters out very brief noise spikes.
74
+ * Default: 100ms
75
+ */
76
+ minSpeechDurationMs?: number;
77
+ /**
78
+ * Minimum silence duration in ms before allowing another speech segment.
79
+ * Prevents false positives from quick noise bursts.
80
+ * Default: 150ms
81
+ */
82
+ minSilenceDurationMs?: number;
83
+ /**
84
+ * Advanced: Energy VAD specific parameters
85
+ */
86
+ energyVad?: {
87
+ /**
88
+ * Smoothing factor for energy calculation (0-1).
89
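+ * Higher = more smoothing, slower to react. NOTE(review): the worklet process() shown in this diff never reads this value; confirm it has any effect.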
90
+ * Default: 0.95
91
+ */
92
+ smoothing?: number;
93
+ /**
94
+ * Initial noise floor estimate.
95
+ * Default: 0.001
96
+ */
97
+ initialNoiseFloor?: number;
98
+ /**
99
+ * Rate at which noise floor adapts to quiet signals (0-1).
100
+ * Default: 0.01
101
+ */
102
+ noiseFloorAdaptRateQuiet?: number;
103
+ /**
104
+ * Rate at which noise floor adapts to loud signals (0-1).
105
+ * Default: 0.001 (applied only while not speaking; the floor is frozen during speech)
106
+ */
107
+ noiseFloorAdaptRateLoud?: number;
108
+ /**
109
+ * Minimum SNR (Signal-to-Noise Ratio) for speech detection.
110
+ * Default: 2.0 (frame RMS must exceed 2x the noise floor before probability rises above 0)
111
+ */
112
+ minSNR?: number;
113
+ /**
114
+ * SNR range for probability scaling.
115
+ * Default: 8.0 (probability scales from minSNR to minSNR+snrRange)
116
+ */
117
+ snrRange?: number;
118
+ };
61
119
  };
62
120
  /**
63
121
  * Output gain and muting configuration.
64
122
  */
65
123
  output?: {
66
124
  /**
67
- * Gain to apply when speaking (0-1+). Default: 1.0
125
+ * Gain to apply when speaking (0 and up; capped at maxGainDb).
126
+ * Values > 1.0 will amplify the voice.
127
+ * Default: 1.0 (unity gain)
68
128
  */
69
129
  speechGain?: number;
70
130
  /**
71
- * Gain to apply when silent (0-1). Default: 0.0 (mute)
131
+ * Gain to apply when silent (0-1).
132
+ * 0.0 = complete mute (recommended for voice-only)
133
+ * 0.1-0.3 = allow some background ambience
134
+ * Default: 0.0 (full mute for voice-only)
72
135
  */
73
136
  silenceGain?: number;
74
137
  /**
75
- * Time in seconds to ramp gain changes. Default: 0.02
138
+ * Time in seconds to ramp gain changes.
139
+ * Lower = faster transitions (may cause clicks)
140
+ * Higher = smoother transitions (may sound sluggish)
141
+ * Default: 0.015 (fast but smooth for voice)
76
142
  */
77
143
  gainRampTime?: number;
144
+ /**
145
+ * Ramp gain exponentially toward the target instead of switching instantly.
146
+ * Helps create cleaner cutoffs without abrupt clicks.
147
+ * Default: true
148
+ */
149
+ smoothTransitions?: boolean;
150
+ /**
151
+ * Maximum gain in dB to apply (prevents clipping).
152
+ * Default: 6.0 dB (roughly 2x amplitude)
153
+ */
154
+ maxGainDb?: number;
155
+ /**
156
+ * Apply dynamic range compression when speaking.
157
+ * Makes quiet parts louder and loud parts quieter.
158
+ * Default: false (transparent audio)
159
+ */
160
+ enableCompression?: boolean;
161
+ /**
162
+ * Compression settings (when enabled)
163
+ */
164
+ compression?: {
165
+ /**
166
+ * Threshold in dB above which compression starts.
167
+ * Default: -24.0 dB
168
+ */
169
+ threshold?: number;
170
+ /**
171
+ * Compression ratio (N:1).
172
+ * Default: 3.0 (3:1 ratio)
173
+ */
174
+ ratio?: number;
175
+ /**
176
+ * Attack time in seconds.
177
+ * Default: 0.003 (3ms)
178
+ */
179
+ attack?: number;
180
+ /**
181
+ * Release time in seconds.
182
+ * Default: 0.05 (50ms)
183
+ */
184
+ release?: number;
185
+ };
78
186
  };
79
187
  /**
80
188
  * LiveKit integration configuration.