@tensamin/audio 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -123,13 +123,32 @@ To disable noise suppression, set noiseSuppression.enabled to false.`
123
123
  };
124
124
 
125
125
  // src/vad/vad-node.ts
126
- var energyVadWorkletCode = `
126
+ var createEnergyVadWorkletCode = (vadConfig) => {
127
+ const energyParams = vadConfig?.energyVad || {};
128
+ const smoothing = energyParams.smoothing ?? 0.95;
129
+ const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
130
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
131
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-3;
132
+ const minSNR = energyParams.minSNR ?? 2;
133
+ const snrRange = energyParams.snrRange ?? 8;
134
+ return `
127
135
  class EnergyVadProcessor extends AudioWorkletProcessor {
128
136
  constructor() {
129
137
  super();
130
- this.smoothing = 0.95;
138
+ this.smoothing = ${smoothing};
131
139
  this.energy = 0;
132
- this.noiseFloor = 0.001;
140
+ this.noiseFloor = ${initialNoiseFloor};
141
+ this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
142
+ this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
143
+ this.minSNR = ${minSNR};
144
+ this.snrRange = ${snrRange};
145
+ this.isSpeaking = false;
146
+
147
+ this.port.onmessage = (event) => {
148
+ if (event.data && event.data.isSpeaking !== undefined) {
149
+ this.isSpeaking = event.data.isSpeaking;
150
+ }
151
+ };
133
152
  }
134
153
 
135
154
  process(inputs, outputs, parameters) {
@@ -137,41 +156,54 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
137
156
  if (!input || !input.length) return true;
138
157
  const channel = input[0];
139
158
 
140
- // Calculate RMS
159
+ // Calculate RMS (Root Mean Square) energy
141
160
  let sum = 0;
142
161
  for (let i = 0; i < channel.length; i++) {
143
162
  sum += channel[i] * channel[i];
144
163
  }
145
164
  const rms = Math.sqrt(sum / channel.length);
146
165
 
147
- // Simple adaptive noise floor (very basic)
148
- if (rms < this.noiseFloor) {
149
- this.noiseFloor = this.noiseFloor * 0.99 + rms * 0.01;
150
- } else {
151
- this.noiseFloor = this.noiseFloor * 0.999 + rms * 0.001;
166
+ // Adaptive noise floor estimation - ONLY during silence
167
+ // This prevents the noise floor from rising during speech
168
+ if (!this.isSpeaking) {
169
+ if (rms < this.noiseFloor) {
170
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
171
+ } else {
172
+ // Even during silence, if we detect a loud signal, adapt very slowly
173
+ // This could be brief noise we haven't classified as speech yet
174
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
175
+ }
152
176
  }
177
+ // During speech, freeze the noise floor to maintain consistent detection
153
178
 
154
- // Calculate "probability" based on SNR
155
- // This is a heuristic mapping from energy to 0-1
179
+ // Calculate Signal-to-Noise Ratio (SNR)
156
180
  const snr = rms / (this.noiseFloor + 1e-6);
157
- const probability = Math.min(1, Math.max(0, (snr - 1.5) / 10)); // Arbitrary scaling
181
+
182
+ // Map SNR to probability (0-1)
183
+ // Probability is 0 when SNR <= minSNR
184
+ // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
185
+ // Probability is 1 when SNR >= (minSNR + snrRange)
186
+ const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
158
187
 
159
- this.port.postMessage({ probability });
188
+ this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
160
189
 
161
190
  return true;
162
191
  }
163
192
  }
164
193
  registerProcessor('energy-vad-processor', EnergyVadProcessor);
165
194
  `;
195
+ };
166
196
  var EnergyVADPlugin = class {
167
197
  name = "energy-vad";
198
+ workletNode = null;
168
199
  async createNode(context, config, onDecision) {
169
200
  if (!config?.enabled) {
170
201
  console.log("VAD disabled, using passthrough node");
171
202
  const pass = context.createGain();
172
203
  return pass;
173
204
  }
174
- const blob = new Blob([energyVadWorkletCode], {
205
+ const workletCode = createEnergyVadWorkletCode(config);
206
+ const blob = new Blob([workletCode], {
175
207
  type: "application/javascript"
176
208
  });
177
209
  const url = URL.createObjectURL(blob);
@@ -190,6 +222,7 @@ var EnergyVADPlugin = class {
190
222
  let node;
191
223
  try {
192
224
  node = new AudioWorkletNode(context, "energy-vad-processor");
225
+ this.workletNode = node;
193
226
  console.log("Energy VAD node created successfully");
194
227
  } catch (e) {
195
228
  const error = new Error(
@@ -215,6 +248,11 @@ var EnergyVADPlugin = class {
215
248
  };
216
249
  return node;
217
250
  }
251
+ updateSpeakingState(isSpeaking) {
252
+ if (this.workletNode) {
253
+ this.workletNode.port.postMessage({ isSpeaking });
254
+ }
255
+ }
218
256
  };
219
257
 
220
258
  // src/extensibility/plugins.ts
@@ -251,31 +289,60 @@ var VADStateMachine = class {
251
289
  currentState = "silent";
252
290
  lastSpeechTime = 0;
253
291
  speechStartTime = 0;
292
+ lastSilenceTime = 0;
254
293
  frameDurationMs = 20;
255
294
  // Assumed frame duration, updated by calls
256
295
  constructor(config) {
257
296
  this.config = {
258
297
  enabled: config?.enabled ?? true,
259
298
  pluginName: config?.pluginName ?? "energy-vad",
260
- startThreshold: config?.startThreshold ?? 0.5,
261
- stopThreshold: config?.stopThreshold ?? 0.4,
262
- hangoverMs: config?.hangoverMs ?? 300,
263
- preRollMs: config?.preRollMs ?? 200
299
+ // Voice-optimized defaults
300
+ startThreshold: config?.startThreshold ?? 0.6,
301
+ // Higher threshold to avoid noise
302
+ stopThreshold: config?.stopThreshold ?? 0.45,
303
+ // Balanced for voice
304
+ hangoverMs: config?.hangoverMs ?? 400,
305
+ // Smooth for natural speech
306
+ preRollMs: config?.preRollMs ?? 250,
307
+ // Generous pre-roll
308
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
309
+ minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
310
+ energyVad: {
311
+ smoothing: config?.energyVad?.smoothing ?? 0.95,
312
+ initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
313
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
314
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
315
+ minSNR: config?.energyVad?.minSNR ?? 2,
316
+ snrRange: config?.energyVad?.snrRange ?? 8
317
+ }
264
318
  };
319
+ this.lastSilenceTime = Date.now();
265
320
  }
266
321
  updateConfig(config) {
267
322
  this.config = { ...this.config, ...config };
268
323
  }
269
324
  processFrame(probability, timestamp) {
270
- const { startThreshold, stopThreshold, hangoverMs } = this.config;
325
+ const {
326
+ startThreshold,
327
+ stopThreshold,
328
+ hangoverMs,
329
+ minSpeechDurationMs,
330
+ minSilenceDurationMs
331
+ } = this.config;
271
332
  let newState = this.currentState;
272
333
  if (this.currentState === "silent" || this.currentState === "speech_ending") {
273
334
  if (probability >= startThreshold) {
274
- newState = "speech_starting";
275
- this.speechStartTime = timestamp;
276
- this.lastSpeechTime = timestamp;
335
+ const silenceDuration = timestamp - this.lastSilenceTime;
336
+ if (silenceDuration >= minSilenceDurationMs) {
337
+ newState = "speech_starting";
338
+ this.speechStartTime = timestamp;
339
+ this.lastSpeechTime = timestamp;
340
+ } else {
341
+ newState = "silent";
342
+ }
277
343
  } else {
278
344
  newState = "silent";
345
+ this.lastSilenceTime = timestamp;
279
346
  }
280
347
  } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
281
348
  if (probability >= stopThreshold) {
@@ -283,10 +350,15 @@ var VADStateMachine = class {
283
350
  this.lastSpeechTime = timestamp;
284
351
  } else {
285
352
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
353
+ const speechDuration = timestamp - this.speechStartTime;
286
354
  if (timeSinceSpeech < hangoverMs) {
287
355
  newState = "speaking";
356
+ } else if (speechDuration < minSpeechDurationMs) {
357
+ newState = "silent";
358
+ this.lastSilenceTime = timestamp;
288
359
  } else {
289
360
  newState = "speech_ending";
361
+ this.lastSilenceTime = timestamp;
290
362
  }
291
363
  }
292
364
  }
@@ -305,7 +377,9 @@ var VADStateMachine = class {
305
377
  async function createAudioPipeline(sourceTrack, config = {}) {
306
378
  const context = getAudioContext();
307
379
  registerPipeline();
308
- const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl);
380
+ const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
381
+ config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
382
+ );
309
383
  const vadEnabled = config.vad?.enabled !== false;
310
384
  const fullConfig = {
311
385
  noiseSuppression: {
@@ -314,13 +388,38 @@ async function createAudioPipeline(sourceTrack, config = {}) {
314
388
  },
315
389
  vad: {
316
390
  enabled: vadEnabled,
391
+ // Voice-optimized defaults (will be overridden by config)
392
+ startThreshold: 0.6,
393
+ stopThreshold: 0.45,
394
+ hangoverMs: 400,
395
+ preRollMs: 250,
396
+ minSpeechDurationMs: 100,
397
+ minSilenceDurationMs: 150,
398
+ energyVad: {
399
+ smoothing: 0.95,
400
+ initialNoiseFloor: 1e-3,
401
+ noiseFloorAdaptRateQuiet: 0.01,
402
+ noiseFloorAdaptRateLoud: 1e-3,
403
+ minSNR: 2,
404
+ snrRange: 8
405
+ },
317
406
  ...config.vad
318
407
  },
319
408
  output: {
320
409
  speechGain: 1,
321
- silenceGain: vadEnabled ? 0 : 1,
322
- // If no VAD, always output audio
323
- gainRampTime: 0.02,
410
+ silenceGain: 0,
411
+ // Full mute for voice-only
412
+ gainRampTime: 0.015,
413
+ // Fast but smooth transitions
414
+ smoothTransitions: true,
415
+ maxGainDb: 6,
416
+ enableCompression: false,
417
+ compression: {
418
+ threshold: -24,
419
+ ratio: 3,
420
+ attack: 3e-3,
421
+ release: 0.05
422
+ },
324
423
  ...config.output
325
424
  },
326
425
  livekit: { manageTrackMute: false, ...config.livekit }
@@ -331,7 +430,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
331
430
  output: fullConfig.output
332
431
  });
333
432
  if (!sourceTrack || sourceTrack.kind !== "audio") {
334
- throw new Error("createAudioPipeline requires a valid audio MediaStreamTrack");
433
+ throw new Error(
434
+ "createAudioPipeline requires a valid audio MediaStreamTrack"
435
+ );
335
436
  }
336
437
  if (sourceTrack.readyState === "ended") {
337
438
  throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
@@ -345,10 +446,7 @@ async function createAudioPipeline(sourceTrack, config = {}) {
345
446
  const nsPlugin = getNoiseSuppressionPlugin(
346
447
  fullConfig.noiseSuppression?.pluginName
347
448
  );
348
- nsNode = await nsPlugin.createNode(
349
- context,
350
- fullConfig.noiseSuppression
351
- );
449
+ nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
352
450
  } catch (error) {
353
451
  const err = error instanceof Error ? error : new Error(String(error));
354
452
  console.error("Failed to create noise suppression node:", err);
@@ -356,27 +454,27 @@ async function createAudioPipeline(sourceTrack, config = {}) {
356
454
  throw err;
357
455
  }
358
456
  const vadStateMachine = new VADStateMachine(fullConfig.vad);
457
+ let vadPlugin;
359
458
  try {
360
- const vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
361
- vadNode = await vadPlugin.createNode(
362
- context,
363
- fullConfig.vad,
364
- (prob) => {
365
- try {
366
- const timestamp = context.currentTime * 1e3;
367
- const newState = vadStateMachine.processFrame(prob, timestamp);
368
- if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
369
- emitter.emit("vadChange", newState);
370
- lastVadState = newState;
371
- updateGain(newState);
372
- }
373
- } catch (vadError) {
374
- const err = vadError instanceof Error ? vadError : new Error(String(vadError));
375
- console.error("Error in VAD callback:", err);
376
- emitter.emit("error", err);
459
+ vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
460
+ vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
461
+ try {
462
+ const timestamp = context.currentTime * 1e3;
463
+ const newState = vadStateMachine.processFrame(prob, timestamp);
464
+ if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
465
+ vadPlugin.updateSpeakingState(newState.isSpeaking);
377
466
  }
467
+ if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
468
+ emitter.emit("vadChange", newState);
469
+ lastVadState = newState;
470
+ updateGain(newState);
471
+ }
472
+ } catch (vadError) {
473
+ const err = vadError instanceof Error ? vadError : new Error(String(vadError));
474
+ console.error("Error in VAD callback:", err);
475
+ emitter.emit("error", err);
378
476
  }
379
- );
477
+ });
380
478
  } catch (error) {
381
479
  const err = error instanceof Error ? error : new Error(String(error));
382
480
  console.error("Failed to create VAD node:", err);
@@ -393,15 +491,31 @@ async function createAudioPipeline(sourceTrack, config = {}) {
393
491
  nsNode.connect(splitter);
394
492
  splitter.connect(vadNode);
395
493
  const delayNode = context.createDelay(1);
396
- const preRollSeconds = (fullConfig.vad?.preRollMs ?? 200) / 1e3;
494
+ const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
397
495
  delayNode.delayTime.value = preRollSeconds;
398
496
  const gainNode = context.createGain();
399
497
  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
498
+ let compressor = null;
499
+ if (fullConfig.output?.enableCompression) {
500
+ compressor = context.createDynamicsCompressor();
501
+ const comp = fullConfig.output.compression;
502
+ compressor.threshold.value = comp.threshold ?? -24;
503
+ compressor.ratio.value = comp.ratio ?? 3;
504
+ compressor.attack.value = comp.attack ?? 3e-3;
505
+ compressor.release.value = comp.release ?? 0.05;
506
+ compressor.knee.value = 10;
507
+ }
400
508
  const destination = context.createMediaStreamDestination();
401
509
  try {
402
510
  splitter.connect(delayNode);
403
511
  delayNode.connect(gainNode);
404
- gainNode.connect(destination);
512
+ if (compressor) {
513
+ gainNode.connect(compressor);
514
+ compressor.connect(destination);
515
+ console.log("Compression enabled:", fullConfig.output?.compression);
516
+ } else {
517
+ gainNode.connect(destination);
518
+ }
405
519
  } catch (error) {
406
520
  const err = error instanceof Error ? error : new Error(String(error));
407
521
  console.error("Failed to wire audio pipeline:", err);
@@ -410,10 +524,24 @@ async function createAudioPipeline(sourceTrack, config = {}) {
410
524
  }
411
525
  function updateGain(state) {
412
526
  try {
413
- const { speechGain, silenceGain, gainRampTime } = fullConfig.output;
414
- const targetGain = state.isSpeaking ? speechGain ?? 1 : silenceGain ?? 0;
527
+ const {
528
+ speechGain = 1,
529
+ silenceGain = 0,
530
+ gainRampTime = 0.015,
531
+ smoothTransitions = true,
532
+ maxGainDb = 6
533
+ } = fullConfig.output;
534
+ const maxGainLinear = Math.pow(10, maxGainDb / 20);
535
+ const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
536
+ const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
415
537
  const now = context.currentTime;
416
- gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime ?? 0.02);
538
+ if (smoothTransitions) {
539
+ gainNode.gain.cancelScheduledValues(now);
540
+ gainNode.gain.setValueAtTime(gainNode.gain.value, now);
541
+ gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
542
+ } else {
543
+ gainNode.gain.setValueAtTime(targetGain, now);
544
+ }
417
545
  } catch (error) {
418
546
  const err = error instanceof Error ? error : new Error(String(error));
419
547
  console.error("Failed to update gain:", err);
@@ -469,6 +597,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
469
597
  vadNode.disconnect();
470
598
  delayNode.disconnect();
471
599
  gainNode.disconnect();
600
+ if (compressor) {
601
+ compressor.disconnect();
602
+ }
472
603
  destination.stream.getTracks().forEach((t) => t.stop());
473
604
  unregisterPipeline();
474
605
  } catch (error) {
@@ -485,7 +616,47 @@ async function createAudioPipeline(sourceTrack, config = {}) {
485
616
  try {
486
617
  if (newConfig.vad) {
487
618
  vadStateMachine.updateConfig(newConfig.vad);
619
+ Object.assign(fullConfig.vad, newConfig.vad);
620
+ if (newConfig.vad.preRollMs !== void 0) {
621
+ const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
622
+ delayNode.delayTime.setValueAtTime(
623
+ preRollSeconds2,
624
+ context.currentTime
625
+ );
626
+ }
627
+ }
628
+ if (newConfig.output) {
629
+ Object.assign(fullConfig.output, newConfig.output);
630
+ updateGain(lastVadState);
631
+ if (compressor && newConfig.output.compression) {
632
+ const comp = newConfig.output.compression;
633
+ if (comp.threshold !== void 0) {
634
+ compressor.threshold.setValueAtTime(
635
+ comp.threshold,
636
+ context.currentTime
637
+ );
638
+ }
639
+ if (comp.ratio !== void 0) {
640
+ compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
641
+ }
642
+ if (comp.attack !== void 0) {
643
+ compressor.attack.setValueAtTime(
644
+ comp.attack,
645
+ context.currentTime
646
+ );
647
+ }
648
+ if (comp.release !== void 0) {
649
+ compressor.release.setValueAtTime(
650
+ comp.release,
651
+ context.currentTime
652
+ );
653
+ }
654
+ }
655
+ }
656
+ if (newConfig.livekit) {
657
+ Object.assign(fullConfig.livekit, newConfig.livekit);
488
658
  }
659
+ console.log("Pipeline config updated:", newConfig);
489
660
  } catch (error) {
490
661
  const err = error instanceof Error ? error : new Error(String(error));
491
662
  console.error("Failed to update config:", err);
@@ -1,12 +1,12 @@
1
1
  import {
2
2
  attachProcessingToTrack
3
- } from "../chunk-XMTQPMQ6.mjs";
4
- import "../chunk-EXH2PNUE.mjs";
5
- import "../chunk-JJASCVEW.mjs";
3
+ } from "../chunk-GVKCBKW6.mjs";
4
+ import "../chunk-XXTNAUYX.mjs";
5
+ import "../chunk-N553RHTI.mjs";
6
6
  import "../chunk-OZ7KMC4S.mjs";
7
- import "../chunk-6P2RDBW5.mjs";
7
+ import "../chunk-H5UKZU2Y.mjs";
8
8
  import "../chunk-XO6B3D4A.mjs";
9
- import "../chunk-R5JVHKWA.mjs";
9
+ import "../chunk-VEJXAEMM.mjs";
10
10
  export {
11
11
  attachProcessingToTrack
12
12
  };