@tensamin/audio 0.1.3 → 0.1.5

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -154,13 +154,32 @@ To disable noise suppression, set noiseSuppression.enabled to false.`
154
154
  };
155
155
 
156
156
  // src/vad/vad-node.ts
157
- var energyVadWorkletCode = `
157
+ var createEnergyVadWorkletCode = (vadConfig) => {
158
+ const energyParams = vadConfig?.energyVad || {};
159
+ const smoothing = energyParams.smoothing ?? 0.95;
160
+ const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
161
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
162
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-3;
163
+ const minSNR = energyParams.minSNR ?? 2;
164
+ const snrRange = energyParams.snrRange ?? 8;
165
+ return `
158
166
  class EnergyVadProcessor extends AudioWorkletProcessor {
159
167
  constructor() {
160
168
  super();
161
- this.smoothing = 0.95;
169
+ this.smoothing = ${smoothing};
162
170
  this.energy = 0;
163
- this.noiseFloor = 0.001;
171
+ this.noiseFloor = ${initialNoiseFloor};
172
+ this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
173
+ this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
174
+ this.minSNR = ${minSNR};
175
+ this.snrRange = ${snrRange};
176
+ this.isSpeaking = false;
177
+
178
+ this.port.onmessage = (event) => {
179
+ if (event.data && event.data.isSpeaking !== undefined) {
180
+ this.isSpeaking = event.data.isSpeaking;
181
+ }
182
+ };
164
183
  }
165
184
 
166
185
  process(inputs, outputs, parameters) {
@@ -168,41 +187,54 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
168
187
  if (!input || !input.length) return true;
169
188
  const channel = input[0];
170
189
 
171
- // Calculate RMS
190
+ // Calculate RMS (Root Mean Square) energy
172
191
  let sum = 0;
173
192
  for (let i = 0; i < channel.length; i++) {
174
193
  sum += channel[i] * channel[i];
175
194
  }
176
195
  const rms = Math.sqrt(sum / channel.length);
177
196
 
178
- // Simple adaptive noise floor (very basic)
179
- if (rms < this.noiseFloor) {
180
- this.noiseFloor = this.noiseFloor * 0.99 + rms * 0.01;
181
- } else {
182
- this.noiseFloor = this.noiseFloor * 0.999 + rms * 0.001;
197
+ // Adaptive noise floor estimation - ONLY during silence
198
+ // This prevents the noise floor from rising during speech
199
+ if (!this.isSpeaking) {
200
+ if (rms < this.noiseFloor) {
201
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
202
+ } else {
203
+ // Even during silence, if we detect a loud signal, adapt very slowly
204
+ // This could be brief noise we haven't classified as speech yet
205
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
206
+ }
183
207
  }
208
+ // During speech, freeze the noise floor to maintain consistent detection
184
209
 
185
- // Calculate "probability" based on SNR
186
- // This is a heuristic mapping from energy to 0-1
210
+ // Calculate Signal-to-Noise Ratio (SNR)
187
211
  const snr = rms / (this.noiseFloor + 1e-6);
188
- const probability = Math.min(1, Math.max(0, (snr - 1.5) / 10)); // Arbitrary scaling
212
+
213
+ // Map SNR to probability (0-1)
214
+ // Probability is 0 when SNR <= minSNR
215
+ // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
216
+ // Probability is 1 when SNR >= (minSNR + snrRange)
217
+ const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
189
218
 
190
- this.port.postMessage({ probability });
219
+ this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
191
220
 
192
221
  return true;
193
222
  }
194
223
  }
195
224
  registerProcessor('energy-vad-processor', EnergyVadProcessor);
196
225
  `;
226
+ };
197
227
  var EnergyVADPlugin = class {
198
228
  name = "energy-vad";
229
+ workletNode = null;
199
230
  async createNode(context, config, onDecision) {
200
231
  if (!config?.enabled) {
201
232
  console.log("VAD disabled, using passthrough node");
202
233
  const pass = context.createGain();
203
234
  return pass;
204
235
  }
205
- const blob = new Blob([energyVadWorkletCode], {
236
+ const workletCode = createEnergyVadWorkletCode(config);
237
+ const blob = new Blob([workletCode], {
206
238
  type: "application/javascript"
207
239
  });
208
240
  const url = URL.createObjectURL(blob);
@@ -221,6 +253,7 @@ var EnergyVADPlugin = class {
221
253
  let node;
222
254
  try {
223
255
  node = new AudioWorkletNode(context, "energy-vad-processor");
256
+ this.workletNode = node;
224
257
  console.log("Energy VAD node created successfully");
225
258
  } catch (e) {
226
259
  const error = new Error(
@@ -246,6 +279,11 @@ var EnergyVADPlugin = class {
246
279
  };
247
280
  return node;
248
281
  }
282
+ updateSpeakingState(isSpeaking) {
283
+ if (this.workletNode) {
284
+ this.workletNode.port.postMessage({ isSpeaking });
285
+ }
286
+ }
249
287
  };
250
288
 
251
289
  // src/extensibility/plugins.ts
@@ -288,31 +326,60 @@ var VADStateMachine = class {
288
326
  currentState = "silent";
289
327
  lastSpeechTime = 0;
290
328
  speechStartTime = 0;
329
+ lastSilenceTime = 0;
291
330
  frameDurationMs = 20;
292
331
  // Assumed frame duration, updated by calls
293
332
  constructor(config) {
294
333
  this.config = {
295
334
  enabled: config?.enabled ?? true,
296
335
  pluginName: config?.pluginName ?? "energy-vad",
297
- startThreshold: config?.startThreshold ?? 0.5,
298
- stopThreshold: config?.stopThreshold ?? 0.4,
299
- hangoverMs: config?.hangoverMs ?? 300,
300
- preRollMs: config?.preRollMs ?? 200
336
+ // Voice-optimized defaults
337
+ startThreshold: config?.startThreshold ?? 0.6,
338
+ // Higher threshold to avoid noise
339
+ stopThreshold: config?.stopThreshold ?? 0.45,
340
+ // Balanced for voice
341
+ hangoverMs: config?.hangoverMs ?? 400,
342
+ // Smooth for natural speech
343
+ preRollMs: config?.preRollMs ?? 250,
344
+ // Generous pre-roll
345
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
346
+ minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
347
+ energyVad: {
348
+ smoothing: config?.energyVad?.smoothing ?? 0.95,
349
+ initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
350
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
351
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
352
+ minSNR: config?.energyVad?.minSNR ?? 2,
353
+ snrRange: config?.energyVad?.snrRange ?? 8
354
+ }
301
355
  };
356
+ this.lastSilenceTime = Date.now();
302
357
  }
303
358
  updateConfig(config) {
304
359
  this.config = { ...this.config, ...config };
305
360
  }
306
361
  processFrame(probability, timestamp) {
307
- const { startThreshold, stopThreshold, hangoverMs } = this.config;
362
+ const {
363
+ startThreshold,
364
+ stopThreshold,
365
+ hangoverMs,
366
+ minSpeechDurationMs,
367
+ minSilenceDurationMs
368
+ } = this.config;
308
369
  let newState = this.currentState;
309
370
  if (this.currentState === "silent" || this.currentState === "speech_ending") {
310
371
  if (probability >= startThreshold) {
311
- newState = "speech_starting";
312
- this.speechStartTime = timestamp;
313
- this.lastSpeechTime = timestamp;
372
+ const silenceDuration = timestamp - this.lastSilenceTime;
373
+ if (silenceDuration >= minSilenceDurationMs) {
374
+ newState = "speech_starting";
375
+ this.speechStartTime = timestamp;
376
+ this.lastSpeechTime = timestamp;
377
+ } else {
378
+ newState = "silent";
379
+ }
314
380
  } else {
315
381
  newState = "silent";
382
+ this.lastSilenceTime = timestamp;
316
383
  }
317
384
  } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
318
385
  if (probability >= stopThreshold) {
@@ -320,10 +387,15 @@ var VADStateMachine = class {
320
387
  this.lastSpeechTime = timestamp;
321
388
  } else {
322
389
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
390
+ const speechDuration = timestamp - this.speechStartTime;
323
391
  if (timeSinceSpeech < hangoverMs) {
324
392
  newState = "speaking";
393
+ } else if (speechDuration < minSpeechDurationMs) {
394
+ newState = "silent";
395
+ this.lastSilenceTime = timestamp;
325
396
  } else {
326
397
  newState = "speech_ending";
398
+ this.lastSilenceTime = timestamp;
327
399
  }
328
400
  }
329
401
  }
@@ -342,7 +414,9 @@ var VADStateMachine = class {
342
414
  async function createAudioPipeline(sourceTrack, config = {}) {
343
415
  const context = getAudioContext();
344
416
  registerPipeline();
345
- const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl);
417
+ const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
418
+ config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
419
+ );
346
420
  const vadEnabled = config.vad?.enabled !== false;
347
421
  const fullConfig = {
348
422
  noiseSuppression: {
@@ -351,13 +425,38 @@ async function createAudioPipeline(sourceTrack, config = {}) {
351
425
  },
352
426
  vad: {
353
427
  enabled: vadEnabled,
428
+ // Voice-optimized defaults (will be overridden by config)
429
+ startThreshold: 0.6,
430
+ stopThreshold: 0.45,
431
+ hangoverMs: 400,
432
+ preRollMs: 250,
433
+ minSpeechDurationMs: 100,
434
+ minSilenceDurationMs: 150,
435
+ energyVad: {
436
+ smoothing: 0.95,
437
+ initialNoiseFloor: 1e-3,
438
+ noiseFloorAdaptRateQuiet: 0.01,
439
+ noiseFloorAdaptRateLoud: 1e-3,
440
+ minSNR: 2,
441
+ snrRange: 8
442
+ },
354
443
  ...config.vad
355
444
  },
356
445
  output: {
357
446
  speechGain: 1,
358
- silenceGain: vadEnabled ? 0 : 1,
359
- // If no VAD, always output audio
360
- gainRampTime: 0.02,
447
+ silenceGain: 0,
448
+ // Full mute for voice-only
449
+ gainRampTime: 0.015,
450
+ // Fast but smooth transitions
451
+ smoothTransitions: true,
452
+ maxGainDb: 6,
453
+ enableCompression: false,
454
+ compression: {
455
+ threshold: -24,
456
+ ratio: 3,
457
+ attack: 3e-3,
458
+ release: 0.05
459
+ },
361
460
  ...config.output
362
461
  },
363
462
  livekit: { manageTrackMute: false, ...config.livekit }
@@ -368,7 +467,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
368
467
  output: fullConfig.output
369
468
  });
370
469
  if (!sourceTrack || sourceTrack.kind !== "audio") {
371
- throw new Error("createAudioPipeline requires a valid audio MediaStreamTrack");
470
+ throw new Error(
471
+ "createAudioPipeline requires a valid audio MediaStreamTrack"
472
+ );
372
473
  }
373
474
  if (sourceTrack.readyState === "ended") {
374
475
  throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
@@ -382,10 +483,7 @@ async function createAudioPipeline(sourceTrack, config = {}) {
382
483
  const nsPlugin = getNoiseSuppressionPlugin(
383
484
  fullConfig.noiseSuppression?.pluginName
384
485
  );
385
- nsNode = await nsPlugin.createNode(
386
- context,
387
- fullConfig.noiseSuppression
388
- );
486
+ nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
389
487
  } catch (error) {
390
488
  const err = error instanceof Error ? error : new Error(String(error));
391
489
  console.error("Failed to create noise suppression node:", err);
@@ -393,27 +491,27 @@ async function createAudioPipeline(sourceTrack, config = {}) {
393
491
  throw err;
394
492
  }
395
493
  const vadStateMachine = new VADStateMachine(fullConfig.vad);
494
+ let vadPlugin;
396
495
  try {
397
- const vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
398
- vadNode = await vadPlugin.createNode(
399
- context,
400
- fullConfig.vad,
401
- (prob) => {
402
- try {
403
- const timestamp = context.currentTime * 1e3;
404
- const newState = vadStateMachine.processFrame(prob, timestamp);
405
- if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
406
- emitter.emit("vadChange", newState);
407
- lastVadState = newState;
408
- updateGain(newState);
409
- }
410
- } catch (vadError) {
411
- const err = vadError instanceof Error ? vadError : new Error(String(vadError));
412
- console.error("Error in VAD callback:", err);
413
- emitter.emit("error", err);
496
+ vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
497
+ vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
498
+ try {
499
+ const timestamp = context.currentTime * 1e3;
500
+ const newState = vadStateMachine.processFrame(prob, timestamp);
501
+ if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
502
+ vadPlugin.updateSpeakingState(newState.isSpeaking);
414
503
  }
504
+ if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
505
+ emitter.emit("vadChange", newState);
506
+ lastVadState = newState;
507
+ updateGain(newState);
508
+ }
509
+ } catch (vadError) {
510
+ const err = vadError instanceof Error ? vadError : new Error(String(vadError));
511
+ console.error("Error in VAD callback:", err);
512
+ emitter.emit("error", err);
415
513
  }
416
- );
514
+ });
417
515
  } catch (error) {
418
516
  const err = error instanceof Error ? error : new Error(String(error));
419
517
  console.error("Failed to create VAD node:", err);
@@ -430,15 +528,31 @@ async function createAudioPipeline(sourceTrack, config = {}) {
430
528
  nsNode.connect(splitter);
431
529
  splitter.connect(vadNode);
432
530
  const delayNode = context.createDelay(1);
433
- const preRollSeconds = (fullConfig.vad?.preRollMs ?? 200) / 1e3;
531
+ const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
434
532
  delayNode.delayTime.value = preRollSeconds;
435
533
  const gainNode = context.createGain();
436
534
  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
535
+ let compressor = null;
536
+ if (fullConfig.output?.enableCompression) {
537
+ compressor = context.createDynamicsCompressor();
538
+ const comp = fullConfig.output.compression;
539
+ compressor.threshold.value = comp.threshold ?? -24;
540
+ compressor.ratio.value = comp.ratio ?? 3;
541
+ compressor.attack.value = comp.attack ?? 3e-3;
542
+ compressor.release.value = comp.release ?? 0.05;
543
+ compressor.knee.value = 10;
544
+ }
437
545
  const destination = context.createMediaStreamDestination();
438
546
  try {
439
547
  splitter.connect(delayNode);
440
548
  delayNode.connect(gainNode);
441
- gainNode.connect(destination);
549
+ if (compressor) {
550
+ gainNode.connect(compressor);
551
+ compressor.connect(destination);
552
+ console.log("Compression enabled:", fullConfig.output?.compression);
553
+ } else {
554
+ gainNode.connect(destination);
555
+ }
442
556
  } catch (error) {
443
557
  const err = error instanceof Error ? error : new Error(String(error));
444
558
  console.error("Failed to wire audio pipeline:", err);
@@ -447,10 +561,24 @@ async function createAudioPipeline(sourceTrack, config = {}) {
447
561
  }
448
562
  function updateGain(state) {
449
563
  try {
450
- const { speechGain, silenceGain, gainRampTime } = fullConfig.output;
451
- const targetGain = state.isSpeaking ? speechGain ?? 1 : silenceGain ?? 0;
564
+ const {
565
+ speechGain = 1,
566
+ silenceGain = 0,
567
+ gainRampTime = 0.015,
568
+ smoothTransitions = true,
569
+ maxGainDb = 6
570
+ } = fullConfig.output;
571
+ const maxGainLinear = Math.pow(10, maxGainDb / 20);
572
+ const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
573
+ const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
452
574
  const now = context.currentTime;
453
- gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime ?? 0.02);
575
+ if (smoothTransitions) {
576
+ gainNode.gain.cancelScheduledValues(now);
577
+ gainNode.gain.setValueAtTime(gainNode.gain.value, now);
578
+ gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
579
+ } else {
580
+ gainNode.gain.setValueAtTime(targetGain, now);
581
+ }
454
582
  } catch (error) {
455
583
  const err = error instanceof Error ? error : new Error(String(error));
456
584
  console.error("Failed to update gain:", err);
@@ -506,6 +634,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
506
634
  vadNode.disconnect();
507
635
  delayNode.disconnect();
508
636
  gainNode.disconnect();
637
+ if (compressor) {
638
+ compressor.disconnect();
639
+ }
509
640
  destination.stream.getTracks().forEach((t) => t.stop());
510
641
  unregisterPipeline();
511
642
  } catch (error) {
@@ -522,7 +653,47 @@ async function createAudioPipeline(sourceTrack, config = {}) {
522
653
  try {
523
654
  if (newConfig.vad) {
524
655
  vadStateMachine.updateConfig(newConfig.vad);
656
+ Object.assign(fullConfig.vad, newConfig.vad);
657
+ if (newConfig.vad.preRollMs !== void 0) {
658
+ const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
659
+ delayNode.delayTime.setValueAtTime(
660
+ preRollSeconds2,
661
+ context.currentTime
662
+ );
663
+ }
664
+ }
665
+ if (newConfig.output) {
666
+ Object.assign(fullConfig.output, newConfig.output);
667
+ updateGain(lastVadState);
668
+ if (compressor && newConfig.output.compression) {
669
+ const comp = newConfig.output.compression;
670
+ if (comp.threshold !== void 0) {
671
+ compressor.threshold.setValueAtTime(
672
+ comp.threshold,
673
+ context.currentTime
674
+ );
675
+ }
676
+ if (comp.ratio !== void 0) {
677
+ compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
678
+ }
679
+ if (comp.attack !== void 0) {
680
+ compressor.attack.setValueAtTime(
681
+ comp.attack,
682
+ context.currentTime
683
+ );
684
+ }
685
+ if (comp.release !== void 0) {
686
+ compressor.release.setValueAtTime(
687
+ comp.release,
688
+ context.currentTime
689
+ );
690
+ }
691
+ }
692
+ }
693
+ if (newConfig.livekit) {
694
+ Object.assign(fullConfig.livekit, newConfig.livekit);
525
695
  }
696
+ console.log("Pipeline config updated:", newConfig);
526
697
  } catch (error) {
527
698
  const err = error instanceof Error ? error : new Error(String(error));
528
699
  console.error("Failed to update config:", err);
package/dist/index.mjs CHANGED
@@ -1,13 +1,13 @@
1
1
  import "./chunk-WBQAMGXK.mjs";
2
2
  import {
3
3
  attachProcessingToTrack
4
- } from "./chunk-XMTQPMQ6.mjs";
4
+ } from "./chunk-GVKCBKW6.mjs";
5
5
  import {
6
6
  createAudioPipeline
7
- } from "./chunk-EXH2PNUE.mjs";
7
+ } from "./chunk-XXTNAUYX.mjs";
8
8
  import {
9
9
  VADStateMachine
10
- } from "./chunk-JJASCVEW.mjs";
10
+ } from "./chunk-N553RHTI.mjs";
11
11
  import {
12
12
  closeAudioContext,
13
13
  getAudioContext,
@@ -21,13 +21,13 @@ import {
21
21
  getVADPlugin,
22
22
  registerNoiseSuppressionPlugin,
23
23
  registerVADPlugin
24
- } from "./chunk-6P2RDBW5.mjs";
24
+ } from "./chunk-H5UKZU2Y.mjs";
25
25
  import {
26
26
  RNNoisePlugin
27
27
  } from "./chunk-XO6B3D4A.mjs";
28
28
  import {
29
29
  EnergyVADPlugin
30
- } from "./chunk-R5JVHKWA.mjs";
30
+ } from "./chunk-VEJXAEMM.mjs";
31
31
  export {
32
32
  EnergyVADPlugin,
33
33
  RNNoisePlugin,