@tensamin/audio 0.1.3 → 0.1.4
This diff shows the published contents of the two package versions as they appear in their public registry and is provided for informational purposes only.
- package/README.md +217 -54
- package/dist/{chunk-EXH2PNUE.mjs → chunk-AHBRT4RD.mjs} +128 -33
- package/dist/{chunk-XMTQPMQ6.mjs → chunk-ERJVV5JR.mjs} +1 -1
- package/dist/chunk-N553RHTI.mjs +93 -0
- package/dist/{chunk-R5JVHKWA.mjs → chunk-NMHKX64G.mjs} +32 -12
- package/dist/{chunk-6P2RDBW5.mjs → chunk-YOSTLLCS.mjs} +1 -1
- package/dist/extensibility/plugins.js +32 -12
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +200 -51
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +200 -51
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +200 -51
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +118 -10
- package/dist/types.d.ts +118 -10
- package/dist/vad/vad-node.js +32 -12
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.d.mts +1 -0
- package/dist/vad/vad-state.d.ts +1 -0
- package/dist/vad/vad-state.js +42 -8
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
- package/dist/chunk-JJASCVEW.mjs +0 -59
@@ -121,13 +121,25 @@ To disable noise suppression, set noiseSuppression.enabled to false.`
 };
 
 // src/vad/vad-node.ts
-var
+var createEnergyVadWorkletCode = (vadConfig) => {
+  const energyParams = vadConfig?.energyVad || {};
+  const smoothing = energyParams.smoothing ?? 0.95;
+  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
+  const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
+  const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-3;
+  const minSNR = energyParams.minSNR ?? 2;
+  const snrRange = energyParams.snrRange ?? 8;
+  return `
 class EnergyVadProcessor extends AudioWorkletProcessor {
   constructor() {
     super();
-    this.smoothing =
+    this.smoothing = ${smoothing};
     this.energy = 0;
-    this.noiseFloor =
+    this.noiseFloor = ${initialNoiseFloor};
+    this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
+    this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
+    this.minSNR = ${minSNR};
+    this.snrRange = ${snrRange};
   }
 
   process(inputs, outputs, parameters) {
@@ -135,32 +147,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
     if (!input || !input.length) return true;
     const channel = input[0];
 
-    // Calculate RMS
+    // Calculate RMS (Root Mean Square) energy
     let sum = 0;
     for (let i = 0; i < channel.length; i++) {
      sum += channel[i] * channel[i];
    }
    const rms = Math.sqrt(sum / channel.length);

-    //
+    // Adaptive noise floor estimation
+    // When signal is quiet, adapt quickly to find new noise floor
+    // When signal is loud (speech), adapt slowly to avoid raising noise floor
    if (rms < this.noiseFloor) {
-      this.noiseFloor = this.noiseFloor *
+      this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
    } else {
-      this.noiseFloor = this.noiseFloor *
+      this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
    }

-    // Calculate
-    // This is a heuristic mapping from energy to 0-1
+    // Calculate Signal-to-Noise Ratio (SNR)
    const snr = rms / (this.noiseFloor + 1e-6);
-
+
+    // Map SNR to probability (0-1)
+    // Probability is 0 when SNR <= minSNR
+    // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
+    // Probability is 1 when SNR >= (minSNR + snrRange)
+    const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));

-    this.port.postMessage({ probability });
+    this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });

    return true;
  }
}
registerProcessor('energy-vad-processor', EnergyVadProcessor);
 `;
+};
 var EnergyVADPlugin = class {
   name = "energy-vad";
   async createNode(context, config, onDecision) {
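For reference, the probability the worklet posts is a linear ramp over the signal-to-noise ratio. A standalone sketch of that mapping, not part of the package, with the default values taken from the hunk above:

```ts
// Standalone sketch of the energy-VAD mapping used above (defaults from the diff).
// With minSNR = 2 and snrRange = 8, probability ramps from 0 at SNR 2 to 1 at SNR 10.
function snrToProbability(rms: number, noiseFloor: number, minSNR = 2, snrRange = 8): number {
  const snr = rms / (noiseFloor + 1e-6); // small epsilon avoids division by zero
  return Math.min(1, Math.max(0, (snr - minSNR) / snrRange));
}

// Example: a frame roughly 6x louder than the noise floor yields a probability near 0.5.
console.log(snrToProbability(0.006, 0.001)); // ~0.5
```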
@@ -169,7 +188,8 @@ var EnergyVADPlugin = class {
       const pass = context.createGain();
       return pass;
     }
-    const
+    const workletCode = createEnergyVadWorkletCode(config);
+    const blob = new Blob([workletCode], {
       type: "application/javascript"
     });
     const url = URL.createObjectURL(blob);
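The hunk ends before the registration call, but the usual Web Audio pattern for a string-generated processor is to load the Blob URL with audioWorklet.addModule and then instantiate the node by name. A minimal sketch under that assumption; the helper name and the revokeObjectURL cleanup are illustrative, while the processor name 'energy-vad-processor' comes from the generated code above:

```ts
// Minimal sketch: register a generated AudioWorkletProcessor from a Blob URL.
// `workletSource` stands in for the string returned by createEnergyVadWorkletCode().
async function loadGeneratedWorklet(context: AudioContext, workletSource: string): Promise<AudioWorkletNode> {
  const blob = new Blob([workletSource], { type: "application/javascript" });
  const url = URL.createObjectURL(blob);
  try {
    await context.audioWorklet.addModule(url); // compile and register the processor
    return new AudioWorkletNode(context, "energy-vad-processor");
  } finally {
    URL.revokeObjectURL(url); // module is already loaded; free the object URL
  }
}
```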
@@ -249,31 +269,60 @@ var VADStateMachine = class {
   currentState = "silent";
   lastSpeechTime = 0;
   speechStartTime = 0;
+  lastSilenceTime = 0;
   frameDurationMs = 20;
   // Assumed frame duration, updated by calls
   constructor(config) {
     this.config = {
       enabled: config?.enabled ?? true,
       pluginName: config?.pluginName ?? "energy-vad",
-
-
-
-
+      // Voice-optimized defaults
+      startThreshold: config?.startThreshold ?? 0.6,
+      // Higher threshold to avoid noise
+      stopThreshold: config?.stopThreshold ?? 0.45,
+      // Balanced for voice
+      hangoverMs: config?.hangoverMs ?? 400,
+      // Smooth for natural speech
+      preRollMs: config?.preRollMs ?? 250,
+      // Generous pre-roll
+      minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
+      minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
+      energyVad: {
+        smoothing: config?.energyVad?.smoothing ?? 0.95,
+        initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
+        noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
+        noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
+        minSNR: config?.energyVad?.minSNR ?? 2,
+        snrRange: config?.energyVad?.snrRange ?? 8
+      }
     };
+    this.lastSilenceTime = Date.now();
   }
   updateConfig(config) {
     this.config = { ...this.config, ...config };
   }
   processFrame(probability, timestamp) {
-    const {
+    const {
+      startThreshold,
+      stopThreshold,
+      hangoverMs,
+      minSpeechDurationMs,
+      minSilenceDurationMs
+    } = this.config;
     let newState = this.currentState;
     if (this.currentState === "silent" || this.currentState === "speech_ending") {
       if (probability >= startThreshold) {
-
-
-
+        const silenceDuration = timestamp - this.lastSilenceTime;
+        if (silenceDuration >= minSilenceDurationMs) {
+          newState = "speech_starting";
+          this.speechStartTime = timestamp;
+          this.lastSpeechTime = timestamp;
+        } else {
+          newState = "silent";
+        }
       } else {
         newState = "silent";
+        this.lastSilenceTime = timestamp;
       }
     } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
       if (probability >= stopThreshold) {
@@ -281,10 +330,15 @@ var VADStateMachine = class {
         this.lastSpeechTime = timestamp;
       } else {
         const timeSinceSpeech = timestamp - this.lastSpeechTime;
+        const speechDuration = timestamp - this.speechStartTime;
         if (timeSinceSpeech < hangoverMs) {
           newState = "speaking";
+        } else if (speechDuration < minSpeechDurationMs) {
+          newState = "silent";
+          this.lastSilenceTime = timestamp;
         } else {
           newState = "speech_ending";
+          this.lastSilenceTime = timestamp;
         }
       }
     }
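The two hunks above together define the debounce logic: onset requires the probability to clear startThreshold with at least minSilenceDurationMs of prior silence, and offset requires the probability to stay below stopThreshold past hangoverMs, with segments shorter than minSpeechDurationMs dropped back to silence. A paraphrase of those rules as a pure function, for illustration only; this is not the package's API:

```ts
// Paraphrase of the transition rules shown in the diff (illustrative, not the package API).
type VadState = "silent" | "speech_starting" | "speaking" | "speech_ending";

interface VadTimers {
  lastSpeechTime: number;
  speechStartTime: number;
  lastSilenceTime: number;
}

const vadDefaults = { startThreshold: 0.6, stopThreshold: 0.45, hangoverMs: 400, minSpeechDurationMs: 100, minSilenceDurationMs: 150 };

function nextState(prev: VadState, probability: number, now: number, t: VadTimers, cfg = vadDefaults): VadState {
  if (prev === "silent" || prev === "speech_ending") {
    if (probability >= cfg.startThreshold) {
      if (now - t.lastSilenceTime >= cfg.minSilenceDurationMs) {
        t.speechStartTime = now;
        t.lastSpeechTime = now;
        return "speech_starting"; // enough prior silence: speech onset
      }
      return "silent"; // onset too soon after the previous segment
    }
    t.lastSilenceTime = now;
    return "silent";
  }
  // prev is "speech_starting" or "speaking"
  if (probability >= cfg.stopThreshold) {
    t.lastSpeechTime = now;
    return "speaking";
  }
  if (now - t.lastSpeechTime < cfg.hangoverMs) {
    return "speaking"; // hold the gate open during the hangover window
  }
  t.lastSilenceTime = now;
  return now - t.speechStartTime < cfg.minSpeechDurationMs ? "silent" : "speech_ending";
}
```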
@@ -303,7 +357,9 @@ var VADStateMachine = class {
 async function createAudioPipeline(sourceTrack, config = {}) {
   const context = getAudioContext();
   registerPipeline();
-  const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
+  const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
+    config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
+  );
   const vadEnabled = config.vad?.enabled !== false;
   const fullConfig = {
     noiseSuppression: {
@@ -312,13 +368,38 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     },
     vad: {
       enabled: vadEnabled,
+      // Voice-optimized defaults (will be overridden by config)
+      startThreshold: 0.6,
+      stopThreshold: 0.45,
+      hangoverMs: 400,
+      preRollMs: 250,
+      minSpeechDurationMs: 100,
+      minSilenceDurationMs: 150,
+      energyVad: {
+        smoothing: 0.95,
+        initialNoiseFloor: 1e-3,
+        noiseFloorAdaptRateQuiet: 0.01,
+        noiseFloorAdaptRateLoud: 1e-3,
+        minSNR: 2,
+        snrRange: 8
+      },
       ...config.vad
     },
     output: {
       speechGain: 1,
-      silenceGain:
-      //
-      gainRampTime: 0.
+      silenceGain: 0,
+      // Full mute for voice-only
+      gainRampTime: 0.015,
+      // Fast but smooth transitions
+      smoothTransitions: true,
+      maxGainDb: 6,
+      enableCompression: false,
+      compression: {
+        threshold: -24,
+        ratio: 3,
+        attack: 3e-3,
+        release: 0.05
+      },
       ...config.output
     },
     livekit: { manageTrackMute: false, ...config.livekit }
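These defaults are spread under the caller's config, so a minimal call only needs a source track. A usage sketch, assuming the package root re-exports createAudioPipeline the way dist/index.mjs re-exports it from the renamed chunk; the shape of the returned object is not visible in this diff:

```ts
// Usage sketch. The import path assumes the package root re-exports createAudioPipeline,
// as dist/index.mjs suggests; unspecified options fall back to the defaults shown above.
import { createAudioPipeline } from "@tensamin/audio";

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const [track] = stream.getAudioTracks();

const pipeline = await createAudioPipeline(track, {
  vad: {
    enabled: true,
    startThreshold: 0.6, // speech onset
    stopThreshold: 0.45, // speech offset
    hangoverMs: 400,     // hold the gate open between words
    preRollMs: 250,      // delay line so onsets are not clipped
  },
  output: { silenceGain: 0, enableCompression: false },
});
```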
@@ -329,7 +410,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     output: fullConfig.output
   });
   if (!sourceTrack || sourceTrack.kind !== "audio") {
-    throw new Error(
+    throw new Error(
+      "createAudioPipeline requires a valid audio MediaStreamTrack"
+    );
   }
   if (sourceTrack.readyState === "ended") {
     throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
@@ -343,10 +426,7 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     const nsPlugin = getNoiseSuppressionPlugin(
       fullConfig.noiseSuppression?.pluginName
     );
-    nsNode = await nsPlugin.createNode(
-      context,
-      fullConfig.noiseSuppression
-    );
+    nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
   } catch (error) {
     const err = error instanceof Error ? error : new Error(String(error));
     console.error("Failed to create noise suppression node:", err);
@@ -356,25 +436,21 @@ async function createAudioPipeline(sourceTrack, config = {}) {
   const vadStateMachine = new VADStateMachine(fullConfig.vad);
   try {
     const vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
-    vadNode = await vadPlugin.createNode(
-
-
-
-
-
-
-
-        emitter.emit("vadChange", newState);
-        lastVadState = newState;
-        updateGain(newState);
-      }
-    } catch (vadError) {
-      const err = vadError instanceof Error ? vadError : new Error(String(vadError));
-      console.error("Error in VAD callback:", err);
-      emitter.emit("error", err);
+    vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
+      try {
+        const timestamp = context.currentTime * 1e3;
+        const newState = vadStateMachine.processFrame(prob, timestamp);
+        if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
+          emitter.emit("vadChange", newState);
+          lastVadState = newState;
+          updateGain(newState);
         }
+      } catch (vadError) {
+        const err = vadError instanceof Error ? vadError : new Error(String(vadError));
+        console.error("Error in VAD callback:", err);
+        emitter.emit("error", err);
       }
-    );
+    });
   } catch (error) {
     const err = error instanceof Error ? error : new Error(String(error));
     console.error("Failed to create VAD node:", err);
|
|
|
391
467
|
nsNode.connect(splitter);
|
|
392
468
|
splitter.connect(vadNode);
|
|
393
469
|
const delayNode = context.createDelay(1);
|
|
394
|
-
const preRollSeconds = (fullConfig.vad?.preRollMs ??
|
|
470
|
+
const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
|
|
395
471
|
delayNode.delayTime.value = preRollSeconds;
|
|
396
472
|
const gainNode = context.createGain();
|
|
397
473
|
gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
|
|
474
|
+
let compressor = null;
|
|
475
|
+
if (fullConfig.output?.enableCompression) {
|
|
476
|
+
compressor = context.createDynamicsCompressor();
|
|
477
|
+
const comp = fullConfig.output.compression;
|
|
478
|
+
compressor.threshold.value = comp.threshold ?? -24;
|
|
479
|
+
compressor.ratio.value = comp.ratio ?? 3;
|
|
480
|
+
compressor.attack.value = comp.attack ?? 3e-3;
|
|
481
|
+
compressor.release.value = comp.release ?? 0.05;
|
|
482
|
+
compressor.knee.value = 10;
|
|
483
|
+
}
|
|
398
484
|
const destination = context.createMediaStreamDestination();
|
|
399
485
|
try {
|
|
400
486
|
splitter.connect(delayNode);
|
|
401
487
|
delayNode.connect(gainNode);
|
|
402
|
-
|
|
488
|
+
if (compressor) {
|
|
489
|
+
gainNode.connect(compressor);
|
|
490
|
+
compressor.connect(destination);
|
|
491
|
+
console.log("Compression enabled:", fullConfig.output?.compression);
|
|
492
|
+
} else {
|
|
493
|
+
gainNode.connect(destination);
|
|
494
|
+
}
|
|
403
495
|
} catch (error) {
|
|
404
496
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
405
497
|
console.error("Failed to wire audio pipeline:", err);
|
|
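The resulting Web Audio graph, reconstructed from the wiring above as a sketch: node names mirror the diff, but the surrounding setup and the helper function are illustrative, not part of the package.

```ts
// Sketch of the output chain wired above, using standard Web Audio API calls.
// source -> noise suppression -> splitter -> VAD worklet (analysis only)
//                                   \-> delay (pre-roll) -> gain (VAD gate) [-> compressor] -> destination
function wireOutputChain(context: AudioContext, splitter: AudioNode, useCompression: boolean): MediaStreamAudioDestinationNode {
  const delayNode = context.createDelay(1); // up to 1 s of delay capacity
  delayNode.delayTime.value = 0.25;         // preRollMs default (250 ms) from the diff
  const gainNode = context.createGain();
  gainNode.gain.value = 0;                  // start muted (silenceGain default)
  const destination = context.createMediaStreamDestination();

  splitter.connect(delayNode);
  delayNode.connect(gainNode);
  if (useCompression) {
    const compressor = context.createDynamicsCompressor();
    compressor.threshold.value = -24;
    compressor.ratio.value = 3;
    gainNode.connect(compressor);
    compressor.connect(destination);
  } else {
    gainNode.connect(destination);
  }
  return destination;
}
```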
@@ -408,10 +500,24 @@ async function createAudioPipeline(sourceTrack, config = {}) {
   }
   function updateGain(state) {
     try {
-      const {
-
+      const {
+        speechGain = 1,
+        silenceGain = 0,
+        gainRampTime = 0.015,
+        smoothTransitions = true,
+        maxGainDb = 6
+      } = fullConfig.output;
+      const maxGainLinear = Math.pow(10, maxGainDb / 20);
+      const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
+      const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
       const now = context.currentTime;
-
+      if (smoothTransitions) {
+        gainNode.gain.cancelScheduledValues(now);
+        gainNode.gain.setValueAtTime(gainNode.gain.value, now);
+        gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
+      } else {
+        gainNode.gain.setValueAtTime(targetGain, now);
+      }
     } catch (error) {
       const err = error instanceof Error ? error : new Error(String(error));
       console.error("Failed to update gain:", err);
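For reference, the gain cap converts decibels to a linear factor with 10^(dB/20), and setTargetAtTime is given gainRampTime/3 as its exponential time constant, so the ramp is mostly settled after about gainRampTime. A quick check of the defaults:

```ts
// Quick check of the defaults used above: a 6 dB cap is roughly a 2x linear gain,
// and a 15 ms ramp uses a 5 ms exponential time constant for setTargetAtTime.
const maxGainDb = 6;
const maxGainLinear = Math.pow(10, maxGainDb / 20); // ~1.995
const gainRampTime = 0.015;
const timeConstant = gainRampTime / 3;              // 0.005 s; ~95% settled after 3 time constants
console.log(maxGainLinear.toFixed(3), timeConstant); // "1.995" 0.005
```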
@@ -467,6 +573,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
       vadNode.disconnect();
       delayNode.disconnect();
       gainNode.disconnect();
+      if (compressor) {
+        compressor.disconnect();
+      }
       destination.stream.getTracks().forEach((t) => t.stop());
       unregisterPipeline();
     } catch (error) {
@@ -483,7 +592,47 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     try {
       if (newConfig.vad) {
         vadStateMachine.updateConfig(newConfig.vad);
+        Object.assign(fullConfig.vad, newConfig.vad);
+        if (newConfig.vad.preRollMs !== void 0) {
+          const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
+          delayNode.delayTime.setValueAtTime(
+            preRollSeconds2,
+            context.currentTime
+          );
+        }
+      }
+      if (newConfig.output) {
+        Object.assign(fullConfig.output, newConfig.output);
+        updateGain(lastVadState);
+        if (compressor && newConfig.output.compression) {
+          const comp = newConfig.output.compression;
+          if (comp.threshold !== void 0) {
+            compressor.threshold.setValueAtTime(
+              comp.threshold,
+              context.currentTime
+            );
+          }
+          if (comp.ratio !== void 0) {
+            compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
+          }
+          if (comp.attack !== void 0) {
+            compressor.attack.setValueAtTime(
+              comp.attack,
+              context.currentTime
+            );
+          }
+          if (comp.release !== void 0) {
+            compressor.release.setValueAtTime(
+              comp.release,
+              context.currentTime
+            );
+          }
+        }
+      }
+      if (newConfig.livekit) {
+        Object.assign(fullConfig.livekit, newConfig.livekit);
       }
+      console.log("Pipeline config updated:", newConfig);
     } catch (error) {
       const err = error instanceof Error ? error : new Error(String(error));
       console.error("Failed to update config:", err);
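The reconfiguration routine above applies VAD, output, and LiveKit changes at runtime, including retuning the pre-roll delay and live compressor parameters. Its public name on the returned pipeline is not visible in this hunk, so the method in the sketch below is assumed:

```ts
// Hedged sketch of live reconfiguration. The method name on the returned pipeline
// is assumed; the option names come from the hunk above.
declare const pipeline: { updateConfig?: (cfg: Record<string, unknown>) => void };

pipeline.updateConfig?.({
  vad: { enabled: true, startThreshold: 0.7, preRollMs: 300 }, // preRollMs also retunes the delay line
  output: { silenceGain: 0.1, compression: { threshold: -30 } },
});
```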
@@ -1,11 +1,11 @@
 import {
   createAudioPipeline
-} from "../chunk-
-import "../chunk-
+} from "../chunk-AHBRT4RD.mjs";
+import "../chunk-N553RHTI.mjs";
 import "../chunk-OZ7KMC4S.mjs";
-import "../chunk-
+import "../chunk-YOSTLLCS.mjs";
 import "../chunk-XO6B3D4A.mjs";
-import "../chunk-
+import "../chunk-NMHKX64G.mjs";
 export {
   createAudioPipeline
 };
package/dist/types.d.mts (CHANGED)
@@ -35,46 +35,154 @@ interface AudioProcessingConfig {
   vad?: {
     enabled: boolean;
     /**
-     * Plugin name to use. Defaults to '
+     * Plugin name to use. Defaults to 'energy-vad'.
     */
    pluginName?: string;
    /**
     * Probability threshold for speech onset (0-1).
-     *
+     * When VAD probability rises above this, audio is unmuted.
+     * Lower = more sensitive (catches quiet speech, may include noise)
+     * Higher = less sensitive (only confident speech, may clip quiet parts)
+     * Default: 0.6 (optimized for voice-only)
     */
    startThreshold?: number;
    /**
     * Probability threshold for speech offset (0-1).
-     *
+     * When VAD probability drops below this (after hangover), audio is muted.
+     * Lower = keeps audio on longer (less aggressive gating)
+     * Higher = mutes faster (more aggressive noise suppression)
+     * Default: 0.45 (balanced voice detection)
     */
    stopThreshold?: number;
    /**
-     * Time in ms to wait after speech stops before
-     *
+     * Time in ms to wait after speech stops before muting.
+     * Prevents rapid on/off toggling during pauses.
+     * Lower = more aggressive gating, may clip between words
+     * Higher = smoother but may let trailing noise through
+     * Default: 400ms (optimized for natural speech)
     */
    hangoverMs?: number;
    /**
-     * Time in ms of audio to buffer before speech onset
-     *
+     * Time in ms of audio to buffer before speech onset.
+     * Prevents cutting off the beginning of speech.
+     * Default: 250ms (generous pre-roll for voice)
     */
    preRollMs?: number;
+    /**
+     * Minimum speech duration in ms to consider it valid speech.
+     * Filters out very brief noise spikes.
+     * Default: 100ms
+     */
+    minSpeechDurationMs?: number;
+    /**
+     * Minimum silence duration in ms before allowing another speech segment.
+     * Prevents false positives from quick noise bursts.
+     * Default: 150ms
+     */
+    minSilenceDurationMs?: number;
+    /**
+     * Advanced: Energy VAD specific parameters
+     */
+    energyVad?: {
+      /**
+       * Smoothing factor for energy calculation (0-1).
+       * Higher = more smoothing, slower to react
+       * Default: 0.95
+       */
+      smoothing?: number;
+      /**
+       * Initial noise floor estimate.
+       * Default: 0.001
+       */
+      initialNoiseFloor?: number;
+      /**
+       * Rate at which noise floor adapts to quiet signals (0-1).
+       * Default: 0.01
+       */
+      noiseFloorAdaptRateQuiet?: number;
+      /**
+       * Rate at which noise floor adapts to loud signals (0-1).
+       * Default: 0.001 (slower adaptation for speech)
+       */
+      noiseFloorAdaptRateLoud?: number;
+      /**
+       * Minimum SNR (Signal-to-Noise Ratio) for speech detection.
+       * Default: 2.0 (voice is 2x louder than noise floor)
+       */
+      minSNR?: number;
+      /**
+       * SNR range for probability scaling.
+       * Default: 8.0 (probability scales from minSNR to minSNR+snrRange)
+       */
+      snrRange?: number;
+    };
   };
   /**
    * Output gain and muting configuration.
    */
   output?: {
     /**
-     * Gain to apply when speaking (0-
+     * Gain to apply when speaking (0-infinity).
+     * Values > 1.0 will amplify the voice.
+     * Default: 1.0 (unity gain)
     */
    speechGain?: number;
    /**
-     * Gain to apply when silent (0-1).
+     * Gain to apply when silent (0-1).
+     * 0.0 = complete mute (recommended for voice-only)
+     * 0.1-0.3 = allow some background ambience
+     * Default: 0.0 (full mute for voice-only)
     */
    silenceGain?: number;
    /**
-     * Time in seconds to ramp gain changes.
+     * Time in seconds to ramp gain changes.
+     * Lower = faster transitions (may cause clicks)
+     * Higher = smoother transitions (may sound sluggish)
+     * Default: 0.015 (fast but smooth for voice)
     */
    gainRampTime?: number;
+    /**
+     * Apply additional gain reduction during the transition to silence.
+     * Helps create cleaner cutoffs without abrupt clicks.
+     * Default: true
+     */
+    smoothTransitions?: boolean;
+    /**
+     * Maximum gain in dB to apply (prevents clipping).
+     * Default: 6.0 dB (roughly 2x amplitude)
+     */
+    maxGainDb?: number;
+    /**
+     * Apply dynamic range compression when speaking.
+     * Makes quiet parts louder and loud parts quieter.
+     * Default: false (transparent audio)
+     */
+    enableCompression?: boolean;
+    /**
+     * Compression settings (when enabled)
+     */
+    compression?: {
+      /**
+       * Threshold in dB above which compression starts.
+       * Default: -24.0 dB
+       */
+      threshold?: number;
+      /**
+       * Compression ratio (1:N).
+       * Default: 3.0 (3:1 ratio)
+       */
+      ratio?: number;
+      /**
+       * Attack time in seconds.
+       * Default: 0.003 (3ms)
+       */
+      attack?: number;
+      /**
+       * Release time in seconds.
+       * Default: 0.05 (50ms)
+       */
+      release?: number;
+    };
   };
   /**
    * LiveKit integration configuration.
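Read together, the expanded typings double as a configuration reference. A sketch of a config object that exercises the new fields; the names and defaults are taken from the typings above, while how the object is passed in is up to the caller:

```ts
// Sketch of an AudioProcessingConfig-shaped object using the fields documented above.
// Field names and defaults come from the typings; treat the surrounding usage as illustrative.
const config = {
  vad: {
    enabled: true,
    pluginName: "energy-vad",
    startThreshold: 0.6,
    stopThreshold: 0.45,
    hangoverMs: 400,
    preRollMs: 250,
    minSpeechDurationMs: 100,
    minSilenceDurationMs: 150,
    energyVad: { smoothing: 0.95, minSNR: 2, snrRange: 8 },
  },
  output: {
    speechGain: 1,
    silenceGain: 0,
    gainRampTime: 0.015,
    smoothTransitions: true,
    maxGainDb: 6,
    enableCompression: true,
    compression: { threshold: -24, ratio: 3, attack: 0.003, release: 0.05 },
  },
};
```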
|