@tensamin/audio 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -54
- package/dist/{chunk-XMTQPMQ6.mjs → chunk-GVKCBKW6.mjs} +1 -1
- package/dist/{chunk-6P2RDBW5.mjs → chunk-H5UKZU2Y.mjs} +1 -1
- package/dist/chunk-N553RHTI.mjs +93 -0
- package/dist/chunk-VEJXAEMM.mjs +136 -0
- package/dist/{chunk-EXH2PNUE.mjs → chunk-XXTNAUYX.mjs} +133 -34
- package/dist/extensibility/plugins.js +52 -14
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +225 -54
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +225 -54
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +225 -54
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +118 -10
- package/dist/types.d.ts +118 -10
- package/dist/vad/vad-node.d.mts +2 -0
- package/dist/vad/vad-node.d.ts +2 -0
- package/dist/vad/vad-node.js +52 -14
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.d.mts +1 -0
- package/dist/vad/vad-state.d.ts +1 -0
- package/dist/vad/vad-state.js +42 -8
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
- package/dist/chunk-JJASCVEW.mjs +0 -59
- package/dist/chunk-R5JVHKWA.mjs +0 -98
|
@@ -121,13 +121,32 @@ To disable noise suppression, set noiseSuppression.enabled to false.`
|
|
|
121
121
|
};
|
|
122
122
|
|
|
123
123
|
// src/vad/vad-node.ts
|
|
124
|
-
var
|
|
124
|
+
var createEnergyVadWorkletCode = (vadConfig) => {
|
|
125
|
+
const energyParams = vadConfig?.energyVad || {};
|
|
126
|
+
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
|
+
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
129
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-3;
|
|
130
|
+
const minSNR = energyParams.minSNR ?? 2;
|
|
131
|
+
const snrRange = energyParams.snrRange ?? 8;
|
|
132
|
+
return `
|
|
125
133
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
126
134
|
constructor() {
|
|
127
135
|
super();
|
|
128
|
-
this.smoothing =
|
|
136
|
+
this.smoothing = ${smoothing};
|
|
129
137
|
this.energy = 0;
|
|
130
|
-
this.noiseFloor =
|
|
138
|
+
this.noiseFloor = ${initialNoiseFloor};
|
|
139
|
+
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
140
|
+
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
141
|
+
this.minSNR = ${minSNR};
|
|
142
|
+
this.snrRange = ${snrRange};
|
|
143
|
+
this.isSpeaking = false;
|
|
144
|
+
|
|
145
|
+
this.port.onmessage = (event) => {
|
|
146
|
+
if (event.data && event.data.isSpeaking !== undefined) {
|
|
147
|
+
this.isSpeaking = event.data.isSpeaking;
|
|
148
|
+
}
|
|
149
|
+
};
|
|
131
150
|
}
|
|
132
151
|
|
|
133
152
|
process(inputs, outputs, parameters) {
|
|
@@ -135,41 +154,54 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
135
154
|
if (!input || !input.length) return true;
|
|
136
155
|
const channel = input[0];
|
|
137
156
|
|
|
138
|
-
// Calculate RMS
|
|
157
|
+
// Calculate RMS (Root Mean Square) energy
|
|
139
158
|
let sum = 0;
|
|
140
159
|
for (let i = 0; i < channel.length; i++) {
|
|
141
160
|
sum += channel[i] * channel[i];
|
|
142
161
|
}
|
|
143
162
|
const rms = Math.sqrt(sum / channel.length);
|
|
144
163
|
|
|
145
|
-
//
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
164
|
+
// Adaptive noise floor estimation - ONLY during silence
|
|
165
|
+
// This prevents the noise floor from rising during speech
|
|
166
|
+
if (!this.isSpeaking) {
|
|
167
|
+
if (rms < this.noiseFloor) {
|
|
168
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
|
|
169
|
+
} else {
|
|
170
|
+
// Even during silence, if we detect a loud signal, adapt very slowly
|
|
171
|
+
// This could be brief noise we haven't classified as speech yet
|
|
172
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
|
|
173
|
+
}
|
|
150
174
|
}
|
|
175
|
+
// During speech, freeze the noise floor to maintain consistent detection
|
|
151
176
|
|
|
152
|
-
// Calculate
|
|
153
|
-
// This is a heuristic mapping from energy to 0-1
|
|
177
|
+
// Calculate Signal-to-Noise Ratio (SNR)
|
|
154
178
|
const snr = rms / (this.noiseFloor + 1e-6);
|
|
155
|
-
|
|
179
|
+
|
|
180
|
+
// Map SNR to probability (0-1)
|
|
181
|
+
// Probability is 0 when SNR <= minSNR
|
|
182
|
+
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
183
|
+
// Probability is 1 when SNR >= (minSNR + snrRange)
|
|
184
|
+
const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
|
|
156
185
|
|
|
157
|
-
this.port.postMessage({ probability });
|
|
186
|
+
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
158
187
|
|
|
159
188
|
return true;
|
|
160
189
|
}
|
|
161
190
|
}
|
|
162
191
|
registerProcessor('energy-vad-processor', EnergyVadProcessor);
|
|
163
192
|
`;
|
|
193
|
+
};
|
|
164
194
|
var EnergyVADPlugin = class {
|
|
165
195
|
name = "energy-vad";
|
|
196
|
+
workletNode = null;
|
|
166
197
|
async createNode(context, config, onDecision) {
|
|
167
198
|
if (!config?.enabled) {
|
|
168
199
|
console.log("VAD disabled, using passthrough node");
|
|
169
200
|
const pass = context.createGain();
|
|
170
201
|
return pass;
|
|
171
202
|
}
|
|
172
|
-
const
|
|
203
|
+
const workletCode = createEnergyVadWorkletCode(config);
|
|
204
|
+
const blob = new Blob([workletCode], {
|
|
173
205
|
type: "application/javascript"
|
|
174
206
|
});
|
|
175
207
|
const url = URL.createObjectURL(blob);
|
|
@@ -188,6 +220,7 @@ var EnergyVADPlugin = class {
|
|
|
188
220
|
let node;
|
|
189
221
|
try {
|
|
190
222
|
node = new AudioWorkletNode(context, "energy-vad-processor");
|
|
223
|
+
this.workletNode = node;
|
|
191
224
|
console.log("Energy VAD node created successfully");
|
|
192
225
|
} catch (e) {
|
|
193
226
|
const error = new Error(
|
|
@@ -213,6 +246,11 @@ var EnergyVADPlugin = class {
|
|
|
213
246
|
};
|
|
214
247
|
return node;
|
|
215
248
|
}
|
|
249
|
+
updateSpeakingState(isSpeaking) {
|
|
250
|
+
if (this.workletNode) {
|
|
251
|
+
this.workletNode.port.postMessage({ isSpeaking });
|
|
252
|
+
}
|
|
253
|
+
}
|
|
216
254
|
};
|
|
217
255
|
|
|
218
256
|
// src/extensibility/plugins.ts
|
|
@@ -249,31 +287,60 @@ var VADStateMachine = class {
|
|
|
249
287
|
currentState = "silent";
|
|
250
288
|
lastSpeechTime = 0;
|
|
251
289
|
speechStartTime = 0;
|
|
290
|
+
lastSilenceTime = 0;
|
|
252
291
|
frameDurationMs = 20;
|
|
253
292
|
// Assumed frame duration, updated by calls
|
|
254
293
|
constructor(config) {
|
|
255
294
|
this.config = {
|
|
256
295
|
enabled: config?.enabled ?? true,
|
|
257
296
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
297
|
+
// Voice-optimized defaults
|
|
298
|
+
startThreshold: config?.startThreshold ?? 0.6,
|
|
299
|
+
// Higher threshold to avoid noise
|
|
300
|
+
stopThreshold: config?.stopThreshold ?? 0.45,
|
|
301
|
+
// Balanced for voice
|
|
302
|
+
hangoverMs: config?.hangoverMs ?? 400,
|
|
303
|
+
// Smooth for natural speech
|
|
304
|
+
preRollMs: config?.preRollMs ?? 250,
|
|
305
|
+
// Generous pre-roll
|
|
306
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
|
|
307
|
+
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
308
|
+
energyVad: {
|
|
309
|
+
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
310
|
+
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
311
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
|
|
312
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
|
|
313
|
+
minSNR: config?.energyVad?.minSNR ?? 2,
|
|
314
|
+
snrRange: config?.energyVad?.snrRange ?? 8
|
|
315
|
+
}
|
|
262
316
|
};
|
|
317
|
+
this.lastSilenceTime = Date.now();
|
|
263
318
|
}
|
|
264
319
|
updateConfig(config) {
|
|
265
320
|
this.config = { ...this.config, ...config };
|
|
266
321
|
}
|
|
267
322
|
processFrame(probability, timestamp) {
|
|
268
|
-
const {
|
|
323
|
+
const {
|
|
324
|
+
startThreshold,
|
|
325
|
+
stopThreshold,
|
|
326
|
+
hangoverMs,
|
|
327
|
+
minSpeechDurationMs,
|
|
328
|
+
minSilenceDurationMs
|
|
329
|
+
} = this.config;
|
|
269
330
|
let newState = this.currentState;
|
|
270
331
|
if (this.currentState === "silent" || this.currentState === "speech_ending") {
|
|
271
332
|
if (probability >= startThreshold) {
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
333
|
+
const silenceDuration = timestamp - this.lastSilenceTime;
|
|
334
|
+
if (silenceDuration >= minSilenceDurationMs) {
|
|
335
|
+
newState = "speech_starting";
|
|
336
|
+
this.speechStartTime = timestamp;
|
|
337
|
+
this.lastSpeechTime = timestamp;
|
|
338
|
+
} else {
|
|
339
|
+
newState = "silent";
|
|
340
|
+
}
|
|
275
341
|
} else {
|
|
276
342
|
newState = "silent";
|
|
343
|
+
this.lastSilenceTime = timestamp;
|
|
277
344
|
}
|
|
278
345
|
} else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
|
|
279
346
|
if (probability >= stopThreshold) {
|
|
@@ -281,10 +348,15 @@ var VADStateMachine = class {
|
|
|
281
348
|
this.lastSpeechTime = timestamp;
|
|
282
349
|
} else {
|
|
283
350
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
351
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
284
352
|
if (timeSinceSpeech < hangoverMs) {
|
|
285
353
|
newState = "speaking";
|
|
354
|
+
} else if (speechDuration < minSpeechDurationMs) {
|
|
355
|
+
newState = "silent";
|
|
356
|
+
this.lastSilenceTime = timestamp;
|
|
286
357
|
} else {
|
|
287
358
|
newState = "speech_ending";
|
|
359
|
+
this.lastSilenceTime = timestamp;
|
|
288
360
|
}
|
|
289
361
|
}
|
|
290
362
|
}
|
|
@@ -303,7 +375,9 @@ var VADStateMachine = class {
|
|
|
303
375
|
async function createAudioPipeline(sourceTrack, config = {}) {
|
|
304
376
|
const context = getAudioContext();
|
|
305
377
|
registerPipeline();
|
|
306
|
-
const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
|
|
378
|
+
const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
|
|
379
|
+
config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
|
|
380
|
+
);
|
|
307
381
|
const vadEnabled = config.vad?.enabled !== false;
|
|
308
382
|
const fullConfig = {
|
|
309
383
|
noiseSuppression: {
|
|
@@ -312,13 +386,38 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
312
386
|
},
|
|
313
387
|
vad: {
|
|
314
388
|
enabled: vadEnabled,
|
|
389
|
+
// Voice-optimized defaults (will be overridden by config)
|
|
390
|
+
startThreshold: 0.6,
|
|
391
|
+
stopThreshold: 0.45,
|
|
392
|
+
hangoverMs: 400,
|
|
393
|
+
preRollMs: 250,
|
|
394
|
+
minSpeechDurationMs: 100,
|
|
395
|
+
minSilenceDurationMs: 150,
|
|
396
|
+
energyVad: {
|
|
397
|
+
smoothing: 0.95,
|
|
398
|
+
initialNoiseFloor: 1e-3,
|
|
399
|
+
noiseFloorAdaptRateQuiet: 0.01,
|
|
400
|
+
noiseFloorAdaptRateLoud: 1e-3,
|
|
401
|
+
minSNR: 2,
|
|
402
|
+
snrRange: 8
|
|
403
|
+
},
|
|
315
404
|
...config.vad
|
|
316
405
|
},
|
|
317
406
|
output: {
|
|
318
407
|
speechGain: 1,
|
|
319
|
-
silenceGain:
|
|
320
|
-
//
|
|
321
|
-
gainRampTime: 0.
|
|
408
|
+
silenceGain: 0,
|
|
409
|
+
// Full mute for voice-only
|
|
410
|
+
gainRampTime: 0.015,
|
|
411
|
+
// Fast but smooth transitions
|
|
412
|
+
smoothTransitions: true,
|
|
413
|
+
maxGainDb: 6,
|
|
414
|
+
enableCompression: false,
|
|
415
|
+
compression: {
|
|
416
|
+
threshold: -24,
|
|
417
|
+
ratio: 3,
|
|
418
|
+
attack: 3e-3,
|
|
419
|
+
release: 0.05
|
|
420
|
+
},
|
|
322
421
|
...config.output
|
|
323
422
|
},
|
|
324
423
|
livekit: { manageTrackMute: false, ...config.livekit }
|
|
@@ -329,7 +428,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
329
428
|
output: fullConfig.output
|
|
330
429
|
});
|
|
331
430
|
if (!sourceTrack || sourceTrack.kind !== "audio") {
|
|
332
|
-
throw new Error(
|
|
431
|
+
throw new Error(
|
|
432
|
+
"createAudioPipeline requires a valid audio MediaStreamTrack"
|
|
433
|
+
);
|
|
333
434
|
}
|
|
334
435
|
if (sourceTrack.readyState === "ended") {
|
|
335
436
|
throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
|
|
@@ -343,10 +444,7 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
343
444
|
const nsPlugin = getNoiseSuppressionPlugin(
|
|
344
445
|
fullConfig.noiseSuppression?.pluginName
|
|
345
446
|
);
|
|
346
|
-
nsNode = await nsPlugin.createNode(
|
|
347
|
-
context,
|
|
348
|
-
fullConfig.noiseSuppression
|
|
349
|
-
);
|
|
447
|
+
nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
|
|
350
448
|
} catch (error) {
|
|
351
449
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
352
450
|
console.error("Failed to create noise suppression node:", err);
|
|
@@ -354,27 +452,27 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
354
452
|
throw err;
|
|
355
453
|
}
|
|
356
454
|
const vadStateMachine = new VADStateMachine(fullConfig.vad);
|
|
455
|
+
let vadPlugin;
|
|
357
456
|
try {
|
|
358
|
-
|
|
359
|
-
vadNode = await vadPlugin.createNode(
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
const newState = vadStateMachine.processFrame(prob, timestamp);
|
|
366
|
-
if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
|
|
367
|
-
emitter.emit("vadChange", newState);
|
|
368
|
-
lastVadState = newState;
|
|
369
|
-
updateGain(newState);
|
|
370
|
-
}
|
|
371
|
-
} catch (vadError) {
|
|
372
|
-
const err = vadError instanceof Error ? vadError : new Error(String(vadError));
|
|
373
|
-
console.error("Error in VAD callback:", err);
|
|
374
|
-
emitter.emit("error", err);
|
|
457
|
+
vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
|
|
458
|
+
vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
|
|
459
|
+
try {
|
|
460
|
+
const timestamp = context.currentTime * 1e3;
|
|
461
|
+
const newState = vadStateMachine.processFrame(prob, timestamp);
|
|
462
|
+
if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
|
|
463
|
+
vadPlugin.updateSpeakingState(newState.isSpeaking);
|
|
375
464
|
}
|
|
465
|
+
if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
|
|
466
|
+
emitter.emit("vadChange", newState);
|
|
467
|
+
lastVadState = newState;
|
|
468
|
+
updateGain(newState);
|
|
469
|
+
}
|
|
470
|
+
} catch (vadError) {
|
|
471
|
+
const err = vadError instanceof Error ? vadError : new Error(String(vadError));
|
|
472
|
+
console.error("Error in VAD callback:", err);
|
|
473
|
+
emitter.emit("error", err);
|
|
376
474
|
}
|
|
377
|
-
);
|
|
475
|
+
});
|
|
378
476
|
} catch (error) {
|
|
379
477
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
380
478
|
console.error("Failed to create VAD node:", err);
|
|
@@ -391,15 +489,31 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
391
489
|
nsNode.connect(splitter);
|
|
392
490
|
splitter.connect(vadNode);
|
|
393
491
|
const delayNode = context.createDelay(1);
|
|
394
|
-
const preRollSeconds = (fullConfig.vad?.preRollMs ??
|
|
492
|
+
const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
|
|
395
493
|
delayNode.delayTime.value = preRollSeconds;
|
|
396
494
|
const gainNode = context.createGain();
|
|
397
495
|
gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
|
|
496
|
+
let compressor = null;
|
|
497
|
+
if (fullConfig.output?.enableCompression) {
|
|
498
|
+
compressor = context.createDynamicsCompressor();
|
|
499
|
+
const comp = fullConfig.output.compression;
|
|
500
|
+
compressor.threshold.value = comp.threshold ?? -24;
|
|
501
|
+
compressor.ratio.value = comp.ratio ?? 3;
|
|
502
|
+
compressor.attack.value = comp.attack ?? 3e-3;
|
|
503
|
+
compressor.release.value = comp.release ?? 0.05;
|
|
504
|
+
compressor.knee.value = 10;
|
|
505
|
+
}
|
|
398
506
|
const destination = context.createMediaStreamDestination();
|
|
399
507
|
try {
|
|
400
508
|
splitter.connect(delayNode);
|
|
401
509
|
delayNode.connect(gainNode);
|
|
402
|
-
|
|
510
|
+
if (compressor) {
|
|
511
|
+
gainNode.connect(compressor);
|
|
512
|
+
compressor.connect(destination);
|
|
513
|
+
console.log("Compression enabled:", fullConfig.output?.compression);
|
|
514
|
+
} else {
|
|
515
|
+
gainNode.connect(destination);
|
|
516
|
+
}
|
|
403
517
|
} catch (error) {
|
|
404
518
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
405
519
|
console.error("Failed to wire audio pipeline:", err);
|
|
@@ -408,10 +522,24 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
408
522
|
}
|
|
409
523
|
function updateGain(state) {
|
|
410
524
|
try {
|
|
411
|
-
const {
|
|
412
|
-
|
|
525
|
+
const {
|
|
526
|
+
speechGain = 1,
|
|
527
|
+
silenceGain = 0,
|
|
528
|
+
gainRampTime = 0.015,
|
|
529
|
+
smoothTransitions = true,
|
|
530
|
+
maxGainDb = 6
|
|
531
|
+
} = fullConfig.output;
|
|
532
|
+
const maxGainLinear = Math.pow(10, maxGainDb / 20);
|
|
533
|
+
const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
|
|
534
|
+
const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
|
|
413
535
|
const now = context.currentTime;
|
|
414
|
-
|
|
536
|
+
if (smoothTransitions) {
|
|
537
|
+
gainNode.gain.cancelScheduledValues(now);
|
|
538
|
+
gainNode.gain.setValueAtTime(gainNode.gain.value, now);
|
|
539
|
+
gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
|
|
540
|
+
} else {
|
|
541
|
+
gainNode.gain.setValueAtTime(targetGain, now);
|
|
542
|
+
}
|
|
415
543
|
} catch (error) {
|
|
416
544
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
417
545
|
console.error("Failed to update gain:", err);
|
|
@@ -467,6 +595,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
467
595
|
vadNode.disconnect();
|
|
468
596
|
delayNode.disconnect();
|
|
469
597
|
gainNode.disconnect();
|
|
598
|
+
if (compressor) {
|
|
599
|
+
compressor.disconnect();
|
|
600
|
+
}
|
|
470
601
|
destination.stream.getTracks().forEach((t) => t.stop());
|
|
471
602
|
unregisterPipeline();
|
|
472
603
|
} catch (error) {
|
|
@@ -483,7 +614,47 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
483
614
|
try {
|
|
484
615
|
if (newConfig.vad) {
|
|
485
616
|
vadStateMachine.updateConfig(newConfig.vad);
|
|
617
|
+
Object.assign(fullConfig.vad, newConfig.vad);
|
|
618
|
+
if (newConfig.vad.preRollMs !== void 0) {
|
|
619
|
+
const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
|
|
620
|
+
delayNode.delayTime.setValueAtTime(
|
|
621
|
+
preRollSeconds2,
|
|
622
|
+
context.currentTime
|
|
623
|
+
);
|
|
624
|
+
}
|
|
625
|
+
}
|
|
626
|
+
if (newConfig.output) {
|
|
627
|
+
Object.assign(fullConfig.output, newConfig.output);
|
|
628
|
+
updateGain(lastVadState);
|
|
629
|
+
if (compressor && newConfig.output.compression) {
|
|
630
|
+
const comp = newConfig.output.compression;
|
|
631
|
+
if (comp.threshold !== void 0) {
|
|
632
|
+
compressor.threshold.setValueAtTime(
|
|
633
|
+
comp.threshold,
|
|
634
|
+
context.currentTime
|
|
635
|
+
);
|
|
636
|
+
}
|
|
637
|
+
if (comp.ratio !== void 0) {
|
|
638
|
+
compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
|
|
639
|
+
}
|
|
640
|
+
if (comp.attack !== void 0) {
|
|
641
|
+
compressor.attack.setValueAtTime(
|
|
642
|
+
comp.attack,
|
|
643
|
+
context.currentTime
|
|
644
|
+
);
|
|
645
|
+
}
|
|
646
|
+
if (comp.release !== void 0) {
|
|
647
|
+
compressor.release.setValueAtTime(
|
|
648
|
+
comp.release,
|
|
649
|
+
context.currentTime
|
|
650
|
+
);
|
|
651
|
+
}
|
|
652
|
+
}
|
|
653
|
+
}
|
|
654
|
+
if (newConfig.livekit) {
|
|
655
|
+
Object.assign(fullConfig.livekit, newConfig.livekit);
|
|
486
656
|
}
|
|
657
|
+
console.log("Pipeline config updated:", newConfig);
|
|
487
658
|
} catch (error) {
|
|
488
659
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
489
660
|
console.error("Failed to update config:", err);
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-XXTNAUYX.mjs";
|
|
4
|
+
import "../chunk-N553RHTI.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-
|
|
6
|
+
import "../chunk-H5UKZU2Y.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-VEJXAEMM.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -35,46 +35,154 @@ interface AudioProcessingConfig {
|
|
|
35
35
|
vad?: {
|
|
36
36
|
enabled: boolean;
|
|
37
37
|
/**
|
|
38
|
-
* Plugin name to use. Defaults to '
|
|
38
|
+
* Plugin name to use. Defaults to 'energy-vad'.
|
|
39
39
|
*/
|
|
40
40
|
pluginName?: string;
|
|
41
41
|
/**
|
|
42
42
|
* Probability threshold for speech onset (0-1).
|
|
43
|
-
*
|
|
43
|
+
* When VAD probability rises above this, audio is unmuted.
|
|
44
|
+
* Lower = more sensitive (catches quiet speech, may include noise)
|
|
45
|
+
* Higher = less sensitive (only confident speech, may clip quiet parts)
|
|
46
|
+
* Default: 0.6 (optimized for voice-only)
|
|
44
47
|
*/
|
|
45
48
|
startThreshold?: number;
|
|
46
49
|
/**
|
|
47
50
|
* Probability threshold for speech offset (0-1).
|
|
48
|
-
*
|
|
51
|
+
* When VAD probability drops below this (after hangover), audio is muted.
|
|
52
|
+
* Lower = keeps audio on longer (less aggressive gating)
|
|
53
|
+
* Higher = mutes faster (more aggressive noise suppression)
|
|
54
|
+
* Default: 0.45 (balanced voice detection)
|
|
49
55
|
*/
|
|
50
56
|
stopThreshold?: number;
|
|
51
57
|
/**
|
|
52
|
-
* Time in ms to wait after speech stops before
|
|
53
|
-
*
|
|
58
|
+
* Time in ms to wait after speech stops before muting.
|
|
59
|
+
* Prevents rapid on/off toggling during pauses.
|
|
60
|
+
* Lower = more aggressive gating, may clip between words
|
|
61
|
+
* Higher = smoother but may let trailing noise through
|
|
62
|
+
* Default: 400ms (optimized for natural speech)
|
|
54
63
|
*/
|
|
55
64
|
hangoverMs?: number;
|
|
56
65
|
/**
|
|
57
|
-
* Time in ms of audio to buffer before speech onset
|
|
58
|
-
*
|
|
66
|
+
* Time in ms of audio to buffer before speech onset.
|
|
67
|
+
* Prevents cutting off the beginning of speech.
|
|
68
|
+
* Default: 250ms (generous pre-roll for voice)
|
|
59
69
|
*/
|
|
60
70
|
preRollMs?: number;
|
|
71
|
+
/**
|
|
72
|
+
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
+
* Filters out very brief noise spikes.
|
|
74
|
+
* Default: 100ms
|
|
75
|
+
*/
|
|
76
|
+
minSpeechDurationMs?: number;
|
|
77
|
+
/**
|
|
78
|
+
* Minimum silence duration in ms before allowing another speech segment.
|
|
79
|
+
* Prevents false positives from quick noise bursts.
|
|
80
|
+
* Default: 150ms
|
|
81
|
+
*/
|
|
82
|
+
minSilenceDurationMs?: number;
|
|
83
|
+
/**
|
|
84
|
+
* Advanced: Energy VAD specific parameters
|
|
85
|
+
*/
|
|
86
|
+
energyVad?: {
|
|
87
|
+
/**
|
|
88
|
+
* Smoothing factor for energy calculation (0-1).
|
|
89
|
+
* Higher = more smoothing, slower to react
|
|
90
|
+
* Default: 0.95
|
|
91
|
+
*/
|
|
92
|
+
smoothing?: number;
|
|
93
|
+
/**
|
|
94
|
+
* Initial noise floor estimate.
|
|
95
|
+
* Default: 0.001
|
|
96
|
+
*/
|
|
97
|
+
initialNoiseFloor?: number;
|
|
98
|
+
/**
|
|
99
|
+
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
+
* Default: 0.01
|
|
101
|
+
*/
|
|
102
|
+
noiseFloorAdaptRateQuiet?: number;
|
|
103
|
+
/**
|
|
104
|
+
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
+
* Default: 0.001 (slower adaptation for speech)
|
|
106
|
+
*/
|
|
107
|
+
noiseFloorAdaptRateLoud?: number;
|
|
108
|
+
/**
|
|
109
|
+
* Minimum SNR (Signal-to-Noise Ratio) for speech detection.
|
|
110
|
+
* Default: 2.0 (voice is 2x louder than noise floor)
|
|
111
|
+
*/
|
|
112
|
+
minSNR?: number;
|
|
113
|
+
/**
|
|
114
|
+
* SNR range for probability scaling.
|
|
115
|
+
* Default: 8.0 (probability scales from minSNR to minSNR+snrRange)
|
|
116
|
+
*/
|
|
117
|
+
snrRange?: number;
|
|
118
|
+
};
|
|
61
119
|
};
|
|
62
120
|
/**
|
|
63
121
|
* Output gain and muting configuration.
|
|
64
122
|
*/
|
|
65
123
|
output?: {
|
|
66
124
|
/**
|
|
67
|
-
* Gain to apply when speaking (0-
|
|
125
|
+
* Gain to apply when speaking (0-infinity).
|
|
126
|
+
* Values > 1.0 will amplify the voice.
|
|
127
|
+
* Default: 1.0 (unity gain)
|
|
68
128
|
*/
|
|
69
129
|
speechGain?: number;
|
|
70
130
|
/**
|
|
71
|
-
* Gain to apply when silent (0-1).
|
|
131
|
+
* Gain to apply when silent (0-1).
|
|
132
|
+
* 0.0 = complete mute (recommended for voice-only)
|
|
133
|
+
* 0.1-0.3 = allow some background ambience
|
|
134
|
+
* Default: 0.0 (full mute for voice-only)
|
|
72
135
|
*/
|
|
73
136
|
silenceGain?: number;
|
|
74
137
|
/**
|
|
75
|
-
* Time in seconds to ramp gain changes.
|
|
138
|
+
* Time in seconds to ramp gain changes.
|
|
139
|
+
* Lower = faster transitions (may cause clicks)
|
|
140
|
+
* Higher = smoother transitions (may sound sluggish)
|
|
141
|
+
* Default: 0.015 (fast but smooth for voice)
|
|
76
142
|
*/
|
|
77
143
|
gainRampTime?: number;
|
|
144
|
+
/**
|
|
145
|
+
* Apply additional gain reduction during the transition to silence.
|
|
146
|
+
* Helps create cleaner cutoffs without abrupt clicks.
|
|
147
|
+
* Default: true
|
|
148
|
+
*/
|
|
149
|
+
smoothTransitions?: boolean;
|
|
150
|
+
/**
|
|
151
|
+
* Maximum gain in dB to apply (prevents clipping).
|
|
152
|
+
* Default: 6.0 dB (roughly 2x amplitude)
|
|
153
|
+
*/
|
|
154
|
+
maxGainDb?: number;
|
|
155
|
+
/**
|
|
156
|
+
* Apply dynamic range compression when speaking.
|
|
157
|
+
* Makes quiet parts louder and loud parts quieter.
|
|
158
|
+
* Default: false (transparent audio)
|
|
159
|
+
*/
|
|
160
|
+
enableCompression?: boolean;
|
|
161
|
+
/**
|
|
162
|
+
* Compression settings (when enabled)
|
|
163
|
+
*/
|
|
164
|
+
compression?: {
|
|
165
|
+
/**
|
|
166
|
+
* Threshold in dB above which compression starts.
|
|
167
|
+
* Default: -24.0 dB
|
|
168
|
+
*/
|
|
169
|
+
threshold?: number;
|
|
170
|
+
/**
|
|
171
|
+
* Compression ratio (1:N).
|
|
172
|
+
* Default: 3.0 (3:1 ratio)
|
|
173
|
+
*/
|
|
174
|
+
ratio?: number;
|
|
175
|
+
/**
|
|
176
|
+
* Attack time in seconds.
|
|
177
|
+
* Default: 0.003 (3ms)
|
|
178
|
+
*/
|
|
179
|
+
attack?: number;
|
|
180
|
+
/**
|
|
181
|
+
* Release time in seconds.
|
|
182
|
+
* Default: 0.05 (50ms)
|
|
183
|
+
*/
|
|
184
|
+
release?: number;
|
|
185
|
+
};
|
|
78
186
|
};
|
|
79
187
|
/**
|
|
80
188
|
* LiveKit integration configuration.
|