@tensamin/audio 0.1.14 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -231
- package/dist/chunk-6BJ4XGSA.mjs +80 -0
- package/dist/chunk-AQ5RVY33.mjs +74 -0
- package/dist/chunk-IS37FHDN.mjs +33 -0
- package/dist/chunk-K4J3UUOR.mjs +178 -0
- package/dist/chunk-QNQK6QFB.mjs +71 -0
- package/dist/context/audio-context.d.mts +0 -24
- package/dist/context/audio-context.d.ts +0 -24
- package/dist/index.d.mts +2 -8
- package/dist/index.d.ts +2 -8
- package/dist/index.js +285 -680
- package/dist/index.mjs +8 -43
- package/dist/livekit/integration.d.mts +3 -7
- package/dist/livekit/integration.d.ts +3 -7
- package/dist/livekit/integration.js +280 -626
- package/dist/livekit/integration.mjs +7 -8
- package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
- package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
- package/dist/noise-suppression/deepfilternet-node.js +57 -0
- package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
- package/dist/pipeline/audio-pipeline.d.mts +2 -2
- package/dist/pipeline/audio-pipeline.d.ts +2 -2
- package/dist/pipeline/audio-pipeline.js +219 -554
- package/dist/pipeline/audio-pipeline.mjs +4 -5
- package/dist/types.d.mts +42 -257
- package/dist/types.d.ts +42 -257
- package/dist/vad/vad-node.d.mts +7 -9
- package/dist/vad/vad-node.d.ts +7 -9
- package/dist/vad/vad-node.js +47 -156
- package/dist/vad/vad-node.mjs +3 -3
- package/dist/vad/vad-state.d.mts +9 -11
- package/dist/vad/vad-state.d.ts +9 -11
- package/dist/vad/vad-state.js +50 -79
- package/dist/vad/vad-state.mjs +3 -3
- package/package.json +21 -21
- package/dist/chunk-2G2JFHJY.mjs +0 -180
- package/dist/chunk-6F2HZUYO.mjs +0 -91
- package/dist/chunk-K4YLH73B.mjs +0 -103
- package/dist/chunk-R5M2DGAQ.mjs +0 -311
- package/dist/chunk-UFKIAMG3.mjs +0 -47
- package/dist/chunk-XO6B3D4A.mjs +0 -67
- package/dist/extensibility/plugins.d.mts +0 -9
- package/dist/extensibility/plugins.d.ts +0 -9
- package/dist/extensibility/plugins.js +0 -320
- package/dist/extensibility/plugins.mjs +0 -14
- package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
- package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
- package/dist/noise-suppression/rnnoise-node.js +0 -101
- package/dist/noise-suppression/rnnoise-node.mjs +0 -6
|
@@ -56,371 +56,171 @@ function unregisterPipeline() {
|
|
|
56
56
|
activePipelines = Math.max(0, activePipelines - 1);
|
|
57
57
|
}
|
|
58
58
|
|
|
59
|
-
// src/noise-suppression/
|
|
60
|
-
var
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
});
|
|
85
|
-
console.log("RNNoise WASM loaded successfully");
|
|
59
|
+
// src/noise-suppression/deepfilternet-node.ts
|
|
60
|
+
var import_deepfilternet3_noise_filter = require("deepfilternet3-noise-filter");
|
|
61
|
+
async function createDeepFilterNet3Node(context, config) {
|
|
62
|
+
const processorConfig = {
|
|
63
|
+
sampleRate: context.sampleRate,
|
|
64
|
+
noiseReductionLevel: config?.noiseReductionLevel ?? 60
|
|
65
|
+
};
|
|
66
|
+
if (config?.assetConfig) {
|
|
67
|
+
processorConfig.assetConfig = config.assetConfig;
|
|
68
|
+
}
|
|
69
|
+
const processor = new import_deepfilternet3_noise_filter.DeepFilterNet3Processor(processorConfig);
|
|
70
|
+
await processor.initialize();
|
|
71
|
+
const node = await processor.createAudioWorkletNode(context);
|
|
72
|
+
const enabled = config?.enabled ?? true;
|
|
73
|
+
if (!enabled) {
|
|
74
|
+
processor.setNoiseSuppressionEnabled(false);
|
|
75
|
+
}
|
|
76
|
+
return {
|
|
77
|
+
node,
|
|
78
|
+
processor,
|
|
79
|
+
dispose: () => {
|
|
80
|
+
try {
|
|
81
|
+
processor.destroy();
|
|
82
|
+
} catch (error) {
|
|
83
|
+
console.error("Failed to dispose DeepFilterNet3 processor", error);
|
|
86
84
|
}
|
|
87
|
-
} catch (error) {
|
|
88
|
-
const err = new Error(
|
|
89
|
-
`Failed to load RNNoise WASM binary: ${error instanceof Error ? error.message : String(error)}`
|
|
90
|
-
);
|
|
91
|
-
console.error(err);
|
|
92
|
-
throw err;
|
|
93
|
-
}
|
|
94
|
-
const workletUrl = config.workletUrl;
|
|
95
|
-
try {
|
|
96
|
-
await context.audioWorklet.addModule(workletUrl);
|
|
97
|
-
console.log("RNNoise worklet loaded successfully");
|
|
98
|
-
} catch (e) {
|
|
99
|
-
const error = new Error(
|
|
100
|
-
`Failed to load RNNoise worklet from ${workletUrl}: ${e instanceof Error ? e.message : String(e)}. Ensure the workletUrl points to a valid RNNoise worklet script.`
|
|
101
|
-
);
|
|
102
|
-
console.error(error.message);
|
|
103
|
-
throw error;
|
|
104
|
-
}
|
|
105
|
-
try {
|
|
106
|
-
const node = new RnnoiseWorkletNode(context, {
|
|
107
|
-
wasmBinary: this.wasmBuffer,
|
|
108
|
-
maxChannels: 1
|
|
109
|
-
// Mono for now
|
|
110
|
-
});
|
|
111
|
-
console.log("RNNoise worklet node created successfully");
|
|
112
|
-
return node;
|
|
113
|
-
} catch (error) {
|
|
114
|
-
const err = new Error(
|
|
115
|
-
`Failed to create RNNoise worklet node: ${error instanceof Error ? error.message : String(error)}`
|
|
116
|
-
);
|
|
117
|
-
console.error(err);
|
|
118
|
-
throw err;
|
|
119
85
|
}
|
|
120
|
-
}
|
|
121
|
-
}
|
|
86
|
+
};
|
|
87
|
+
}
|
|
122
88
|
|
|
123
89
|
// src/vad/vad-node.ts
|
|
124
|
-
|
|
125
|
-
const energyParams = vadConfig?.energyVad || {};
|
|
126
|
-
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
|
-
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
129
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
130
|
-
const minSNR = energyParams.minSNR ?? 12;
|
|
131
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
132
|
-
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
90
|
+
function createLevelDetectorWorkletCode(smoothing) {
|
|
133
91
|
return `
|
|
134
|
-
class
|
|
92
|
+
class LevelDetectorProcessor extends AudioWorkletProcessor {
|
|
135
93
|
constructor() {
|
|
136
94
|
super();
|
|
95
|
+
this.smoothed = 0;
|
|
137
96
|
this.smoothing = ${smoothing};
|
|
138
|
-
this.energy = 0;
|
|
139
|
-
this.noiseFloor = ${initialNoiseFloor};
|
|
140
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
141
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
142
|
-
this.minSNR = ${minSNR};
|
|
143
|
-
this.snrRange = ${snrRange};
|
|
144
|
-
this.minEnergy = ${minEnergy};
|
|
145
|
-
this.isSpeaking = false;
|
|
146
|
-
|
|
147
|
-
this.port.onmessage = (event) => {
|
|
148
|
-
if (event.data && event.data.isSpeaking !== undefined) {
|
|
149
|
-
this.isSpeaking = event.data.isSpeaking;
|
|
150
|
-
}
|
|
151
|
-
};
|
|
152
97
|
}
|
|
153
98
|
|
|
154
|
-
process(inputs
|
|
99
|
+
process(inputs) {
|
|
155
100
|
const input = inputs[0];
|
|
156
|
-
if (!input ||
|
|
101
|
+
if (!input || input.length === 0) return true;
|
|
157
102
|
const channel = input[0];
|
|
158
|
-
|
|
159
|
-
|
|
103
|
+
if (!channel || channel.length === 0) return true;
|
|
104
|
+
|
|
160
105
|
let sum = 0;
|
|
161
|
-
let peak = 0;
|
|
162
106
|
for (let i = 0; i < channel.length; i++) {
|
|
163
|
-
const sample =
|
|
164
|
-
sum +=
|
|
165
|
-
peak = Math.max(peak, sample);
|
|
107
|
+
const sample = channel[i];
|
|
108
|
+
sum += sample * sample;
|
|
166
109
|
}
|
|
167
|
-
const
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
172
|
-
|
|
173
|
-
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
174
|
-
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
175
|
-
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
176
|
-
const crestFactor = peak / (instantRms + 1e-10);
|
|
177
|
-
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
178
|
-
|
|
179
|
-
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
180
|
-
// This prevents sharp transients from affecting the noise floor
|
|
181
|
-
if (this.energy < this.noiseFloor) {
|
|
182
|
-
// Signal is quieter than noise floor, adapt downwards slowly
|
|
183
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
184
|
-
} else {
|
|
185
|
-
// Calculate SNR based on smoothed energy
|
|
186
|
-
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
187
|
-
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
188
|
-
|
|
189
|
-
// Only adapt upwards if:
|
|
190
|
-
// 1. SNR is low (< 10dB) - likely just background noise
|
|
191
|
-
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
192
|
-
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
193
|
-
// This is persistent background noise, adapt upwards
|
|
194
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
195
|
-
} else {
|
|
196
|
-
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
197
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
198
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
199
|
-
}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
// Ensure noise floor doesn't drop to absolute zero
|
|
203
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
204
|
-
|
|
205
|
-
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
206
|
-
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
207
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
208
|
-
|
|
209
|
-
// Map SNR dB to probability (0-1)
|
|
210
|
-
// Probability is 0 when snrDb <= minSNR
|
|
211
|
-
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
212
|
-
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
213
|
-
|
|
214
|
-
// Apply absolute energy threshold with soft knee
|
|
215
|
-
if (this.energy < this.minEnergy) {
|
|
216
|
-
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
217
|
-
probability *= Math.pow(energyRatio, 2);
|
|
218
|
-
}
|
|
219
|
-
|
|
220
|
-
// Apply crest factor penalty
|
|
221
|
-
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
222
|
-
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
223
|
-
// We penalize anything above 14dB
|
|
224
|
-
if (crestFactorDb > 14) {
|
|
225
|
-
const excess = crestFactorDb - 14;
|
|
226
|
-
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
227
|
-
probability *= penalty;
|
|
228
|
-
}
|
|
229
|
-
|
|
230
|
-
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
231
|
-
|
|
110
|
+
const rms = Math.sqrt(sum / channel.length);
|
|
111
|
+
this.smoothed = this.smoothed * this.smoothing + rms * (1 - this.smoothing);
|
|
112
|
+
const levelDb = 20 * Math.log10(Math.max(1e-8, this.smoothed));
|
|
113
|
+
this.port.postMessage({ levelDb });
|
|
232
114
|
return true;
|
|
233
115
|
}
|
|
234
116
|
}
|
|
235
|
-
|
|
117
|
+
|
|
118
|
+
registerProcessor('level-detector-processor', LevelDetectorProcessor);
|
|
236
119
|
`;
|
|
237
|
-
}
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
}
|
|
247
|
-
const workletCode = createEnergyVadWorkletCode(config);
|
|
248
|
-
const blob = new Blob([workletCode], {
|
|
249
|
-
type: "application/javascript"
|
|
250
|
-
});
|
|
251
|
-
const url = URL.createObjectURL(blob);
|
|
252
|
-
try {
|
|
253
|
-
await context.audioWorklet.addModule(url);
|
|
254
|
-
console.log("Energy VAD worklet loaded successfully");
|
|
255
|
-
} catch (e) {
|
|
256
|
-
const error = new Error(
|
|
257
|
-
`Failed to load Energy VAD worklet: ${e instanceof Error ? e.message : String(e)}`
|
|
258
|
-
);
|
|
259
|
-
console.error(error.message);
|
|
260
|
-
URL.revokeObjectURL(url);
|
|
261
|
-
throw error;
|
|
262
|
-
}
|
|
120
|
+
}
|
|
121
|
+
async function createLevelDetectorNode(context, onLevel, options) {
|
|
122
|
+
const smoothing = options?.smoothing ?? 0.9;
|
|
123
|
+
const workletCode = createLevelDetectorWorkletCode(smoothing);
|
|
124
|
+
const blob = new Blob([workletCode], { type: "application/javascript" });
|
|
125
|
+
const url = URL.createObjectURL(blob);
|
|
126
|
+
try {
|
|
127
|
+
await context.audioWorklet.addModule(url);
|
|
128
|
+
} finally {
|
|
263
129
|
URL.revokeObjectURL(url);
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
);
|
|
273
|
-
console.error(error.message);
|
|
274
|
-
throw error;
|
|
130
|
+
}
|
|
131
|
+
const node = new AudioWorkletNode(context, "level-detector-processor", {
|
|
132
|
+
numberOfInputs: 1,
|
|
133
|
+
numberOfOutputs: 0
|
|
134
|
+
});
|
|
135
|
+
node.port.onmessage = (event) => {
|
|
136
|
+
const { levelDb } = event.data ?? {};
|
|
137
|
+
if (typeof levelDb === "number" && !Number.isNaN(levelDb)) {
|
|
138
|
+
onLevel(levelDb);
|
|
275
139
|
}
|
|
276
|
-
|
|
140
|
+
};
|
|
141
|
+
node.port.onmessageerror = (event) => {
|
|
142
|
+
console.error("Level detector port error", event);
|
|
143
|
+
};
|
|
144
|
+
return {
|
|
145
|
+
node,
|
|
146
|
+
dispose: () => {
|
|
277
147
|
try {
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
onDecision(probability);
|
|
281
|
-
} else {
|
|
282
|
-
console.warn("Invalid VAD probability received:", event.data);
|
|
283
|
-
}
|
|
148
|
+
node.port.onmessage = null;
|
|
149
|
+
node.port.close();
|
|
284
150
|
} catch (error) {
|
|
285
|
-
console.error("
|
|
151
|
+
console.error("Failed to dispose level detector node", error);
|
|
286
152
|
}
|
|
287
|
-
};
|
|
288
|
-
node.port.onmessageerror = (event) => {
|
|
289
|
-
console.error("VAD port message error:", event);
|
|
290
|
-
};
|
|
291
|
-
return node;
|
|
292
|
-
}
|
|
293
|
-
updateSpeakingState(isSpeaking) {
|
|
294
|
-
if (this.workletNode) {
|
|
295
|
-
this.workletNode.port.postMessage({ isSpeaking });
|
|
296
153
|
}
|
|
297
|
-
}
|
|
298
|
-
};
|
|
299
|
-
|
|
300
|
-
// src/extensibility/plugins.ts
|
|
301
|
-
var nsPlugins = /* @__PURE__ */ new Map();
|
|
302
|
-
var vadPlugins = /* @__PURE__ */ new Map();
|
|
303
|
-
var defaultNs = new RNNoisePlugin();
|
|
304
|
-
nsPlugins.set(defaultNs.name, defaultNs);
|
|
305
|
-
var defaultVad = new EnergyVADPlugin();
|
|
306
|
-
vadPlugins.set(defaultVad.name, defaultVad);
|
|
307
|
-
function getNoiseSuppressionPlugin(name) {
|
|
308
|
-
if (!name) return defaultNs;
|
|
309
|
-
const plugin = nsPlugins.get(name);
|
|
310
|
-
if (!plugin) {
|
|
311
|
-
console.warn(
|
|
312
|
-
`Noise suppression plugin '${name}' not found, falling back to default.`
|
|
313
|
-
);
|
|
314
|
-
return defaultNs;
|
|
315
|
-
}
|
|
316
|
-
return plugin;
|
|
317
|
-
}
|
|
318
|
-
function getVADPlugin(name) {
|
|
319
|
-
if (!name) return defaultVad;
|
|
320
|
-
const plugin = vadPlugins.get(name);
|
|
321
|
-
if (!plugin) {
|
|
322
|
-
console.warn(`VAD plugin '${name}' not found, falling back to default.`);
|
|
323
|
-
return defaultVad;
|
|
324
|
-
}
|
|
325
|
-
return plugin;
|
|
154
|
+
};
|
|
326
155
|
}
|
|
327
156
|
|
|
328
157
|
// src/vad/vad-state.ts
|
|
329
|
-
var
|
|
158
|
+
var LevelBasedVAD = class {
|
|
330
159
|
config;
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
lastSilenceTime = 0;
|
|
335
|
-
frameDurationMs = 20;
|
|
336
|
-
// Assumed frame duration, updated by calls
|
|
160
|
+
speaking = false;
|
|
161
|
+
pendingSpeechSince = null;
|
|
162
|
+
pendingSilenceSince = null;
|
|
337
163
|
constructor(config) {
|
|
338
164
|
this.config = {
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
hangoverMs: config?.hangoverMs ?? 300,
|
|
347
|
-
// Smooth for natural speech
|
|
348
|
-
preRollMs: config?.preRollMs ?? 250,
|
|
349
|
-
// Generous pre-roll
|
|
350
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
351
|
-
// Aggressive transient rejection
|
|
352
|
-
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
353
|
-
energyVad: {
|
|
354
|
-
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
355
|
-
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
356
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
357
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
358
|
-
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
359
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
360
|
-
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
361
|
-
}
|
|
165
|
+
minDb: config.minDb,
|
|
166
|
+
maxDb: config.maxDb,
|
|
167
|
+
speakOnRatio: config.speakOnRatio ?? 0.6,
|
|
168
|
+
speakOffRatio: config.speakOffRatio ?? 0.3,
|
|
169
|
+
hangoverMs: config.hangoverMs ?? 350,
|
|
170
|
+
attackMs: config.attackMs ?? 50,
|
|
171
|
+
releaseMs: config.releaseMs ?? 120
|
|
362
172
|
};
|
|
363
|
-
this.lastSilenceTime = Date.now();
|
|
364
173
|
}
|
|
365
174
|
updateConfig(config) {
|
|
366
|
-
this.config = {
|
|
175
|
+
this.config = {
|
|
176
|
+
...this.config,
|
|
177
|
+
...config,
|
|
178
|
+
speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
|
|
179
|
+
speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
|
|
180
|
+
hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
|
|
181
|
+
attackMs: config.attackMs ?? this.config.attackMs,
|
|
182
|
+
releaseMs: config.releaseMs ?? this.config.releaseMs
|
|
183
|
+
};
|
|
367
184
|
}
|
|
368
|
-
|
|
185
|
+
process(levelDb, timestampMs) {
|
|
369
186
|
const {
|
|
370
|
-
|
|
371
|
-
|
|
187
|
+
minDb,
|
|
188
|
+
maxDb,
|
|
189
|
+
speakOnRatio,
|
|
190
|
+
speakOffRatio,
|
|
372
191
|
hangoverMs,
|
|
373
|
-
|
|
374
|
-
|
|
192
|
+
attackMs,
|
|
193
|
+
releaseMs
|
|
375
194
|
} = this.config;
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
this.
|
|
383
|
-
this.
|
|
384
|
-
|
|
385
|
-
newState = "silent";
|
|
195
|
+
const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
|
|
196
|
+
const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
|
|
197
|
+
if (!this.speaking) {
|
|
198
|
+
if (norm >= speakOnRatio) {
|
|
199
|
+
this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
|
|
200
|
+
if (timestampMs - this.pendingSpeechSince >= attackMs) {
|
|
201
|
+
this.speaking = true;
|
|
202
|
+
this.pendingSpeechSince = null;
|
|
203
|
+
this.pendingSilenceSince = null;
|
|
386
204
|
}
|
|
387
205
|
} else {
|
|
388
|
-
|
|
389
|
-
this.lastSilenceTime = timestamp;
|
|
206
|
+
this.pendingSpeechSince = null;
|
|
390
207
|
}
|
|
391
|
-
} else
|
|
392
|
-
if (
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
208
|
+
} else {
|
|
209
|
+
if (norm <= speakOffRatio) {
|
|
210
|
+
this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
|
|
211
|
+
const releaseWindow = Math.max(releaseMs, hangoverMs);
|
|
212
|
+
if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
|
|
213
|
+
this.speaking = false;
|
|
214
|
+
this.pendingSilenceSince = null;
|
|
215
|
+
this.pendingSpeechSince = null;
|
|
398
216
|
}
|
|
399
|
-
this.lastSpeechTime = timestamp;
|
|
400
217
|
} else {
|
|
401
|
-
|
|
402
|
-
this.lastSilenceTime = timestamp;
|
|
403
|
-
}
|
|
404
|
-
} else if (this.currentState === "speaking") {
|
|
405
|
-
if (probability >= stopThreshold) {
|
|
406
|
-
newState = "speaking";
|
|
407
|
-
this.lastSpeechTime = timestamp;
|
|
408
|
-
} else {
|
|
409
|
-
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
410
|
-
if (timeSinceSpeech < hangoverMs) {
|
|
411
|
-
newState = "speaking";
|
|
412
|
-
} else {
|
|
413
|
-
newState = "speech_ending";
|
|
414
|
-
this.lastSilenceTime = timestamp;
|
|
415
|
-
}
|
|
218
|
+
this.pendingSilenceSince = null;
|
|
416
219
|
}
|
|
417
220
|
}
|
|
418
|
-
if (newState === "speech_ending") newState = "silent";
|
|
419
|
-
this.currentState = newState;
|
|
420
221
|
return {
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
state: newState
|
|
222
|
+
speaking: this.speaking,
|
|
223
|
+
levelDb: clamped
|
|
424
224
|
};
|
|
425
225
|
}
|
|
426
226
|
};
|
|
@@ -429,58 +229,33 @@ var VADStateMachine = class {
|
|
|
429
229
|
async function createAudioPipeline(sourceTrack, config = {}) {
|
|
430
230
|
const context = getAudioContext();
|
|
431
231
|
registerPipeline();
|
|
432
|
-
const
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
232
|
+
const nsConfig = {
|
|
233
|
+
enabled: config.noiseSuppression?.enabled ?? true,
|
|
234
|
+
noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
|
|
235
|
+
};
|
|
236
|
+
if (config.noiseSuppression?.assetConfig) {
|
|
237
|
+
nsConfig.assetConfig = config.noiseSuppression.assetConfig;
|
|
238
|
+
}
|
|
436
239
|
const fullConfig = {
|
|
437
|
-
noiseSuppression:
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
hangoverMs: 400,
|
|
447
|
-
preRollMs: 250,
|
|
448
|
-
minSpeechDurationMs: 100,
|
|
449
|
-
minSilenceDurationMs: 150,
|
|
450
|
-
energyVad: {
|
|
451
|
-
smoothing: 0.95,
|
|
452
|
-
initialNoiseFloor: 1e-3,
|
|
453
|
-
noiseFloorAdaptRateQuiet: 0.01,
|
|
454
|
-
noiseFloorAdaptRateLoud: 1e-3,
|
|
455
|
-
minSNR: 2,
|
|
456
|
-
snrRange: 8
|
|
457
|
-
},
|
|
458
|
-
...config.vad
|
|
240
|
+
noiseSuppression: nsConfig,
|
|
241
|
+
speaking: {
|
|
242
|
+
minDb: config.speaking?.minDb ?? -60,
|
|
243
|
+
maxDb: config.speaking?.maxDb ?? -20,
|
|
244
|
+
speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
|
|
245
|
+
speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
|
|
246
|
+
hangoverMs: config.speaking?.hangoverMs ?? 350,
|
|
247
|
+
attackMs: config.speaking?.attackMs ?? 50,
|
|
248
|
+
releaseMs: config.speaking?.releaseMs ?? 120
|
|
459
249
|
},
|
|
460
250
|
output: {
|
|
461
|
-
speechGain: 1,
|
|
462
|
-
silenceGain: 0,
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
smoothTransitions: true,
|
|
467
|
-
maxGainDb: 6,
|
|
468
|
-
enableCompression: false,
|
|
469
|
-
compression: {
|
|
470
|
-
threshold: -24,
|
|
471
|
-
ratio: 3,
|
|
472
|
-
attack: 3e-3,
|
|
473
|
-
release: 0.05
|
|
474
|
-
},
|
|
475
|
-
...config.output
|
|
251
|
+
speechGain: config.output?.speechGain ?? 1,
|
|
252
|
+
silenceGain: config.output?.silenceGain ?? 0,
|
|
253
|
+
gainRampTime: config.output?.gainRampTime ?? 0.015,
|
|
254
|
+
maxGainDb: config.output?.maxGainDb ?? 6,
|
|
255
|
+
smoothTransitions: config.output?.smoothTransitions ?? true
|
|
476
256
|
},
|
|
477
|
-
|
|
257
|
+
muteWhenSilent: config.muteWhenSilent ?? false
|
|
478
258
|
};
|
|
479
|
-
console.log("Audio pipeline config:", {
|
|
480
|
-
noiseSuppression: fullConfig.noiseSuppression?.enabled,
|
|
481
|
-
vad: fullConfig.vad?.enabled,
|
|
482
|
-
output: fullConfig.output
|
|
483
|
-
});
|
|
484
259
|
if (!sourceTrack || sourceTrack.kind !== "audio") {
|
|
485
260
|
throw new Error(
|
|
486
261
|
"createAudioPipeline requires a valid audio MediaStreamTrack"
|
|
@@ -491,232 +266,122 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
491
266
|
}
|
|
492
267
|
const sourceStream = new MediaStream([sourceTrack]);
|
|
493
268
|
const sourceNode = context.createMediaStreamSource(sourceStream);
|
|
494
|
-
let nsNode;
|
|
495
|
-
let vadNode;
|
|
496
269
|
const emitter = (0, import_mitt.default)();
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
try {
|
|
514
|
-
const timestamp = context.currentTime * 1e3;
|
|
515
|
-
const newState = vadStateMachine.processFrame(prob, timestamp);
|
|
516
|
-
if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
|
|
517
|
-
vadPlugin.updateSpeakingState(newState.isSpeaking);
|
|
518
|
-
}
|
|
519
|
-
if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
|
|
520
|
-
emitter.emit("vadChange", newState);
|
|
521
|
-
lastVadState = newState;
|
|
522
|
-
updateGain(newState);
|
|
523
|
-
}
|
|
524
|
-
} catch (vadError) {
|
|
525
|
-
const err = vadError instanceof Error ? vadError : new Error(String(vadError));
|
|
526
|
-
console.error("Error in VAD callback:", err);
|
|
527
|
-
emitter.emit("error", err);
|
|
270
|
+
const vad = new LevelBasedVAD(fullConfig.speaking);
|
|
271
|
+
let lastState = { speaking: false, levelDb: -Infinity };
|
|
272
|
+
const nsHandle = await createDeepFilterNet3Node(
|
|
273
|
+
context,
|
|
274
|
+
fullConfig.noiseSuppression
|
|
275
|
+
);
|
|
276
|
+
const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
|
|
277
|
+
try {
|
|
278
|
+
const timestamp = context.currentTime * 1e3;
|
|
279
|
+
const nextState = vad.process(levelDb, timestamp);
|
|
280
|
+
const speakingChanged = nextState.speaking !== lastState.speaking;
|
|
281
|
+
const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
|
|
282
|
+
if (speakingChanged || levelChanged) {
|
|
283
|
+
lastState = nextState;
|
|
284
|
+
updateGain(nextState);
|
|
285
|
+
emitter.emit("speakingChange", nextState);
|
|
528
286
|
}
|
|
529
|
-
})
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
throw err;
|
|
535
|
-
}
|
|
536
|
-
let lastVadState = {
|
|
537
|
-
isSpeaking: false,
|
|
538
|
-
probability: 0,
|
|
539
|
-
state: "silent"
|
|
540
|
-
};
|
|
287
|
+
} catch (error) {
|
|
288
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
289
|
+
emitter.emit("error", err);
|
|
290
|
+
}
|
|
291
|
+
});
|
|
541
292
|
const splitter = context.createGain();
|
|
542
|
-
sourceNode.connect(
|
|
543
|
-
|
|
544
|
-
splitter.connect(
|
|
545
|
-
const delayNode = context.createDelay(1);
|
|
546
|
-
const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
|
|
547
|
-
delayNode.delayTime.value = preRollSeconds;
|
|
293
|
+
sourceNode.connect(nsHandle.node);
|
|
294
|
+
nsHandle.node.connect(splitter);
|
|
295
|
+
splitter.connect(levelHandle.node);
|
|
548
296
|
const gainNode = context.createGain();
|
|
549
297
|
gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
|
|
550
|
-
|
|
551
|
-
if (fullConfig.output?.enableCompression) {
|
|
552
|
-
compressor = context.createDynamicsCompressor();
|
|
553
|
-
const comp = fullConfig.output.compression;
|
|
554
|
-
compressor.threshold.value = comp.threshold ?? -24;
|
|
555
|
-
compressor.ratio.value = comp.ratio ?? 3;
|
|
556
|
-
compressor.attack.value = comp.attack ?? 3e-3;
|
|
557
|
-
compressor.release.value = comp.release ?? 0.05;
|
|
558
|
-
compressor.knee.value = 10;
|
|
559
|
-
}
|
|
298
|
+
splitter.connect(gainNode);
|
|
560
299
|
const destination = context.createMediaStreamDestination();
|
|
561
|
-
|
|
562
|
-
splitter.connect(delayNode);
|
|
563
|
-
delayNode.connect(gainNode);
|
|
564
|
-
if (compressor) {
|
|
565
|
-
gainNode.connect(compressor);
|
|
566
|
-
compressor.connect(destination);
|
|
567
|
-
console.log("Compression enabled:", fullConfig.output?.compression);
|
|
568
|
-
} else {
|
|
569
|
-
gainNode.connect(destination);
|
|
570
|
-
}
|
|
571
|
-
} catch (error) {
|
|
572
|
-
const err = error instanceof Error ? error : new Error(String(error));
|
|
573
|
-
console.error("Failed to wire audio pipeline:", err);
|
|
574
|
-
emitter.emit("error", err);
|
|
575
|
-
throw err;
|
|
576
|
-
}
|
|
300
|
+
gainNode.connect(destination);
|
|
577
301
|
function updateGain(state) {
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
gainNode.gain.setValueAtTime(targetGain, now);
|
|
596
|
-
}
|
|
597
|
-
} catch (error) {
|
|
598
|
-
const err = error instanceof Error ? error : new Error(String(error));
|
|
599
|
-
console.error("Failed to update gain:", err);
|
|
600
|
-
emitter.emit("error", err);
|
|
302
|
+
const {
|
|
303
|
+
speechGain = 1,
|
|
304
|
+
silenceGain = 0,
|
|
305
|
+
gainRampTime = 0.015,
|
|
306
|
+
smoothTransitions = true,
|
|
307
|
+
maxGainDb = 6
|
|
308
|
+
} = fullConfig.output ?? {};
|
|
309
|
+
const maxGainLinear = Math.pow(10, maxGainDb / 20);
|
|
310
|
+
const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
|
|
311
|
+
const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
|
|
312
|
+
const now = context.currentTime;
|
|
313
|
+
gainNode.gain.cancelScheduledValues(now);
|
|
314
|
+
gainNode.gain.setValueAtTime(gainNode.gain.value, now);
|
|
315
|
+
if (smoothTransitions) {
|
|
316
|
+
gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
|
|
317
|
+
} else {
|
|
318
|
+
gainNode.gain.setValueAtTime(target, now);
|
|
601
319
|
}
|
|
602
320
|
}
|
|
603
321
|
const audioTracks = destination.stream.getAudioTracks();
|
|
604
|
-
console.log("Destination stream tracks:", {
|
|
605
|
-
count: audioTracks.length,
|
|
606
|
-
tracks: audioTracks.map((t) => ({
|
|
607
|
-
id: t.id,
|
|
608
|
-
label: t.label,
|
|
609
|
-
enabled: t.enabled,
|
|
610
|
-
readyState: t.readyState
|
|
611
|
-
}))
|
|
612
|
-
});
|
|
613
322
|
if (audioTracks.length === 0) {
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
);
|
|
617
|
-
|
|
618
|
-
emitter.emit("error", err);
|
|
619
|
-
throw err;
|
|
323
|
+
nsHandle.dispose();
|
|
324
|
+
levelHandle.dispose();
|
|
325
|
+
unregisterPipeline();
|
|
326
|
+
throw new Error("Failed to create processed audio track");
|
|
620
327
|
}
|
|
621
328
|
const processedTrack = audioTracks[0];
|
|
622
|
-
if (!processedTrack || processedTrack.readyState === "ended") {
|
|
623
|
-
const err = new Error("Processed audio track is invalid or ended");
|
|
624
|
-
console.error(err);
|
|
625
|
-
emitter.emit("error", err);
|
|
626
|
-
throw err;
|
|
627
|
-
}
|
|
628
|
-
console.log("Audio pipeline created successfully:", {
|
|
629
|
-
sourceTrack: {
|
|
630
|
-
id: sourceTrack.id,
|
|
631
|
-
label: sourceTrack.label,
|
|
632
|
-
readyState: sourceTrack.readyState
|
|
633
|
-
},
|
|
634
|
-
processedTrack: {
|
|
635
|
-
id: processedTrack.id,
|
|
636
|
-
label: processedTrack.label,
|
|
637
|
-
readyState: processedTrack.readyState
|
|
638
|
-
},
|
|
639
|
-
config: {
|
|
640
|
-
noiseSuppression: fullConfig.noiseSuppression?.enabled,
|
|
641
|
-
vad: fullConfig.vad?.enabled
|
|
642
|
-
}
|
|
643
|
-
});
|
|
644
329
|
function dispose() {
|
|
645
330
|
try {
|
|
646
331
|
sourceNode.disconnect();
|
|
647
|
-
|
|
332
|
+
nsHandle.node.disconnect();
|
|
648
333
|
splitter.disconnect();
|
|
649
|
-
|
|
650
|
-
delayNode.disconnect();
|
|
334
|
+
levelHandle.node.disconnect();
|
|
651
335
|
gainNode.disconnect();
|
|
652
|
-
if (compressor) {
|
|
653
|
-
compressor.disconnect();
|
|
654
|
-
}
|
|
655
336
|
destination.stream.getTracks().forEach((t) => t.stop());
|
|
656
|
-
|
|
337
|
+
levelHandle.dispose();
|
|
338
|
+
nsHandle.dispose();
|
|
657
339
|
} catch (error) {
|
|
658
|
-
console.error("Error during pipeline disposal
|
|
340
|
+
console.error("Error during pipeline disposal", error);
|
|
341
|
+
} finally {
|
|
342
|
+
unregisterPipeline();
|
|
659
343
|
}
|
|
660
344
|
}
|
|
661
|
-
|
|
345
|
+
const handle = {
|
|
662
346
|
processedTrack,
|
|
663
347
|
events: emitter,
|
|
664
348
|
get state() {
|
|
665
|
-
return
|
|
349
|
+
return lastState;
|
|
666
350
|
},
|
|
667
|
-
setConfig: (
|
|
351
|
+
setConfig: (next) => {
|
|
668
352
|
try {
|
|
669
|
-
if (
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
context.currentTime
|
|
677
|
-
);
|
|
678
|
-
}
|
|
353
|
+
if (next.speaking) {
|
|
354
|
+
vad.updateConfig(next.speaking);
|
|
355
|
+
fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
|
|
356
|
+
}
|
|
357
|
+
if (next.output) {
|
|
358
|
+
fullConfig.output = { ...fullConfig.output, ...next.output };
|
|
359
|
+
updateGain(lastState);
|
|
679
360
|
}
|
|
680
|
-
if (
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
if (comp.ratio !== void 0) {
|
|
692
|
-
compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
|
|
693
|
-
}
|
|
694
|
-
if (comp.attack !== void 0) {
|
|
695
|
-
compressor.attack.setValueAtTime(
|
|
696
|
-
comp.attack,
|
|
697
|
-
context.currentTime
|
|
698
|
-
);
|
|
699
|
-
}
|
|
700
|
-
if (comp.release !== void 0) {
|
|
701
|
-
compressor.release.setValueAtTime(
|
|
702
|
-
comp.release,
|
|
703
|
-
context.currentTime
|
|
704
|
-
);
|
|
705
|
-
}
|
|
361
|
+
if (next.noiseSuppression) {
|
|
362
|
+
const ns = next.noiseSuppression;
|
|
363
|
+
fullConfig.noiseSuppression = {
|
|
364
|
+
...fullConfig.noiseSuppression,
|
|
365
|
+
...ns
|
|
366
|
+
};
|
|
367
|
+
if (typeof ns.noiseReductionLevel === "number") {
|
|
368
|
+
nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
|
|
369
|
+
}
|
|
370
|
+
if (typeof ns.enabled === "boolean") {
|
|
371
|
+
nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
|
|
706
372
|
}
|
|
707
373
|
}
|
|
708
|
-
if (
|
|
709
|
-
|
|
374
|
+
if (typeof next.muteWhenSilent === "boolean") {
|
|
375
|
+
fullConfig.muteWhenSilent = next.muteWhenSilent;
|
|
710
376
|
}
|
|
711
|
-
console.log("Pipeline config updated:", newConfig);
|
|
712
377
|
} catch (error) {
|
|
713
378
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
714
|
-
console.error("Failed to update config:", err);
|
|
715
379
|
emitter.emit("error", err);
|
|
716
380
|
}
|
|
717
381
|
},
|
|
718
382
|
dispose
|
|
719
383
|
};
|
|
384
|
+
return handle;
|
|
720
385
|
}
|
|
721
386
|
// Annotate the CommonJS export names for ESM import in node:
|
|
722
387
|
0 && (module.exports = {
|