@tensamin/audio 0.1.14 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -231
- package/dist/chunk-6BJ4XGSA.mjs +80 -0
- package/dist/chunk-AQ5RVY33.mjs +74 -0
- package/dist/chunk-IS37FHDN.mjs +33 -0
- package/dist/chunk-K4J3UUOR.mjs +178 -0
- package/dist/chunk-QNQK6QFB.mjs +71 -0
- package/dist/context/audio-context.d.mts +0 -24
- package/dist/context/audio-context.d.ts +0 -24
- package/dist/index.d.mts +2 -8
- package/dist/index.d.ts +2 -8
- package/dist/index.js +285 -680
- package/dist/index.mjs +8 -43
- package/dist/livekit/integration.d.mts +3 -7
- package/dist/livekit/integration.d.ts +3 -7
- package/dist/livekit/integration.js +280 -626
- package/dist/livekit/integration.mjs +7 -8
- package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
- package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
- package/dist/noise-suppression/deepfilternet-node.js +57 -0
- package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
- package/dist/pipeline/audio-pipeline.d.mts +2 -2
- package/dist/pipeline/audio-pipeline.d.ts +2 -2
- package/dist/pipeline/audio-pipeline.js +219 -554
- package/dist/pipeline/audio-pipeline.mjs +4 -5
- package/dist/types.d.mts +42 -257
- package/dist/types.d.ts +42 -257
- package/dist/vad/vad-node.d.mts +7 -9
- package/dist/vad/vad-node.d.ts +7 -9
- package/dist/vad/vad-node.js +47 -156
- package/dist/vad/vad-node.mjs +3 -3
- package/dist/vad/vad-state.d.mts +9 -11
- package/dist/vad/vad-state.d.ts +9 -11
- package/dist/vad/vad-state.js +50 -79
- package/dist/vad/vad-state.mjs +3 -3
- package/package.json +21 -21
- package/dist/chunk-2G2JFHJY.mjs +0 -180
- package/dist/chunk-6F2HZUYO.mjs +0 -91
- package/dist/chunk-K4YLH73B.mjs +0 -103
- package/dist/chunk-R5M2DGAQ.mjs +0 -311
- package/dist/chunk-UFKIAMG3.mjs +0 -47
- package/dist/chunk-XO6B3D4A.mjs +0 -67
- package/dist/extensibility/plugins.d.mts +0 -9
- package/dist/extensibility/plugins.d.ts +0 -9
- package/dist/extensibility/plugins.js +0 -320
- package/dist/extensibility/plugins.mjs +0 -14
- package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
- package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
- package/dist/noise-suppression/rnnoise-node.js +0 -101
- package/dist/noise-suppression/rnnoise-node.mjs +0 -6
|
@@ -30,9 +30,10 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/livekit/integration.ts
|
|
31
31
|
var integration_exports = {};
|
|
32
32
|
__export(integration_exports, {
|
|
33
|
-
|
|
33
|
+
attachSpeakingDetectionToTrack: () => attachSpeakingDetectionToTrack
|
|
34
34
|
});
|
|
35
35
|
module.exports = __toCommonJS(integration_exports);
|
|
36
|
+
var import_mitt2 = require("mitt");
|
|
36
37
|
|
|
37
38
|
// src/pipeline/audio-pipeline.ts
|
|
38
39
|
var import_mitt = __toESM(require("mitt"));
|
|
@@ -58,371 +59,171 @@ function unregisterPipeline() {
|
|
|
58
59
|
activePipelines = Math.max(0, activePipelines - 1);
|
|
59
60
|
}
|
|
60
61
|
|
|
61
|
-
// src/noise-suppression/
|
|
62
|
-
var
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
});
|
|
87
|
-
console.log("RNNoise WASM loaded successfully");
|
|
62
|
+
// src/noise-suppression/deepfilternet-node.ts
|
|
63
|
+
var import_deepfilternet3_noise_filter = require("deepfilternet3-noise-filter");
|
|
64
|
+
async function createDeepFilterNet3Node(context, config) {
|
|
65
|
+
const processorConfig = {
|
|
66
|
+
sampleRate: context.sampleRate,
|
|
67
|
+
noiseReductionLevel: config?.noiseReductionLevel ?? 60
|
|
68
|
+
};
|
|
69
|
+
if (config?.assetConfig) {
|
|
70
|
+
processorConfig.assetConfig = config.assetConfig;
|
|
71
|
+
}
|
|
72
|
+
const processor = new import_deepfilternet3_noise_filter.DeepFilterNet3Processor(processorConfig);
|
|
73
|
+
await processor.initialize();
|
|
74
|
+
const node = await processor.createAudioWorkletNode(context);
|
|
75
|
+
const enabled = config?.enabled ?? true;
|
|
76
|
+
if (!enabled) {
|
|
77
|
+
processor.setNoiseSuppressionEnabled(false);
|
|
78
|
+
}
|
|
79
|
+
return {
|
|
80
|
+
node,
|
|
81
|
+
processor,
|
|
82
|
+
dispose: () => {
|
|
83
|
+
try {
|
|
84
|
+
processor.destroy();
|
|
85
|
+
} catch (error) {
|
|
86
|
+
console.error("Failed to dispose DeepFilterNet3 processor", error);
|
|
88
87
|
}
|
|
89
|
-
} catch (error) {
|
|
90
|
-
const err = new Error(
|
|
91
|
-
`Failed to load RNNoise WASM binary: ${error instanceof Error ? error.message : String(error)}`
|
|
92
|
-
);
|
|
93
|
-
console.error(err);
|
|
94
|
-
throw err;
|
|
95
|
-
}
|
|
96
|
-
const workletUrl = config.workletUrl;
|
|
97
|
-
try {
|
|
98
|
-
await context.audioWorklet.addModule(workletUrl);
|
|
99
|
-
console.log("RNNoise worklet loaded successfully");
|
|
100
|
-
} catch (e) {
|
|
101
|
-
const error = new Error(
|
|
102
|
-
`Failed to load RNNoise worklet from ${workletUrl}: ${e instanceof Error ? e.message : String(e)}. Ensure the workletUrl points to a valid RNNoise worklet script.`
|
|
103
|
-
);
|
|
104
|
-
console.error(error.message);
|
|
105
|
-
throw error;
|
|
106
88
|
}
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
wasmBinary: this.wasmBuffer,
|
|
110
|
-
maxChannels: 1
|
|
111
|
-
// Mono for now
|
|
112
|
-
});
|
|
113
|
-
console.log("RNNoise worklet node created successfully");
|
|
114
|
-
return node;
|
|
115
|
-
} catch (error) {
|
|
116
|
-
const err = new Error(
|
|
117
|
-
`Failed to create RNNoise worklet node: ${error instanceof Error ? error.message : String(error)}`
|
|
118
|
-
);
|
|
119
|
-
console.error(err);
|
|
120
|
-
throw err;
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
};
|
|
89
|
+
};
|
|
90
|
+
}
|
|
124
91
|
|
|
125
92
|
// src/vad/vad-node.ts
|
|
126
|
-
|
|
127
|
-
const energyParams = vadConfig?.energyVad || {};
|
|
128
|
-
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
|
-
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
131
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
132
|
-
const minSNR = energyParams.minSNR ?? 12;
|
|
133
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
134
|
-
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
93
|
+
function createLevelDetectorWorkletCode(smoothing) {
|
|
135
94
|
return `
|
|
136
|
-
class
|
|
95
|
+
class LevelDetectorProcessor extends AudioWorkletProcessor {
|
|
137
96
|
constructor() {
|
|
138
97
|
super();
|
|
98
|
+
this.smoothed = 0;
|
|
139
99
|
this.smoothing = ${smoothing};
|
|
140
|
-
this.energy = 0;
|
|
141
|
-
this.noiseFloor = ${initialNoiseFloor};
|
|
142
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
143
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
144
|
-
this.minSNR = ${minSNR};
|
|
145
|
-
this.snrRange = ${snrRange};
|
|
146
|
-
this.minEnergy = ${minEnergy};
|
|
147
|
-
this.isSpeaking = false;
|
|
148
|
-
|
|
149
|
-
this.port.onmessage = (event) => {
|
|
150
|
-
if (event.data && event.data.isSpeaking !== undefined) {
|
|
151
|
-
this.isSpeaking = event.data.isSpeaking;
|
|
152
|
-
}
|
|
153
|
-
};
|
|
154
100
|
}
|
|
155
101
|
|
|
156
|
-
process(inputs
|
|
102
|
+
process(inputs) {
|
|
157
103
|
const input = inputs[0];
|
|
158
|
-
if (!input ||
|
|
104
|
+
if (!input || input.length === 0) return true;
|
|
159
105
|
const channel = input[0];
|
|
160
|
-
|
|
161
|
-
|
|
106
|
+
if (!channel || channel.length === 0) return true;
|
|
107
|
+
|
|
162
108
|
let sum = 0;
|
|
163
|
-
let peak = 0;
|
|
164
109
|
for (let i = 0; i < channel.length; i++) {
|
|
165
|
-
const sample =
|
|
166
|
-
sum +=
|
|
167
|
-
peak = Math.max(peak, sample);
|
|
110
|
+
const sample = channel[i];
|
|
111
|
+
sum += sample * sample;
|
|
168
112
|
}
|
|
169
|
-
const
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
174
|
-
|
|
175
|
-
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
176
|
-
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
177
|
-
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
178
|
-
const crestFactor = peak / (instantRms + 1e-10);
|
|
179
|
-
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
180
|
-
|
|
181
|
-
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
182
|
-
// This prevents sharp transients from affecting the noise floor
|
|
183
|
-
if (this.energy < this.noiseFloor) {
|
|
184
|
-
// Signal is quieter than noise floor, adapt downwards slowly
|
|
185
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
186
|
-
} else {
|
|
187
|
-
// Calculate SNR based on smoothed energy
|
|
188
|
-
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
189
|
-
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
190
|
-
|
|
191
|
-
// Only adapt upwards if:
|
|
192
|
-
// 1. SNR is low (< 10dB) - likely just background noise
|
|
193
|
-
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
194
|
-
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
195
|
-
// This is persistent background noise, adapt upwards
|
|
196
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
197
|
-
} else {
|
|
198
|
-
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
199
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
200
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
201
|
-
}
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
// Ensure noise floor doesn't drop to absolute zero
|
|
205
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
206
|
-
|
|
207
|
-
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
208
|
-
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
209
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
210
|
-
|
|
211
|
-
// Map SNR dB to probability (0-1)
|
|
212
|
-
// Probability is 0 when snrDb <= minSNR
|
|
213
|
-
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
214
|
-
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
215
|
-
|
|
216
|
-
// Apply absolute energy threshold with soft knee
|
|
217
|
-
if (this.energy < this.minEnergy) {
|
|
218
|
-
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
219
|
-
probability *= Math.pow(energyRatio, 2);
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
// Apply crest factor penalty
|
|
223
|
-
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
224
|
-
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
225
|
-
// We penalize anything above 14dB
|
|
226
|
-
if (crestFactorDb > 14) {
|
|
227
|
-
const excess = crestFactorDb - 14;
|
|
228
|
-
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
229
|
-
probability *= penalty;
|
|
230
|
-
}
|
|
231
|
-
|
|
232
|
-
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
233
|
-
|
|
113
|
+
const rms = Math.sqrt(sum / channel.length);
|
|
114
|
+
this.smoothed = this.smoothed * this.smoothing + rms * (1 - this.smoothing);
|
|
115
|
+
const levelDb = 20 * Math.log10(Math.max(1e-8, this.smoothed));
|
|
116
|
+
this.port.postMessage({ levelDb });
|
|
234
117
|
return true;
|
|
235
118
|
}
|
|
236
119
|
}
|
|
237
|
-
|
|
120
|
+
|
|
121
|
+
registerProcessor('level-detector-processor', LevelDetectorProcessor);
|
|
238
122
|
`;
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
}
|
|
249
|
-
const workletCode = createEnergyVadWorkletCode(config);
|
|
250
|
-
const blob = new Blob([workletCode], {
|
|
251
|
-
type: "application/javascript"
|
|
252
|
-
});
|
|
253
|
-
const url = URL.createObjectURL(blob);
|
|
254
|
-
try {
|
|
255
|
-
await context.audioWorklet.addModule(url);
|
|
256
|
-
console.log("Energy VAD worklet loaded successfully");
|
|
257
|
-
} catch (e) {
|
|
258
|
-
const error = new Error(
|
|
259
|
-
`Failed to load Energy VAD worklet: ${e instanceof Error ? e.message : String(e)}`
|
|
260
|
-
);
|
|
261
|
-
console.error(error.message);
|
|
262
|
-
URL.revokeObjectURL(url);
|
|
263
|
-
throw error;
|
|
264
|
-
}
|
|
123
|
+
}
|
|
124
|
+
async function createLevelDetectorNode(context, onLevel, options) {
|
|
125
|
+
const smoothing = options?.smoothing ?? 0.9;
|
|
126
|
+
const workletCode = createLevelDetectorWorkletCode(smoothing);
|
|
127
|
+
const blob = new Blob([workletCode], { type: "application/javascript" });
|
|
128
|
+
const url = URL.createObjectURL(blob);
|
|
129
|
+
try {
|
|
130
|
+
await context.audioWorklet.addModule(url);
|
|
131
|
+
} finally {
|
|
265
132
|
URL.revokeObjectURL(url);
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
);
|
|
275
|
-
console.error(error.message);
|
|
276
|
-
throw error;
|
|
133
|
+
}
|
|
134
|
+
const node = new AudioWorkletNode(context, "level-detector-processor", {
|
|
135
|
+
numberOfInputs: 1,
|
|
136
|
+
numberOfOutputs: 0
|
|
137
|
+
});
|
|
138
|
+
node.port.onmessage = (event) => {
|
|
139
|
+
const { levelDb } = event.data ?? {};
|
|
140
|
+
if (typeof levelDb === "number" && !Number.isNaN(levelDb)) {
|
|
141
|
+
onLevel(levelDb);
|
|
277
142
|
}
|
|
278
|
-
|
|
143
|
+
};
|
|
144
|
+
node.port.onmessageerror = (event) => {
|
|
145
|
+
console.error("Level detector port error", event);
|
|
146
|
+
};
|
|
147
|
+
return {
|
|
148
|
+
node,
|
|
149
|
+
dispose: () => {
|
|
279
150
|
try {
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
onDecision(probability);
|
|
283
|
-
} else {
|
|
284
|
-
console.warn("Invalid VAD probability received:", event.data);
|
|
285
|
-
}
|
|
151
|
+
node.port.onmessage = null;
|
|
152
|
+
node.port.close();
|
|
286
153
|
} catch (error) {
|
|
287
|
-
console.error("
|
|
154
|
+
console.error("Failed to dispose level detector node", error);
|
|
288
155
|
}
|
|
289
|
-
};
|
|
290
|
-
node.port.onmessageerror = (event) => {
|
|
291
|
-
console.error("VAD port message error:", event);
|
|
292
|
-
};
|
|
293
|
-
return node;
|
|
294
|
-
}
|
|
295
|
-
updateSpeakingState(isSpeaking) {
|
|
296
|
-
if (this.workletNode) {
|
|
297
|
-
this.workletNode.port.postMessage({ isSpeaking });
|
|
298
156
|
}
|
|
299
|
-
}
|
|
300
|
-
};
|
|
301
|
-
|
|
302
|
-
// src/extensibility/plugins.ts
|
|
303
|
-
var nsPlugins = /* @__PURE__ */ new Map();
|
|
304
|
-
var vadPlugins = /* @__PURE__ */ new Map();
|
|
305
|
-
var defaultNs = new RNNoisePlugin();
|
|
306
|
-
nsPlugins.set(defaultNs.name, defaultNs);
|
|
307
|
-
var defaultVad = new EnergyVADPlugin();
|
|
308
|
-
vadPlugins.set(defaultVad.name, defaultVad);
|
|
309
|
-
function getNoiseSuppressionPlugin(name) {
|
|
310
|
-
if (!name) return defaultNs;
|
|
311
|
-
const plugin = nsPlugins.get(name);
|
|
312
|
-
if (!plugin) {
|
|
313
|
-
console.warn(
|
|
314
|
-
`Noise suppression plugin '${name}' not found, falling back to default.`
|
|
315
|
-
);
|
|
316
|
-
return defaultNs;
|
|
317
|
-
}
|
|
318
|
-
return plugin;
|
|
319
|
-
}
|
|
320
|
-
function getVADPlugin(name) {
|
|
321
|
-
if (!name) return defaultVad;
|
|
322
|
-
const plugin = vadPlugins.get(name);
|
|
323
|
-
if (!plugin) {
|
|
324
|
-
console.warn(`VAD plugin '${name}' not found, falling back to default.`);
|
|
325
|
-
return defaultVad;
|
|
326
|
-
}
|
|
327
|
-
return plugin;
|
|
157
|
+
};
|
|
328
158
|
}
|
|
329
159
|
|
|
330
160
|
// src/vad/vad-state.ts
|
|
331
|
-
var
|
|
161
|
+
var LevelBasedVAD = class {
|
|
332
162
|
config;
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
lastSilenceTime = 0;
|
|
337
|
-
frameDurationMs = 20;
|
|
338
|
-
// Assumed frame duration, updated by calls
|
|
163
|
+
speaking = false;
|
|
164
|
+
pendingSpeechSince = null;
|
|
165
|
+
pendingSilenceSince = null;
|
|
339
166
|
constructor(config) {
|
|
340
167
|
this.config = {
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
hangoverMs: config?.hangoverMs ?? 300,
|
|
349
|
-
// Smooth for natural speech
|
|
350
|
-
preRollMs: config?.preRollMs ?? 250,
|
|
351
|
-
// Generous pre-roll
|
|
352
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
353
|
-
// Aggressive transient rejection
|
|
354
|
-
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
355
|
-
energyVad: {
|
|
356
|
-
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
357
|
-
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
358
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
359
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
360
|
-
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
361
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
362
|
-
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
363
|
-
}
|
|
168
|
+
minDb: config.minDb,
|
|
169
|
+
maxDb: config.maxDb,
|
|
170
|
+
speakOnRatio: config.speakOnRatio ?? 0.6,
|
|
171
|
+
speakOffRatio: config.speakOffRatio ?? 0.3,
|
|
172
|
+
hangoverMs: config.hangoverMs ?? 350,
|
|
173
|
+
attackMs: config.attackMs ?? 50,
|
|
174
|
+
releaseMs: config.releaseMs ?? 120
|
|
364
175
|
};
|
|
365
|
-
this.lastSilenceTime = Date.now();
|
|
366
176
|
}
|
|
367
177
|
updateConfig(config) {
|
|
368
|
-
this.config = {
|
|
178
|
+
this.config = {
|
|
179
|
+
...this.config,
|
|
180
|
+
...config,
|
|
181
|
+
speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
|
|
182
|
+
speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
|
|
183
|
+
hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
|
|
184
|
+
attackMs: config.attackMs ?? this.config.attackMs,
|
|
185
|
+
releaseMs: config.releaseMs ?? this.config.releaseMs
|
|
186
|
+
};
|
|
369
187
|
}
|
|
370
|
-
|
|
188
|
+
process(levelDb, timestampMs) {
|
|
371
189
|
const {
|
|
372
|
-
|
|
373
|
-
|
|
190
|
+
minDb,
|
|
191
|
+
maxDb,
|
|
192
|
+
speakOnRatio,
|
|
193
|
+
speakOffRatio,
|
|
374
194
|
hangoverMs,
|
|
375
|
-
|
|
376
|
-
|
|
195
|
+
attackMs,
|
|
196
|
+
releaseMs
|
|
377
197
|
} = this.config;
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
this.
|
|
385
|
-
this.
|
|
386
|
-
|
|
387
|
-
newState = "silent";
|
|
198
|
+
const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
|
|
199
|
+
const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
|
|
200
|
+
if (!this.speaking) {
|
|
201
|
+
if (norm >= speakOnRatio) {
|
|
202
|
+
this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
|
|
203
|
+
if (timestampMs - this.pendingSpeechSince >= attackMs) {
|
|
204
|
+
this.speaking = true;
|
|
205
|
+
this.pendingSpeechSince = null;
|
|
206
|
+
this.pendingSilenceSince = null;
|
|
388
207
|
}
|
|
389
208
|
} else {
|
|
390
|
-
|
|
391
|
-
this.lastSilenceTime = timestamp;
|
|
209
|
+
this.pendingSpeechSince = null;
|
|
392
210
|
}
|
|
393
|
-
} else
|
|
394
|
-
if (
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
211
|
+
} else {
|
|
212
|
+
if (norm <= speakOffRatio) {
|
|
213
|
+
this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
|
|
214
|
+
const releaseWindow = Math.max(releaseMs, hangoverMs);
|
|
215
|
+
if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
|
|
216
|
+
this.speaking = false;
|
|
217
|
+
this.pendingSilenceSince = null;
|
|
218
|
+
this.pendingSpeechSince = null;
|
|
400
219
|
}
|
|
401
|
-
this.lastSpeechTime = timestamp;
|
|
402
220
|
} else {
|
|
403
|
-
|
|
404
|
-
this.lastSilenceTime = timestamp;
|
|
405
|
-
}
|
|
406
|
-
} else if (this.currentState === "speaking") {
|
|
407
|
-
if (probability >= stopThreshold) {
|
|
408
|
-
newState = "speaking";
|
|
409
|
-
this.lastSpeechTime = timestamp;
|
|
410
|
-
} else {
|
|
411
|
-
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
412
|
-
if (timeSinceSpeech < hangoverMs) {
|
|
413
|
-
newState = "speaking";
|
|
414
|
-
} else {
|
|
415
|
-
newState = "speech_ending";
|
|
416
|
-
this.lastSilenceTime = timestamp;
|
|
417
|
-
}
|
|
221
|
+
this.pendingSilenceSince = null;
|
|
418
222
|
}
|
|
419
223
|
}
|
|
420
|
-
if (newState === "speech_ending") newState = "silent";
|
|
421
|
-
this.currentState = newState;
|
|
422
224
|
return {
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
state: newState
|
|
225
|
+
speaking: this.speaking,
|
|
226
|
+
levelDb: clamped
|
|
426
227
|
};
|
|
427
228
|
}
|
|
428
229
|
};
|
|
@@ -431,58 +232,33 @@ var VADStateMachine = class {
|
|
|
431
232
|
async function createAudioPipeline(sourceTrack, config = {}) {
|
|
432
233
|
const context = getAudioContext();
|
|
433
234
|
registerPipeline();
|
|
434
|
-
const
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
235
|
+
const nsConfig = {
|
|
236
|
+
enabled: config.noiseSuppression?.enabled ?? true,
|
|
237
|
+
noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
|
|
238
|
+
};
|
|
239
|
+
if (config.noiseSuppression?.assetConfig) {
|
|
240
|
+
nsConfig.assetConfig = config.noiseSuppression.assetConfig;
|
|
241
|
+
}
|
|
438
242
|
const fullConfig = {
|
|
439
|
-
noiseSuppression:
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
hangoverMs: 400,
|
|
449
|
-
preRollMs: 250,
|
|
450
|
-
minSpeechDurationMs: 100,
|
|
451
|
-
minSilenceDurationMs: 150,
|
|
452
|
-
energyVad: {
|
|
453
|
-
smoothing: 0.95,
|
|
454
|
-
initialNoiseFloor: 1e-3,
|
|
455
|
-
noiseFloorAdaptRateQuiet: 0.01,
|
|
456
|
-
noiseFloorAdaptRateLoud: 1e-3,
|
|
457
|
-
minSNR: 2,
|
|
458
|
-
snrRange: 8
|
|
459
|
-
},
|
|
460
|
-
...config.vad
|
|
243
|
+
noiseSuppression: nsConfig,
|
|
244
|
+
speaking: {
|
|
245
|
+
minDb: config.speaking?.minDb ?? -60,
|
|
246
|
+
maxDb: config.speaking?.maxDb ?? -20,
|
|
247
|
+
speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
|
|
248
|
+
speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
|
|
249
|
+
hangoverMs: config.speaking?.hangoverMs ?? 350,
|
|
250
|
+
attackMs: config.speaking?.attackMs ?? 50,
|
|
251
|
+
releaseMs: config.speaking?.releaseMs ?? 120
|
|
461
252
|
},
|
|
462
253
|
output: {
|
|
463
|
-
speechGain: 1,
|
|
464
|
-
silenceGain: 0,
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
smoothTransitions: true,
|
|
469
|
-
maxGainDb: 6,
|
|
470
|
-
enableCompression: false,
|
|
471
|
-
compression: {
|
|
472
|
-
threshold: -24,
|
|
473
|
-
ratio: 3,
|
|
474
|
-
attack: 3e-3,
|
|
475
|
-
release: 0.05
|
|
476
|
-
},
|
|
477
|
-
...config.output
|
|
254
|
+
speechGain: config.output?.speechGain ?? 1,
|
|
255
|
+
silenceGain: config.output?.silenceGain ?? 0,
|
|
256
|
+
gainRampTime: config.output?.gainRampTime ?? 0.015,
|
|
257
|
+
maxGainDb: config.output?.maxGainDb ?? 6,
|
|
258
|
+
smoothTransitions: config.output?.smoothTransitions ?? true
|
|
478
259
|
},
|
|
479
|
-
|
|
260
|
+
muteWhenSilent: config.muteWhenSilent ?? false
|
|
480
261
|
};
|
|
481
|
-
console.log("Audio pipeline config:", {
|
|
482
|
-
noiseSuppression: fullConfig.noiseSuppression?.enabled,
|
|
483
|
-
vad: fullConfig.vad?.enabled,
|
|
484
|
-
output: fullConfig.output
|
|
485
|
-
});
|
|
486
262
|
if (!sourceTrack || sourceTrack.kind !== "audio") {
|
|
487
263
|
throw new Error(
|
|
488
264
|
"createAudioPipeline requires a valid audio MediaStreamTrack"
|
|
@@ -493,318 +269,196 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
493
269
|
}
|
|
494
270
|
const sourceStream = new MediaStream([sourceTrack]);
|
|
495
271
|
const sourceNode = context.createMediaStreamSource(sourceStream);
|
|
496
|
-
let nsNode;
|
|
497
|
-
let vadNode;
|
|
498
272
|
const emitter = (0, import_mitt.default)();
|
|
499
|
-
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
try {
|
|
516
|
-
const timestamp = context.currentTime * 1e3;
|
|
517
|
-
const newState = vadStateMachine.processFrame(prob, timestamp);
|
|
518
|
-
if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
|
|
519
|
-
vadPlugin.updateSpeakingState(newState.isSpeaking);
|
|
520
|
-
}
|
|
521
|
-
if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
|
|
522
|
-
emitter.emit("vadChange", newState);
|
|
523
|
-
lastVadState = newState;
|
|
524
|
-
updateGain(newState);
|
|
525
|
-
}
|
|
526
|
-
} catch (vadError) {
|
|
527
|
-
const err = vadError instanceof Error ? vadError : new Error(String(vadError));
|
|
528
|
-
console.error("Error in VAD callback:", err);
|
|
529
|
-
emitter.emit("error", err);
|
|
273
|
+
const vad = new LevelBasedVAD(fullConfig.speaking);
|
|
274
|
+
let lastState = { speaking: false, levelDb: -Infinity };
|
|
275
|
+
const nsHandle = await createDeepFilterNet3Node(
|
|
276
|
+
context,
|
|
277
|
+
fullConfig.noiseSuppression
|
|
278
|
+
);
|
|
279
|
+
const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
|
|
280
|
+
try {
|
|
281
|
+
const timestamp = context.currentTime * 1e3;
|
|
282
|
+
const nextState = vad.process(levelDb, timestamp);
|
|
283
|
+
const speakingChanged = nextState.speaking !== lastState.speaking;
|
|
284
|
+
const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
|
|
285
|
+
if (speakingChanged || levelChanged) {
|
|
286
|
+
lastState = nextState;
|
|
287
|
+
updateGain(nextState);
|
|
288
|
+
emitter.emit("speakingChange", nextState);
|
|
530
289
|
}
|
|
531
|
-
})
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
throw err;
|
|
537
|
-
}
|
|
538
|
-
let lastVadState = {
|
|
539
|
-
isSpeaking: false,
|
|
540
|
-
probability: 0,
|
|
541
|
-
state: "silent"
|
|
542
|
-
};
|
|
290
|
+
} catch (error) {
|
|
291
|
+
const err = error instanceof Error ? error : new Error(String(error));
|
|
292
|
+
emitter.emit("error", err);
|
|
293
|
+
}
|
|
294
|
+
});
|
|
543
295
|
const splitter = context.createGain();
|
|
544
|
-
sourceNode.connect(
|
|
545
|
-
|
|
546
|
-
splitter.connect(
|
|
547
|
-
const delayNode = context.createDelay(1);
|
|
548
|
-
const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
|
|
549
|
-
delayNode.delayTime.value = preRollSeconds;
|
|
296
|
+
sourceNode.connect(nsHandle.node);
|
|
297
|
+
nsHandle.node.connect(splitter);
|
|
298
|
+
splitter.connect(levelHandle.node);
|
|
550
299
|
const gainNode = context.createGain();
|
|
551
300
|
gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
|
|
552
|
-
|
|
553
|
-
if (fullConfig.output?.enableCompression) {
|
|
554
|
-
compressor = context.createDynamicsCompressor();
|
|
555
|
-
const comp = fullConfig.output.compression;
|
|
556
|
-
compressor.threshold.value = comp.threshold ?? -24;
|
|
557
|
-
compressor.ratio.value = comp.ratio ?? 3;
|
|
558
|
-
compressor.attack.value = comp.attack ?? 3e-3;
|
|
559
|
-
compressor.release.value = comp.release ?? 0.05;
|
|
560
|
-
compressor.knee.value = 10;
|
|
561
|
-
}
|
|
301
|
+
splitter.connect(gainNode);
|
|
562
302
|
const destination = context.createMediaStreamDestination();
|
|
563
|
-
|
|
564
|
-
splitter.connect(delayNode);
|
|
565
|
-
delayNode.connect(gainNode);
|
|
566
|
-
if (compressor) {
|
|
567
|
-
gainNode.connect(compressor);
|
|
568
|
-
compressor.connect(destination);
|
|
569
|
-
console.log("Compression enabled:", fullConfig.output?.compression);
|
|
570
|
-
} else {
|
|
571
|
-
gainNode.connect(destination);
|
|
572
|
-
}
|
|
573
|
-
} catch (error) {
|
|
574
|
-
const err = error instanceof Error ? error : new Error(String(error));
|
|
575
|
-
console.error("Failed to wire audio pipeline:", err);
|
|
576
|
-
emitter.emit("error", err);
|
|
577
|
-
throw err;
|
|
578
|
-
}
|
|
303
|
+
gainNode.connect(destination);
|
|
579
304
|
function updateGain(state) {
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
gainNode.gain.setValueAtTime(targetGain, now);
|
|
598
|
-
}
|
|
599
|
-
} catch (error) {
|
|
600
|
-
const err = error instanceof Error ? error : new Error(String(error));
|
|
601
|
-
console.error("Failed to update gain:", err);
|
|
602
|
-
emitter.emit("error", err);
|
|
305
|
+
const {
|
|
306
|
+
speechGain = 1,
|
|
307
|
+
silenceGain = 0,
|
|
308
|
+
gainRampTime = 0.015,
|
|
309
|
+
smoothTransitions = true,
|
|
310
|
+
maxGainDb = 6
|
|
311
|
+
} = fullConfig.output ?? {};
|
|
312
|
+
const maxGainLinear = Math.pow(10, maxGainDb / 20);
|
|
313
|
+
const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
|
|
314
|
+
const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
|
|
315
|
+
const now = context.currentTime;
|
|
316
|
+
gainNode.gain.cancelScheduledValues(now);
|
|
317
|
+
gainNode.gain.setValueAtTime(gainNode.gain.value, now);
|
|
318
|
+
if (smoothTransitions) {
|
|
319
|
+
gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
|
|
320
|
+
} else {
|
|
321
|
+
gainNode.gain.setValueAtTime(target, now);
|
|
603
322
|
}
|
|
604
323
|
}
|
|
605
324
|
const audioTracks = destination.stream.getAudioTracks();
|
|
606
|
-
console.log("Destination stream tracks:", {
|
|
607
|
-
count: audioTracks.length,
|
|
608
|
-
tracks: audioTracks.map((t) => ({
|
|
609
|
-
id: t.id,
|
|
610
|
-
label: t.label,
|
|
611
|
-
enabled: t.enabled,
|
|
612
|
-
readyState: t.readyState
|
|
613
|
-
}))
|
|
614
|
-
});
|
|
615
325
|
if (audioTracks.length === 0) {
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
);
|
|
619
|
-
|
|
620
|
-
emitter.emit("error", err);
|
|
621
|
-
throw err;
|
|
326
|
+
nsHandle.dispose();
|
|
327
|
+
levelHandle.dispose();
|
|
328
|
+
unregisterPipeline();
|
|
329
|
+
throw new Error("Failed to create processed audio track");
|
|
622
330
|
}
|
|
623
331
|
const processedTrack = audioTracks[0];
|
|
624
|
-
if (!processedTrack || processedTrack.readyState === "ended") {
|
|
625
|
-
const err = new Error("Processed audio track is invalid or ended");
|
|
626
|
-
console.error(err);
|
|
627
|
-
emitter.emit("error", err);
|
|
628
|
-
throw err;
|
|
629
|
-
}
|
|
630
|
-
console.log("Audio pipeline created successfully:", {
|
|
631
|
-
sourceTrack: {
|
|
632
|
-
id: sourceTrack.id,
|
|
633
|
-
label: sourceTrack.label,
|
|
634
|
-
readyState: sourceTrack.readyState
|
|
635
|
-
},
|
|
636
|
-
processedTrack: {
|
|
637
|
-
id: processedTrack.id,
|
|
638
|
-
label: processedTrack.label,
|
|
639
|
-
readyState: processedTrack.readyState
|
|
640
|
-
},
|
|
641
|
-
config: {
|
|
642
|
-
noiseSuppression: fullConfig.noiseSuppression?.enabled,
|
|
643
|
-
vad: fullConfig.vad?.enabled
|
|
644
|
-
}
|
|
645
|
-
});
|
|
646
332
|
function dispose() {
|
|
647
333
|
try {
|
|
648
334
|
sourceNode.disconnect();
|
|
649
|
-
|
|
335
|
+
nsHandle.node.disconnect();
|
|
650
336
|
splitter.disconnect();
|
|
651
|
-
|
|
652
|
-
delayNode.disconnect();
|
|
337
|
+
levelHandle.node.disconnect();
|
|
653
338
|
gainNode.disconnect();
|
|
654
|
-
if (compressor) {
|
|
655
|
-
compressor.disconnect();
|
|
656
|
-
}
|
|
657
339
|
destination.stream.getTracks().forEach((t) => t.stop());
|
|
658
|
-
|
|
340
|
+
levelHandle.dispose();
|
|
341
|
+
nsHandle.dispose();
|
|
659
342
|
} catch (error) {
|
|
660
|
-
console.error("Error during pipeline disposal
|
|
343
|
+
console.error("Error during pipeline disposal", error);
|
|
344
|
+
} finally {
|
|
345
|
+
unregisterPipeline();
|
|
661
346
|
}
|
|
662
347
|
}
|
|
663
|
-
|
|
348
|
+
const handle = {
|
|
664
349
|
processedTrack,
|
|
665
350
|
events: emitter,
|
|
666
351
|
get state() {
|
|
667
|
-
return
|
|
352
|
+
return lastState;
|
|
668
353
|
},
|
|
669
|
-
setConfig: (
|
|
354
|
+
setConfig: (next) => {
|
|
670
355
|
try {
|
|
671
|
-
if (
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
context.currentTime
|
|
679
|
-
);
|
|
680
|
-
}
|
|
356
|
+
if (next.speaking) {
|
|
357
|
+
vad.updateConfig(next.speaking);
|
|
358
|
+
fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
|
|
359
|
+
}
|
|
360
|
+
if (next.output) {
|
|
361
|
+
fullConfig.output = { ...fullConfig.output, ...next.output };
|
|
362
|
+
updateGain(lastState);
|
|
681
363
|
}
|
|
682
|
-
if (
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
if (comp.ratio !== void 0) {
|
|
694
|
-
compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
|
|
695
|
-
}
|
|
696
|
-
if (comp.attack !== void 0) {
|
|
697
|
-
compressor.attack.setValueAtTime(
|
|
698
|
-
comp.attack,
|
|
699
|
-
context.currentTime
|
|
700
|
-
);
|
|
701
|
-
}
|
|
702
|
-
if (comp.release !== void 0) {
|
|
703
|
-
compressor.release.setValueAtTime(
|
|
704
|
-
comp.release,
|
|
705
|
-
context.currentTime
|
|
706
|
-
);
|
|
707
|
-
}
|
|
364
|
+
if (next.noiseSuppression) {
|
|
365
|
+
const ns = next.noiseSuppression;
|
|
366
|
+
fullConfig.noiseSuppression = {
|
|
367
|
+
...fullConfig.noiseSuppression,
|
|
368
|
+
...ns
|
|
369
|
+
};
|
|
370
|
+
if (typeof ns.noiseReductionLevel === "number") {
|
|
371
|
+
nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
|
|
372
|
+
}
|
|
373
|
+
if (typeof ns.enabled === "boolean") {
|
|
374
|
+
nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
|
|
708
375
|
}
|
|
709
376
|
}
|
|
710
|
-
if (
|
|
711
|
-
|
|
377
|
+
if (typeof next.muteWhenSilent === "boolean") {
|
|
378
|
+
fullConfig.muteWhenSilent = next.muteWhenSilent;
|
|
712
379
|
}
|
|
713
|
-
console.log("Pipeline config updated:", newConfig);
|
|
714
380
|
} catch (error) {
|
|
715
381
|
const err = error instanceof Error ? error : new Error(String(error));
|
|
716
|
-
console.error("Failed to update config:", err);
|
|
717
382
|
emitter.emit("error", err);
|
|
718
383
|
}
|
|
719
384
|
},
|
|
720
385
|
dispose
|
|
721
386
|
};
|
|
387
|
+
return handle;
|
|
722
388
|
}
|
|
723
389
|
|
|
724
390
|
// src/livekit/integration.ts
|
|
725
|
-
async function
|
|
391
|
+
async function attachSpeakingDetectionToTrack(track, options = {}) {
|
|
726
392
|
if (!track) {
|
|
727
|
-
throw new Error(
|
|
728
|
-
|
|
729
|
-
const originalTrack = track.mediaStreamTrack;
|
|
730
|
-
if (!originalTrack) {
|
|
731
|
-
throw new Error("LocalAudioTrack has no underlying MediaStreamTrack");
|
|
732
|
-
}
|
|
733
|
-
if (originalTrack.readyState === "ended") {
|
|
734
|
-
throw new Error("Cannot attach processing to an ended MediaStreamTrack");
|
|
735
|
-
}
|
|
736
|
-
let pipeline;
|
|
737
|
-
try {
|
|
738
|
-
console.log("Creating audio processing pipeline...");
|
|
739
|
-
pipeline = await createAudioPipeline(originalTrack, config);
|
|
740
|
-
console.log("Audio processing pipeline created successfully");
|
|
741
|
-
} catch (error) {
|
|
742
|
-
const err = new Error(
|
|
743
|
-
`Failed to create audio pipeline: ${error instanceof Error ? error.message : String(error)}`
|
|
393
|
+
throw new Error(
|
|
394
|
+
"attachSpeakingDetectionToTrack requires a valid LocalAudioTrack"
|
|
744
395
|
);
|
|
745
|
-
console.error(err);
|
|
746
|
-
throw err;
|
|
747
|
-
}
|
|
748
|
-
if (!pipeline.processedTrack) {
|
|
749
|
-
throw new Error("Pipeline did not return a processed track");
|
|
750
396
|
}
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
console.log("LiveKit track replaced successfully");
|
|
755
|
-
} catch (error) {
|
|
756
|
-
pipeline.dispose();
|
|
757
|
-
const err = new Error(
|
|
758
|
-
`Failed to replace LiveKit track: ${error instanceof Error ? error.message : String(error)}`
|
|
759
|
-
);
|
|
760
|
-
console.error(err);
|
|
761
|
-
throw err;
|
|
397
|
+
const originalTrack = track.mediaStreamTrack;
|
|
398
|
+
if (!originalTrack || originalTrack.readyState === "ended") {
|
|
399
|
+
throw new Error("LocalAudioTrack has no live MediaStreamTrack to process");
|
|
762
400
|
}
|
|
763
|
-
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
isVadMuted = true;
|
|
776
|
-
}
|
|
777
|
-
}
|
|
778
|
-
} catch (error) {
|
|
779
|
-
console.error("Error handling VAD-based track muting:", error);
|
|
401
|
+
const pipeline = await createAudioPipeline(originalTrack, options);
|
|
402
|
+
await track.replaceTrack(pipeline.processedTrack);
|
|
403
|
+
const listeners = /* @__PURE__ */ new Set();
|
|
404
|
+
let mutedByController = false;
|
|
405
|
+
let currentState = pipeline.state;
|
|
406
|
+
const speakingHandler = (state) => {
|
|
407
|
+
currentState = state;
|
|
408
|
+
listeners.forEach((listener) => listener(state));
|
|
409
|
+
if (options.muteWhenSilent) {
|
|
410
|
+
if (!state.speaking && !track.isMuted) {
|
|
411
|
+
track.mute().catch((error) => console.error("mute failed", error));
|
|
412
|
+
mutedByController = true;
|
|
780
413
|
}
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
pipeline.
|
|
788
|
-
|
|
414
|
+
if (state.speaking && mutedByController) {
|
|
415
|
+
track.unmute().catch((error) => console.error("unmute failed", error));
|
|
416
|
+
mutedByController = false;
|
|
417
|
+
}
|
|
418
|
+
}
|
|
419
|
+
};
|
|
420
|
+
pipeline.events.on("speakingChange", speakingHandler);
|
|
421
|
+
const errorHandler = (error) => {
|
|
422
|
+
console.error("Audio pipeline error", error);
|
|
423
|
+
};
|
|
424
|
+
pipeline.events.on("error", errorHandler);
|
|
425
|
+
const controller = {
|
|
426
|
+
get speaking() {
|
|
427
|
+
return currentState.speaking;
|
|
428
|
+
},
|
|
429
|
+
get levelDb() {
|
|
430
|
+
return currentState.levelDb;
|
|
431
|
+
},
|
|
432
|
+
onChange: (listener) => {
|
|
433
|
+
listeners.add(listener);
|
|
434
|
+
listener(currentState);
|
|
435
|
+
return () => listeners.delete(listener);
|
|
436
|
+
},
|
|
437
|
+
setConfig: (config) => {
|
|
438
|
+
pipeline.setConfig(config);
|
|
439
|
+
if (typeof config.muteWhenSilent === "boolean") {
|
|
440
|
+
options.muteWhenSilent = config.muteWhenSilent;
|
|
441
|
+
}
|
|
442
|
+
},
|
|
443
|
+
dispose: () => {
|
|
444
|
+
pipeline.events.off("speakingChange", speakingHandler);
|
|
445
|
+
pipeline.events.off("error", errorHandler);
|
|
446
|
+
listeners.clear();
|
|
447
|
+
if (mutedByController && !track.isMuted) {
|
|
448
|
+
track.unmute().catch((error) => console.error("unmute failed", error));
|
|
449
|
+
mutedByController = false;
|
|
450
|
+
}
|
|
451
|
+
pipeline.dispose();
|
|
789
452
|
if (originalTrack.readyState === "live") {
|
|
790
|
-
console.log("Restoring original track...");
|
|
791
453
|
track.replaceTrack(originalTrack).catch((error) => {
|
|
792
|
-
console.error("Failed to restore original track
|
|
454
|
+
console.error("Failed to restore original track", error);
|
|
793
455
|
});
|
|
794
456
|
}
|
|
795
|
-
originalDispose();
|
|
796
|
-
} catch (error) {
|
|
797
|
-
console.error("Error during pipeline disposal:", error);
|
|
798
|
-
try {
|
|
799
|
-
originalDispose();
|
|
800
|
-
} catch (disposeError) {
|
|
801
|
-
console.error("Error calling original dispose:", disposeError);
|
|
802
|
-
}
|
|
803
457
|
}
|
|
804
458
|
};
|
|
805
|
-
return
|
|
459
|
+
return controller;
|
|
806
460
|
}
|
|
807
461
|
// Annotate the CommonJS export names for ESM import in node:
|
|
808
462
|
0 && (module.exports = {
|
|
809
|
-
|
|
463
|
+
attachSpeakingDetectionToTrack
|
|
810
464
|
});
|