@tensamin/audio 0.1.14 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +48 -231
  2. package/dist/chunk-6BJ4XGSA.mjs +80 -0
  3. package/dist/chunk-AQ5RVY33.mjs +74 -0
  4. package/dist/chunk-IS37FHDN.mjs +33 -0
  5. package/dist/chunk-K4J3UUOR.mjs +178 -0
  6. package/dist/chunk-QNQK6QFB.mjs +71 -0
  7. package/dist/context/audio-context.d.mts +0 -24
  8. package/dist/context/audio-context.d.ts +0 -24
  9. package/dist/index.d.mts +2 -8
  10. package/dist/index.d.ts +2 -8
  11. package/dist/index.js +285 -680
  12. package/dist/index.mjs +8 -43
  13. package/dist/livekit/integration.d.mts +3 -7
  14. package/dist/livekit/integration.d.ts +3 -7
  15. package/dist/livekit/integration.js +280 -626
  16. package/dist/livekit/integration.mjs +7 -8
  17. package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
  18. package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
  19. package/dist/noise-suppression/deepfilternet-node.js +57 -0
  20. package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
  21. package/dist/pipeline/audio-pipeline.d.mts +2 -2
  22. package/dist/pipeline/audio-pipeline.d.ts +2 -2
  23. package/dist/pipeline/audio-pipeline.js +219 -554
  24. package/dist/pipeline/audio-pipeline.mjs +4 -5
  25. package/dist/types.d.mts +42 -257
  26. package/dist/types.d.ts +42 -257
  27. package/dist/vad/vad-node.d.mts +7 -9
  28. package/dist/vad/vad-node.d.ts +7 -9
  29. package/dist/vad/vad-node.js +47 -156
  30. package/dist/vad/vad-node.mjs +3 -3
  31. package/dist/vad/vad-state.d.mts +9 -11
  32. package/dist/vad/vad-state.d.ts +9 -11
  33. package/dist/vad/vad-state.js +50 -79
  34. package/dist/vad/vad-state.mjs +3 -3
  35. package/package.json +21 -21
  36. package/dist/chunk-2G2JFHJY.mjs +0 -180
  37. package/dist/chunk-6F2HZUYO.mjs +0 -91
  38. package/dist/chunk-K4YLH73B.mjs +0 -103
  39. package/dist/chunk-R5M2DGAQ.mjs +0 -311
  40. package/dist/chunk-UFKIAMG3.mjs +0 -47
  41. package/dist/chunk-XO6B3D4A.mjs +0 -67
  42. package/dist/extensibility/plugins.d.mts +0 -9
  43. package/dist/extensibility/plugins.d.ts +0 -9
  44. package/dist/extensibility/plugins.js +0 -320
  45. package/dist/extensibility/plugins.mjs +0 -14
  46. package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
  47. package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
  48. package/dist/noise-suppression/rnnoise-node.js +0 -101
  49. package/dist/noise-suppression/rnnoise-node.mjs +0 -6
@@ -30,9 +30,10 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/livekit/integration.ts
31
31
  var integration_exports = {};
32
32
  __export(integration_exports, {
33
- attachProcessingToTrack: () => attachProcessingToTrack
33
+ attachSpeakingDetectionToTrack: () => attachSpeakingDetectionToTrack
34
34
  });
35
35
  module.exports = __toCommonJS(integration_exports);
36
+ var import_mitt2 = require("mitt");
36
37
 
37
38
  // src/pipeline/audio-pipeline.ts
38
39
  var import_mitt = __toESM(require("mitt"));
@@ -58,371 +59,171 @@ function unregisterPipeline() {
58
59
  activePipelines = Math.max(0, activePipelines - 1);
59
60
  }
60
61
 
61
- // src/noise-suppression/rnnoise-node.ts
62
- var RNNoisePlugin = class {
63
- name = "rnnoise-ns";
64
- wasmBuffer = null;
65
- async createNode(context, config) {
66
- const { loadRnnoise, RnnoiseWorkletNode } = await import("@sapphi-red/web-noise-suppressor");
67
- if (!config?.enabled) {
68
- console.log("Noise suppression disabled, using passthrough node");
69
- const pass = context.createGain();
70
- return pass;
71
- }
72
- if (!config?.wasmUrl || !config?.simdUrl || !config?.workletUrl) {
73
- const error = new Error(
74
- `RNNoisePlugin requires 'wasmUrl', 'simdUrl', and 'workletUrl' to be configured. Please download the assets from @sapphi-red/web-noise-suppressor and provide the URLs in the config. Current config: wasmUrl=${config?.wasmUrl}, simdUrl=${config?.simdUrl}, workletUrl=${config?.workletUrl}
75
- To disable noise suppression, set noiseSuppression.enabled to false.`
76
- );
77
- console.error(error.message);
78
- throw error;
79
- }
80
- try {
81
- if (!this.wasmBuffer) {
82
- console.log("Loading RNNoise WASM binary...");
83
- this.wasmBuffer = await loadRnnoise({
84
- url: config.wasmUrl,
85
- simdUrl: config.simdUrl
86
- });
87
- console.log("RNNoise WASM loaded successfully");
62
+ // src/noise-suppression/deepfilternet-node.ts
63
+ var import_deepfilternet3_noise_filter = require("deepfilternet3-noise-filter");
64
+ async function createDeepFilterNet3Node(context, config) {
65
+ const processorConfig = {
66
+ sampleRate: context.sampleRate,
67
+ noiseReductionLevel: config?.noiseReductionLevel ?? 60
68
+ };
69
+ if (config?.assetConfig) {
70
+ processorConfig.assetConfig = config.assetConfig;
71
+ }
72
+ const processor = new import_deepfilternet3_noise_filter.DeepFilterNet3Processor(processorConfig);
73
+ await processor.initialize();
74
+ const node = await processor.createAudioWorkletNode(context);
75
+ const enabled = config?.enabled ?? true;
76
+ if (!enabled) {
77
+ processor.setNoiseSuppressionEnabled(false);
78
+ }
79
+ return {
80
+ node,
81
+ processor,
82
+ dispose: () => {
83
+ try {
84
+ processor.destroy();
85
+ } catch (error) {
86
+ console.error("Failed to dispose DeepFilterNet3 processor", error);
88
87
  }
89
- } catch (error) {
90
- const err = new Error(
91
- `Failed to load RNNoise WASM binary: ${error instanceof Error ? error.message : String(error)}`
92
- );
93
- console.error(err);
94
- throw err;
95
- }
96
- const workletUrl = config.workletUrl;
97
- try {
98
- await context.audioWorklet.addModule(workletUrl);
99
- console.log("RNNoise worklet loaded successfully");
100
- } catch (e) {
101
- const error = new Error(
102
- `Failed to load RNNoise worklet from ${workletUrl}: ${e instanceof Error ? e.message : String(e)}. Ensure the workletUrl points to a valid RNNoise worklet script.`
103
- );
104
- console.error(error.message);
105
- throw error;
106
88
  }
107
- try {
108
- const node = new RnnoiseWorkletNode(context, {
109
- wasmBinary: this.wasmBuffer,
110
- maxChannels: 1
111
- // Mono for now
112
- });
113
- console.log("RNNoise worklet node created successfully");
114
- return node;
115
- } catch (error) {
116
- const err = new Error(
117
- `Failed to create RNNoise worklet node: ${error instanceof Error ? error.message : String(error)}`
118
- );
119
- console.error(err);
120
- throw err;
121
- }
122
- }
123
- };
89
+ };
90
+ }
124
91
 
125
92
  // src/vad/vad-node.ts
126
- var createEnergyVadWorkletCode = (vadConfig) => {
127
- const energyParams = vadConfig?.energyVad || {};
128
- const smoothing = energyParams.smoothing ?? 0.95;
129
- const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
130
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
131
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
132
- const minSNR = energyParams.minSNR ?? 12;
133
- const snrRange = energyParams.snrRange ?? 10;
134
- const minEnergy = energyParams.minEnergy ?? 3e-3;
93
+ function createLevelDetectorWorkletCode(smoothing) {
135
94
  return `
136
- class EnergyVadProcessor extends AudioWorkletProcessor {
95
+ class LevelDetectorProcessor extends AudioWorkletProcessor {
137
96
  constructor() {
138
97
  super();
98
+ this.smoothed = 0;
139
99
  this.smoothing = ${smoothing};
140
- this.energy = 0;
141
- this.noiseFloor = ${initialNoiseFloor};
142
- this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
143
- this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
144
- this.minSNR = ${minSNR};
145
- this.snrRange = ${snrRange};
146
- this.minEnergy = ${minEnergy};
147
- this.isSpeaking = false;
148
-
149
- this.port.onmessage = (event) => {
150
- if (event.data && event.data.isSpeaking !== undefined) {
151
- this.isSpeaking = event.data.isSpeaking;
152
- }
153
- };
154
100
  }
155
101
 
156
- process(inputs, outputs, parameters) {
102
+ process(inputs) {
157
103
  const input = inputs[0];
158
- if (!input || !input.length) return true;
104
+ if (!input || input.length === 0) return true;
159
105
  const channel = input[0];
160
-
161
- // Calculate instantaneous RMS (Root Mean Square) energy
106
+ if (!channel || channel.length === 0) return true;
107
+
162
108
  let sum = 0;
163
- let peak = 0;
164
109
  for (let i = 0; i < channel.length; i++) {
165
- const sample = Math.abs(channel[i]);
166
- sum += channel[i] * channel[i];
167
- peak = Math.max(peak, sample);
110
+ const sample = channel[i];
111
+ sum += sample * sample;
168
112
  }
169
- const instantRms = Math.sqrt(sum / channel.length);
170
-
171
- // Smooth the RMS energy to reduce jitter
172
- // this.energy acts as the smoothed RMS value
173
- this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
174
-
175
- // Calculate Crest Factor (peak-to-RMS ratio)
176
- // Voice typically has crest factor of 2-4 (6-12dB)
177
- // Keyboard clicks have crest factor of 10-30+ (20-30dB)
178
- const crestFactor = peak / (instantRms + 1e-10);
179
- const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
180
-
181
- // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
182
- // This prevents sharp transients from affecting the noise floor
183
- if (this.energy < this.noiseFloor) {
184
- // Signal is quieter than noise floor, adapt downwards slowly
185
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
186
- } else {
187
- // Calculate SNR based on smoothed energy
188
- const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
189
- const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
190
-
191
- // Only adapt upwards if:
192
- // 1. SNR is low (< 10dB) - likely just background noise
193
- // 2. AND crest factor is low (< 15dB) - not a sharp transient
194
- if (smoothedSnrDb < 10 && crestFactorDb < 15) {
195
- // This is persistent background noise, adapt upwards
196
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
197
- } else {
198
- // Either high SNR (speech) or high crest factor (click) - adapt very slowly
199
- const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
200
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
201
- }
202
- }
203
-
204
- // Ensure noise floor doesn't drop to absolute zero
205
- this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
206
-
207
- // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
208
- const snr = this.energy / (this.noiseFloor + 1e-6);
209
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
210
-
211
- // Map SNR dB to probability (0-1)
212
- // Probability is 0 when snrDb <= minSNR
213
- // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
214
- let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
215
-
216
- // Apply absolute energy threshold with soft knee
217
- if (this.energy < this.minEnergy) {
218
- const energyRatio = this.energy / (this.minEnergy + 1e-6);
219
- probability *= Math.pow(energyRatio, 2);
220
- }
221
-
222
- // Apply crest factor penalty
223
- // Reject signals with high crest factor (sharp transients like keyboard clicks)
224
- // Voice: 6-12dB, Keyboard: 20-30dB
225
- // We penalize anything above 14dB
226
- if (crestFactorDb > 14) {
227
- const excess = crestFactorDb - 14;
228
- const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
229
- probability *= penalty;
230
- }
231
-
232
- this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
233
-
113
+ const rms = Math.sqrt(sum / channel.length);
114
+ this.smoothed = this.smoothed * this.smoothing + rms * (1 - this.smoothing);
115
+ const levelDb = 20 * Math.log10(Math.max(1e-8, this.smoothed));
116
+ this.port.postMessage({ levelDb });
234
117
  return true;
235
118
  }
236
119
  }
237
- registerProcessor('energy-vad-processor', EnergyVadProcessor);
120
+
121
+ registerProcessor('level-detector-processor', LevelDetectorProcessor);
238
122
  `;
239
- };
240
- var EnergyVADPlugin = class {
241
- name = "energy-vad";
242
- workletNode = null;
243
- async createNode(context, config, onDecision) {
244
- if (!config?.enabled) {
245
- console.log("VAD disabled, using passthrough node");
246
- const pass = context.createGain();
247
- return pass;
248
- }
249
- const workletCode = createEnergyVadWorkletCode(config);
250
- const blob = new Blob([workletCode], {
251
- type: "application/javascript"
252
- });
253
- const url = URL.createObjectURL(blob);
254
- try {
255
- await context.audioWorklet.addModule(url);
256
- console.log("Energy VAD worklet loaded successfully");
257
- } catch (e) {
258
- const error = new Error(
259
- `Failed to load Energy VAD worklet: ${e instanceof Error ? e.message : String(e)}`
260
- );
261
- console.error(error.message);
262
- URL.revokeObjectURL(url);
263
- throw error;
264
- }
123
+ }
124
+ async function createLevelDetectorNode(context, onLevel, options) {
125
+ const smoothing = options?.smoothing ?? 0.9;
126
+ const workletCode = createLevelDetectorWorkletCode(smoothing);
127
+ const blob = new Blob([workletCode], { type: "application/javascript" });
128
+ const url = URL.createObjectURL(blob);
129
+ try {
130
+ await context.audioWorklet.addModule(url);
131
+ } finally {
265
132
  URL.revokeObjectURL(url);
266
- let node;
267
- try {
268
- node = new AudioWorkletNode(context, "energy-vad-processor");
269
- this.workletNode = node;
270
- console.log("Energy VAD node created successfully");
271
- } catch (e) {
272
- const error = new Error(
273
- `Failed to create Energy VAD node: ${e instanceof Error ? e.message : String(e)}`
274
- );
275
- console.error(error.message);
276
- throw error;
133
+ }
134
+ const node = new AudioWorkletNode(context, "level-detector-processor", {
135
+ numberOfInputs: 1,
136
+ numberOfOutputs: 0
137
+ });
138
+ node.port.onmessage = (event) => {
139
+ const { levelDb } = event.data ?? {};
140
+ if (typeof levelDb === "number" && !Number.isNaN(levelDb)) {
141
+ onLevel(levelDb);
277
142
  }
278
- node.port.onmessage = (event) => {
143
+ };
144
+ node.port.onmessageerror = (event) => {
145
+ console.error("Level detector port error", event);
146
+ };
147
+ return {
148
+ node,
149
+ dispose: () => {
279
150
  try {
280
- const { probability } = event.data;
281
- if (typeof probability === "number" && !isNaN(probability)) {
282
- onDecision(probability);
283
- } else {
284
- console.warn("Invalid VAD probability received:", event.data);
285
- }
151
+ node.port.onmessage = null;
152
+ node.port.close();
286
153
  } catch (error) {
287
- console.error("Error in VAD message handler:", error);
154
+ console.error("Failed to dispose level detector node", error);
288
155
  }
289
- };
290
- node.port.onmessageerror = (event) => {
291
- console.error("VAD port message error:", event);
292
- };
293
- return node;
294
- }
295
- updateSpeakingState(isSpeaking) {
296
- if (this.workletNode) {
297
- this.workletNode.port.postMessage({ isSpeaking });
298
156
  }
299
- }
300
- };
301
-
302
- // src/extensibility/plugins.ts
303
- var nsPlugins = /* @__PURE__ */ new Map();
304
- var vadPlugins = /* @__PURE__ */ new Map();
305
- var defaultNs = new RNNoisePlugin();
306
- nsPlugins.set(defaultNs.name, defaultNs);
307
- var defaultVad = new EnergyVADPlugin();
308
- vadPlugins.set(defaultVad.name, defaultVad);
309
- function getNoiseSuppressionPlugin(name) {
310
- if (!name) return defaultNs;
311
- const plugin = nsPlugins.get(name);
312
- if (!plugin) {
313
- console.warn(
314
- `Noise suppression plugin '${name}' not found, falling back to default.`
315
- );
316
- return defaultNs;
317
- }
318
- return plugin;
319
- }
320
- function getVADPlugin(name) {
321
- if (!name) return defaultVad;
322
- const plugin = vadPlugins.get(name);
323
- if (!plugin) {
324
- console.warn(`VAD plugin '${name}' not found, falling back to default.`);
325
- return defaultVad;
326
- }
327
- return plugin;
157
+ };
328
158
  }
329
159
 
330
160
  // src/vad/vad-state.ts
331
- var VADStateMachine = class {
161
+ var LevelBasedVAD = class {
332
162
  config;
333
- currentState = "silent";
334
- lastSpeechTime = 0;
335
- speechStartTime = 0;
336
- lastSilenceTime = 0;
337
- frameDurationMs = 20;
338
- // Assumed frame duration, updated by calls
163
+ speaking = false;
164
+ pendingSpeechSince = null;
165
+ pendingSilenceSince = null;
339
166
  constructor(config) {
340
167
  this.config = {
341
- enabled: config?.enabled ?? true,
342
- pluginName: config?.pluginName ?? "energy-vad",
343
- // Voice-optimized defaults
344
- startThreshold: config?.startThreshold ?? 0.8,
345
- // Higher threshold to avoid noise
346
- stopThreshold: config?.stopThreshold ?? 0.3,
347
- // Balanced for voice
348
- hangoverMs: config?.hangoverMs ?? 300,
349
- // Smooth for natural speech
350
- preRollMs: config?.preRollMs ?? 250,
351
- // Generous pre-roll
352
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
353
- // Aggressive transient rejection
354
- minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
355
- energyVad: {
356
- smoothing: config?.energyVad?.smoothing ?? 0.95,
357
- initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
358
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
359
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
360
- minSNR: config?.energyVad?.minSNR ?? 12,
361
- snrRange: config?.energyVad?.snrRange ?? 10,
362
- minEnergy: config?.energyVad?.minEnergy ?? 3e-3
363
- }
168
+ minDb: config.minDb,
169
+ maxDb: config.maxDb,
170
+ speakOnRatio: config.speakOnRatio ?? 0.6,
171
+ speakOffRatio: config.speakOffRatio ?? 0.3,
172
+ hangoverMs: config.hangoverMs ?? 350,
173
+ attackMs: config.attackMs ?? 50,
174
+ releaseMs: config.releaseMs ?? 120
364
175
  };
365
- this.lastSilenceTime = Date.now();
366
176
  }
367
177
  updateConfig(config) {
368
- this.config = { ...this.config, ...config };
178
+ this.config = {
179
+ ...this.config,
180
+ ...config,
181
+ speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
182
+ speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
183
+ hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
184
+ attackMs: config.attackMs ?? this.config.attackMs,
185
+ releaseMs: config.releaseMs ?? this.config.releaseMs
186
+ };
369
187
  }
370
- processFrame(probability, timestamp) {
188
+ process(levelDb, timestampMs) {
371
189
  const {
372
- startThreshold,
373
- stopThreshold,
190
+ minDb,
191
+ maxDb,
192
+ speakOnRatio,
193
+ speakOffRatio,
374
194
  hangoverMs,
375
- minSpeechDurationMs,
376
- minSilenceDurationMs
195
+ attackMs,
196
+ releaseMs
377
197
  } = this.config;
378
- let newState = this.currentState;
379
- if (this.currentState === "silent" || this.currentState === "speech_ending") {
380
- if (probability >= startThreshold) {
381
- const silenceDuration = timestamp - this.lastSilenceTime;
382
- if (silenceDuration >= minSilenceDurationMs) {
383
- newState = "speech_starting";
384
- this.speechStartTime = timestamp;
385
- this.lastSpeechTime = timestamp;
386
- } else {
387
- newState = "silent";
198
+ const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
199
+ const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
200
+ if (!this.speaking) {
201
+ if (norm >= speakOnRatio) {
202
+ this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
203
+ if (timestampMs - this.pendingSpeechSince >= attackMs) {
204
+ this.speaking = true;
205
+ this.pendingSpeechSince = null;
206
+ this.pendingSilenceSince = null;
388
207
  }
389
208
  } else {
390
- newState = "silent";
391
- this.lastSilenceTime = timestamp;
209
+ this.pendingSpeechSince = null;
392
210
  }
393
- } else if (this.currentState === "speech_starting") {
394
- if (probability >= stopThreshold) {
395
- const speechDuration = timestamp - this.speechStartTime;
396
- if (speechDuration >= minSpeechDurationMs) {
397
- newState = "speaking";
398
- } else {
399
- newState = "speech_starting";
211
+ } else {
212
+ if (norm <= speakOffRatio) {
213
+ this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
214
+ const releaseWindow = Math.max(releaseMs, hangoverMs);
215
+ if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
216
+ this.speaking = false;
217
+ this.pendingSilenceSince = null;
218
+ this.pendingSpeechSince = null;
400
219
  }
401
- this.lastSpeechTime = timestamp;
402
220
  } else {
403
- newState = "silent";
404
- this.lastSilenceTime = timestamp;
405
- }
406
- } else if (this.currentState === "speaking") {
407
- if (probability >= stopThreshold) {
408
- newState = "speaking";
409
- this.lastSpeechTime = timestamp;
410
- } else {
411
- const timeSinceSpeech = timestamp - this.lastSpeechTime;
412
- if (timeSinceSpeech < hangoverMs) {
413
- newState = "speaking";
414
- } else {
415
- newState = "speech_ending";
416
- this.lastSilenceTime = timestamp;
417
- }
221
+ this.pendingSilenceSince = null;
418
222
  }
419
223
  }
420
- if (newState === "speech_ending") newState = "silent";
421
- this.currentState = newState;
422
224
  return {
423
- isSpeaking: newState === "speaking",
424
- probability,
425
- state: newState
225
+ speaking: this.speaking,
226
+ levelDb: clamped
426
227
  };
427
228
  }
428
229
  };
@@ -431,58 +232,33 @@ var VADStateMachine = class {
431
232
  async function createAudioPipeline(sourceTrack, config = {}) {
432
233
  const context = getAudioContext();
433
234
  registerPipeline();
434
- const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
435
- config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
436
- );
437
- const vadEnabled = config.vad?.enabled !== false;
235
+ const nsConfig = {
236
+ enabled: config.noiseSuppression?.enabled ?? true,
237
+ noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
238
+ };
239
+ if (config.noiseSuppression?.assetConfig) {
240
+ nsConfig.assetConfig = config.noiseSuppression.assetConfig;
241
+ }
438
242
  const fullConfig = {
439
- noiseSuppression: {
440
- enabled: nsEnabled,
441
- ...config.noiseSuppression
442
- },
443
- vad: {
444
- enabled: vadEnabled,
445
- // Voice-optimized defaults (will be overridden by config)
446
- startThreshold: 0.6,
447
- stopThreshold: 0.45,
448
- hangoverMs: 400,
449
- preRollMs: 250,
450
- minSpeechDurationMs: 100,
451
- minSilenceDurationMs: 150,
452
- energyVad: {
453
- smoothing: 0.95,
454
- initialNoiseFloor: 1e-3,
455
- noiseFloorAdaptRateQuiet: 0.01,
456
- noiseFloorAdaptRateLoud: 1e-3,
457
- minSNR: 2,
458
- snrRange: 8
459
- },
460
- ...config.vad
243
+ noiseSuppression: nsConfig,
244
+ speaking: {
245
+ minDb: config.speaking?.minDb ?? -60,
246
+ maxDb: config.speaking?.maxDb ?? -20,
247
+ speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
248
+ speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
249
+ hangoverMs: config.speaking?.hangoverMs ?? 350,
250
+ attackMs: config.speaking?.attackMs ?? 50,
251
+ releaseMs: config.speaking?.releaseMs ?? 120
461
252
  },
462
253
  output: {
463
- speechGain: 1,
464
- silenceGain: 0,
465
- // Full mute for voice-only
466
- gainRampTime: 0.015,
467
- // Fast but smooth transitions
468
- smoothTransitions: true,
469
- maxGainDb: 6,
470
- enableCompression: false,
471
- compression: {
472
- threshold: -24,
473
- ratio: 3,
474
- attack: 3e-3,
475
- release: 0.05
476
- },
477
- ...config.output
254
+ speechGain: config.output?.speechGain ?? 1,
255
+ silenceGain: config.output?.silenceGain ?? 0,
256
+ gainRampTime: config.output?.gainRampTime ?? 0.015,
257
+ maxGainDb: config.output?.maxGainDb ?? 6,
258
+ smoothTransitions: config.output?.smoothTransitions ?? true
478
259
  },
479
- livekit: { manageTrackMute: false, ...config.livekit }
260
+ muteWhenSilent: config.muteWhenSilent ?? false
480
261
  };
481
- console.log("Audio pipeline config:", {
482
- noiseSuppression: fullConfig.noiseSuppression?.enabled,
483
- vad: fullConfig.vad?.enabled,
484
- output: fullConfig.output
485
- });
486
262
  if (!sourceTrack || sourceTrack.kind !== "audio") {
487
263
  throw new Error(
488
264
  "createAudioPipeline requires a valid audio MediaStreamTrack"
@@ -493,318 +269,196 @@ async function createAudioPipeline(sourceTrack, config = {}) {
493
269
  }
494
270
  const sourceStream = new MediaStream([sourceTrack]);
495
271
  const sourceNode = context.createMediaStreamSource(sourceStream);
496
- let nsNode;
497
- let vadNode;
498
272
  const emitter = (0, import_mitt.default)();
499
- try {
500
- const nsPlugin = getNoiseSuppressionPlugin(
501
- fullConfig.noiseSuppression?.pluginName
502
- );
503
- nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
504
- } catch (error) {
505
- const err = error instanceof Error ? error : new Error(String(error));
506
- console.error("Failed to create noise suppression node:", err);
507
- emitter.emit("error", err);
508
- throw err;
509
- }
510
- const vadStateMachine = new VADStateMachine(fullConfig.vad);
511
- let vadPlugin;
512
- try {
513
- vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
514
- vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
515
- try {
516
- const timestamp = context.currentTime * 1e3;
517
- const newState = vadStateMachine.processFrame(prob, timestamp);
518
- if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
519
- vadPlugin.updateSpeakingState(newState.isSpeaking);
520
- }
521
- if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
522
- emitter.emit("vadChange", newState);
523
- lastVadState = newState;
524
- updateGain(newState);
525
- }
526
- } catch (vadError) {
527
- const err = vadError instanceof Error ? vadError : new Error(String(vadError));
528
- console.error("Error in VAD callback:", err);
529
- emitter.emit("error", err);
273
+ const vad = new LevelBasedVAD(fullConfig.speaking);
274
+ let lastState = { speaking: false, levelDb: -Infinity };
275
+ const nsHandle = await createDeepFilterNet3Node(
276
+ context,
277
+ fullConfig.noiseSuppression
278
+ );
279
+ const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
280
+ try {
281
+ const timestamp = context.currentTime * 1e3;
282
+ const nextState = vad.process(levelDb, timestamp);
283
+ const speakingChanged = nextState.speaking !== lastState.speaking;
284
+ const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
285
+ if (speakingChanged || levelChanged) {
286
+ lastState = nextState;
287
+ updateGain(nextState);
288
+ emitter.emit("speakingChange", nextState);
530
289
  }
531
- });
532
- } catch (error) {
533
- const err = error instanceof Error ? error : new Error(String(error));
534
- console.error("Failed to create VAD node:", err);
535
- emitter.emit("error", err);
536
- throw err;
537
- }
538
- let lastVadState = {
539
- isSpeaking: false,
540
- probability: 0,
541
- state: "silent"
542
- };
290
+ } catch (error) {
291
+ const err = error instanceof Error ? error : new Error(String(error));
292
+ emitter.emit("error", err);
293
+ }
294
+ });
543
295
  const splitter = context.createGain();
544
- sourceNode.connect(nsNode);
545
- nsNode.connect(splitter);
546
- splitter.connect(vadNode);
547
- const delayNode = context.createDelay(1);
548
- const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
549
- delayNode.delayTime.value = preRollSeconds;
296
+ sourceNode.connect(nsHandle.node);
297
+ nsHandle.node.connect(splitter);
298
+ splitter.connect(levelHandle.node);
550
299
  const gainNode = context.createGain();
551
300
  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
552
- let compressor = null;
553
- if (fullConfig.output?.enableCompression) {
554
- compressor = context.createDynamicsCompressor();
555
- const comp = fullConfig.output.compression;
556
- compressor.threshold.value = comp.threshold ?? -24;
557
- compressor.ratio.value = comp.ratio ?? 3;
558
- compressor.attack.value = comp.attack ?? 3e-3;
559
- compressor.release.value = comp.release ?? 0.05;
560
- compressor.knee.value = 10;
561
- }
301
+ splitter.connect(gainNode);
562
302
  const destination = context.createMediaStreamDestination();
563
- try {
564
- splitter.connect(delayNode);
565
- delayNode.connect(gainNode);
566
- if (compressor) {
567
- gainNode.connect(compressor);
568
- compressor.connect(destination);
569
- console.log("Compression enabled:", fullConfig.output?.compression);
570
- } else {
571
- gainNode.connect(destination);
572
- }
573
- } catch (error) {
574
- const err = error instanceof Error ? error : new Error(String(error));
575
- console.error("Failed to wire audio pipeline:", err);
576
- emitter.emit("error", err);
577
- throw err;
578
- }
303
+ gainNode.connect(destination);
579
304
  function updateGain(state) {
580
- try {
581
- const {
582
- speechGain = 1,
583
- silenceGain = 0,
584
- gainRampTime = 0.015,
585
- smoothTransitions = true,
586
- maxGainDb = 6
587
- } = fullConfig.output;
588
- const maxGainLinear = Math.pow(10, maxGainDb / 20);
589
- const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
590
- const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
591
- const now = context.currentTime;
592
- if (smoothTransitions) {
593
- gainNode.gain.cancelScheduledValues(now);
594
- gainNode.gain.setValueAtTime(gainNode.gain.value, now);
595
- gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
596
- } else {
597
- gainNode.gain.setValueAtTime(targetGain, now);
598
- }
599
- } catch (error) {
600
- const err = error instanceof Error ? error : new Error(String(error));
601
- console.error("Failed to update gain:", err);
602
- emitter.emit("error", err);
305
+ const {
306
+ speechGain = 1,
307
+ silenceGain = 0,
308
+ gainRampTime = 0.015,
309
+ smoothTransitions = true,
310
+ maxGainDb = 6
311
+ } = fullConfig.output ?? {};
312
+ const maxGainLinear = Math.pow(10, maxGainDb / 20);
313
+ const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
314
+ const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
315
+ const now = context.currentTime;
316
+ gainNode.gain.cancelScheduledValues(now);
317
+ gainNode.gain.setValueAtTime(gainNode.gain.value, now);
318
+ if (smoothTransitions) {
319
+ gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
320
+ } else {
321
+ gainNode.gain.setValueAtTime(target, now);
603
322
  }
604
323
  }
605
324
  const audioTracks = destination.stream.getAudioTracks();
606
- console.log("Destination stream tracks:", {
607
- count: audioTracks.length,
608
- tracks: audioTracks.map((t) => ({
609
- id: t.id,
610
- label: t.label,
611
- enabled: t.enabled,
612
- readyState: t.readyState
613
- }))
614
- });
615
325
  if (audioTracks.length === 0) {
616
- const err = new Error(
617
- "Failed to create processed audio track: destination stream has no audio tracks. This may indicate an issue with the audio graph connection."
618
- );
619
- console.error(err);
620
- emitter.emit("error", err);
621
- throw err;
326
+ nsHandle.dispose();
327
+ levelHandle.dispose();
328
+ unregisterPipeline();
329
+ throw new Error("Failed to create processed audio track");
622
330
  }
623
331
  const processedTrack = audioTracks[0];
624
- if (!processedTrack || processedTrack.readyState === "ended") {
625
- const err = new Error("Processed audio track is invalid or ended");
626
- console.error(err);
627
- emitter.emit("error", err);
628
- throw err;
629
- }
630
- console.log("Audio pipeline created successfully:", {
631
- sourceTrack: {
632
- id: sourceTrack.id,
633
- label: sourceTrack.label,
634
- readyState: sourceTrack.readyState
635
- },
636
- processedTrack: {
637
- id: processedTrack.id,
638
- label: processedTrack.label,
639
- readyState: processedTrack.readyState
640
- },
641
- config: {
642
- noiseSuppression: fullConfig.noiseSuppression?.enabled,
643
- vad: fullConfig.vad?.enabled
644
- }
645
- });
646
332
  function dispose() {
647
333
  try {
648
334
  sourceNode.disconnect();
649
- nsNode.disconnect();
335
+ nsHandle.node.disconnect();
650
336
  splitter.disconnect();
651
- vadNode.disconnect();
652
- delayNode.disconnect();
337
+ levelHandle.node.disconnect();
653
338
  gainNode.disconnect();
654
- if (compressor) {
655
- compressor.disconnect();
656
- }
657
339
  destination.stream.getTracks().forEach((t) => t.stop());
658
- unregisterPipeline();
340
+ levelHandle.dispose();
341
+ nsHandle.dispose();
659
342
  } catch (error) {
660
- console.error("Error during pipeline disposal:", error);
343
+ console.error("Error during pipeline disposal", error);
344
+ } finally {
345
+ unregisterPipeline();
661
346
  }
662
347
  }
663
- return {
348
+ const handle = {
664
349
  processedTrack,
665
350
  events: emitter,
666
351
  get state() {
667
- return lastVadState;
352
+ return lastState;
668
353
  },
669
- setConfig: (newConfig) => {
354
+ setConfig: (next) => {
670
355
  try {
671
- if (newConfig.vad) {
672
- vadStateMachine.updateConfig(newConfig.vad);
673
- Object.assign(fullConfig.vad, newConfig.vad);
674
- if (newConfig.vad.preRollMs !== void 0) {
675
- const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
676
- delayNode.delayTime.setValueAtTime(
677
- preRollSeconds2,
678
- context.currentTime
679
- );
680
- }
356
+ if (next.speaking) {
357
+ vad.updateConfig(next.speaking);
358
+ fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
359
+ }
360
+ if (next.output) {
361
+ fullConfig.output = { ...fullConfig.output, ...next.output };
362
+ updateGain(lastState);
681
363
  }
682
- if (newConfig.output) {
683
- Object.assign(fullConfig.output, newConfig.output);
684
- updateGain(lastVadState);
685
- if (compressor && newConfig.output.compression) {
686
- const comp = newConfig.output.compression;
687
- if (comp.threshold !== void 0) {
688
- compressor.threshold.setValueAtTime(
689
- comp.threshold,
690
- context.currentTime
691
- );
692
- }
693
- if (comp.ratio !== void 0) {
694
- compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
695
- }
696
- if (comp.attack !== void 0) {
697
- compressor.attack.setValueAtTime(
698
- comp.attack,
699
- context.currentTime
700
- );
701
- }
702
- if (comp.release !== void 0) {
703
- compressor.release.setValueAtTime(
704
- comp.release,
705
- context.currentTime
706
- );
707
- }
364
+ if (next.noiseSuppression) {
365
+ const ns = next.noiseSuppression;
366
+ fullConfig.noiseSuppression = {
367
+ ...fullConfig.noiseSuppression,
368
+ ...ns
369
+ };
370
+ if (typeof ns.noiseReductionLevel === "number") {
371
+ nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
372
+ }
373
+ if (typeof ns.enabled === "boolean") {
374
+ nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
708
375
  }
709
376
  }
710
- if (newConfig.livekit) {
711
- Object.assign(fullConfig.livekit, newConfig.livekit);
377
+ if (typeof next.muteWhenSilent === "boolean") {
378
+ fullConfig.muteWhenSilent = next.muteWhenSilent;
712
379
  }
713
- console.log("Pipeline config updated:", newConfig);
714
380
  } catch (error) {
715
381
  const err = error instanceof Error ? error : new Error(String(error));
716
- console.error("Failed to update config:", err);
717
382
  emitter.emit("error", err);
718
383
  }
719
384
  },
720
385
  dispose
721
386
  };
387
+ return handle;
722
388
  }
723
389
 
724
390
  // src/livekit/integration.ts
725
- async function attachProcessingToTrack(track, config = {}) {
391
+ async function attachSpeakingDetectionToTrack(track, options = {}) {
726
392
  if (!track) {
727
- throw new Error("attachProcessingToTrack requires a valid LocalAudioTrack");
728
- }
729
- const originalTrack = track.mediaStreamTrack;
730
- if (!originalTrack) {
731
- throw new Error("LocalAudioTrack has no underlying MediaStreamTrack");
732
- }
733
- if (originalTrack.readyState === "ended") {
734
- throw new Error("Cannot attach processing to an ended MediaStreamTrack");
735
- }
736
- let pipeline;
737
- try {
738
- console.log("Creating audio processing pipeline...");
739
- pipeline = await createAudioPipeline(originalTrack, config);
740
- console.log("Audio processing pipeline created successfully");
741
- } catch (error) {
742
- const err = new Error(
743
- `Failed to create audio pipeline: ${error instanceof Error ? error.message : String(error)}`
393
+ throw new Error(
394
+ "attachSpeakingDetectionToTrack requires a valid LocalAudioTrack"
744
395
  );
745
- console.error(err);
746
- throw err;
747
- }
748
- if (!pipeline.processedTrack) {
749
- throw new Error("Pipeline did not return a processed track");
750
396
  }
751
- try {
752
- console.log("Replacing LiveKit track with processed track...");
753
- await track.replaceTrack(pipeline.processedTrack);
754
- console.log("LiveKit track replaced successfully");
755
- } catch (error) {
756
- pipeline.dispose();
757
- const err = new Error(
758
- `Failed to replace LiveKit track: ${error instanceof Error ? error.message : String(error)}`
759
- );
760
- console.error(err);
761
- throw err;
397
+ const originalTrack = track.mediaStreamTrack;
398
+ if (!originalTrack || originalTrack.readyState === "ended") {
399
+ throw new Error("LocalAudioTrack has no live MediaStreamTrack to process");
762
400
  }
763
- if (config.livekit?.manageTrackMute) {
764
- let isVadMuted = false;
765
- pipeline.events.on("vadChange", async (state) => {
766
- try {
767
- if (state.isSpeaking) {
768
- if (isVadMuted) {
769
- await track.unmute();
770
- isVadMuted = false;
771
- }
772
- } else {
773
- if (!track.isMuted) {
774
- await track.mute();
775
- isVadMuted = true;
776
- }
777
- }
778
- } catch (error) {
779
- console.error("Error handling VAD-based track muting:", error);
401
+ const pipeline = await createAudioPipeline(originalTrack, options);
402
+ await track.replaceTrack(pipeline.processedTrack);
403
+ const listeners = /* @__PURE__ */ new Set();
404
+ let mutedByController = false;
405
+ let currentState = pipeline.state;
406
+ const speakingHandler = (state) => {
407
+ currentState = state;
408
+ listeners.forEach((listener) => listener(state));
409
+ if (options.muteWhenSilent) {
410
+ if (!state.speaking && !track.isMuted) {
411
+ track.mute().catch((error) => console.error("mute failed", error));
412
+ mutedByController = true;
780
413
  }
781
- });
782
- }
783
- pipeline.events.on("error", (error) => {
784
- console.error("Audio pipeline error:", error);
785
- });
786
- const originalDispose = pipeline.dispose;
787
- pipeline.dispose = () => {
788
- try {
414
+ if (state.speaking && mutedByController) {
415
+ track.unmute().catch((error) => console.error("unmute failed", error));
416
+ mutedByController = false;
417
+ }
418
+ }
419
+ };
420
+ pipeline.events.on("speakingChange", speakingHandler);
421
+ const errorHandler = (error) => {
422
+ console.error("Audio pipeline error", error);
423
+ };
424
+ pipeline.events.on("error", errorHandler);
425
+ const controller = {
426
+ get speaking() {
427
+ return currentState.speaking;
428
+ },
429
+ get levelDb() {
430
+ return currentState.levelDb;
431
+ },
432
+ onChange: (listener) => {
433
+ listeners.add(listener);
434
+ listener(currentState);
435
+ return () => listeners.delete(listener);
436
+ },
437
+ setConfig: (config) => {
438
+ pipeline.setConfig(config);
439
+ if (typeof config.muteWhenSilent === "boolean") {
440
+ options.muteWhenSilent = config.muteWhenSilent;
441
+ }
442
+ },
443
+ dispose: () => {
444
+ pipeline.events.off("speakingChange", speakingHandler);
445
+ pipeline.events.off("error", errorHandler);
446
+ listeners.clear();
447
+ if (mutedByController && !track.isMuted) {
448
+ track.unmute().catch((error) => console.error("unmute failed", error));
449
+ mutedByController = false;
450
+ }
451
+ pipeline.dispose();
789
452
  if (originalTrack.readyState === "live") {
790
- console.log("Restoring original track...");
791
453
  track.replaceTrack(originalTrack).catch((error) => {
792
- console.error("Failed to restore original track:", error);
454
+ console.error("Failed to restore original track", error);
793
455
  });
794
456
  }
795
- originalDispose();
796
- } catch (error) {
797
- console.error("Error during pipeline disposal:", error);
798
- try {
799
- originalDispose();
800
- } catch (disposeError) {
801
- console.error("Error calling original dispose:", disposeError);
802
- }
803
457
  }
804
458
  };
805
- return pipeline;
459
+ return controller;
806
460
  }
807
461
  // Annotate the CommonJS export names for ESM import in node:
808
462
  0 && (module.exports = {
809
- attachProcessingToTrack
463
+ attachSpeakingDetectionToTrack
810
464
  });