@tensamin/audio 0.1.14 → 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (49)
  1. package/README.md +48 -231
  2. package/dist/chunk-6BJ4XGSA.mjs +80 -0
  3. package/dist/chunk-AQ5RVY33.mjs +74 -0
  4. package/dist/chunk-IS37FHDN.mjs +33 -0
  5. package/dist/chunk-K4J3UUOR.mjs +178 -0
  6. package/dist/chunk-QNQK6QFB.mjs +71 -0
  7. package/dist/context/audio-context.d.mts +0 -24
  8. package/dist/context/audio-context.d.ts +0 -24
  9. package/dist/index.d.mts +2 -8
  10. package/dist/index.d.ts +2 -8
  11. package/dist/index.js +285 -680
  12. package/dist/index.mjs +8 -43
  13. package/dist/livekit/integration.d.mts +3 -7
  14. package/dist/livekit/integration.d.ts +3 -7
  15. package/dist/livekit/integration.js +280 -626
  16. package/dist/livekit/integration.mjs +7 -8
  17. package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
  18. package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
  19. package/dist/noise-suppression/deepfilternet-node.js +57 -0
  20. package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
  21. package/dist/pipeline/audio-pipeline.d.mts +2 -2
  22. package/dist/pipeline/audio-pipeline.d.ts +2 -2
  23. package/dist/pipeline/audio-pipeline.js +219 -554
  24. package/dist/pipeline/audio-pipeline.mjs +4 -5
  25. package/dist/types.d.mts +42 -257
  26. package/dist/types.d.ts +42 -257
  27. package/dist/vad/vad-node.d.mts +7 -9
  28. package/dist/vad/vad-node.d.ts +7 -9
  29. package/dist/vad/vad-node.js +47 -156
  30. package/dist/vad/vad-node.mjs +3 -3
  31. package/dist/vad/vad-state.d.mts +9 -11
  32. package/dist/vad/vad-state.d.ts +9 -11
  33. package/dist/vad/vad-state.js +50 -79
  34. package/dist/vad/vad-state.mjs +3 -3
  35. package/package.json +21 -21
  36. package/dist/chunk-2G2JFHJY.mjs +0 -180
  37. package/dist/chunk-6F2HZUYO.mjs +0 -91
  38. package/dist/chunk-K4YLH73B.mjs +0 -103
  39. package/dist/chunk-R5M2DGAQ.mjs +0 -311
  40. package/dist/chunk-UFKIAMG3.mjs +0 -47
  41. package/dist/chunk-XO6B3D4A.mjs +0 -67
  42. package/dist/extensibility/plugins.d.mts +0 -9
  43. package/dist/extensibility/plugins.d.ts +0 -9
  44. package/dist/extensibility/plugins.js +0 -320
  45. package/dist/extensibility/plugins.mjs +0 -14
  46. package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
  47. package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
  48. package/dist/noise-suppression/rnnoise-node.js +0 -101
  49. package/dist/noise-suppression/rnnoise-node.mjs +0 -6
package/dist/index.js CHANGED
@@ -30,24 +30,16 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
- EnergyVADPlugin: () => EnergyVADPlugin,
34
- RNNoisePlugin: () => RNNoisePlugin,
35
- VADStateMachine: () => VADStateMachine,
36
- attachProcessingToTrack: () => attachProcessingToTrack,
37
- closeAudioContext: () => closeAudioContext,
38
- createAudioPipeline: () => createAudioPipeline,
39
- getAudioContext: () => getAudioContext,
40
- getNoiseSuppressionPlugin: () => getNoiseSuppressionPlugin,
41
- getVADPlugin: () => getVADPlugin,
42
- registerNoiseSuppressionPlugin: () => registerNoiseSuppressionPlugin,
43
- registerPipeline: () => registerPipeline,
44
- registerVADPlugin: () => registerVADPlugin,
45
- resumeAudioContext: () => resumeAudioContext,
46
- suspendAudioContext: () => suspendAudioContext,
47
- unregisterPipeline: () => unregisterPipeline
33
+ attachSpeakingDetectionToTrack: () => attachSpeakingDetectionToTrack
48
34
  });
49
35
  module.exports = __toCommonJS(index_exports);
50
36
 
37
+ // src/livekit/integration.ts
38
+ var import_mitt2 = require("mitt");
39
+
40
+ // src/pipeline/audio-pipeline.ts
41
+ var import_mitt = __toESM(require("mitt"));
42
+
51
43
  // src/context/audio-context.ts
52
44
  var sharedContext = null;
53
45
  var activePipelines = 0;
@@ -68,398 +60,172 @@ function registerPipeline() {
68
60
  function unregisterPipeline() {
69
61
  activePipelines = Math.max(0, activePipelines - 1);
70
62
  }
71
- async function resumeAudioContext() {
72
- if (sharedContext && sharedContext.state === "suspended") {
73
- await sharedContext.resume();
74
- }
75
- }
76
- async function suspendAudioContext() {
77
- if (sharedContext && sharedContext.state === "running") {
78
- await sharedContext.suspend();
63
+
64
+ // src/noise-suppression/deepfilternet-node.ts
65
+ var import_deepfilternet3_noise_filter = require("deepfilternet3-noise-filter");
66
+ async function createDeepFilterNet3Node(context, config) {
67
+ const processorConfig = {
68
+ sampleRate: context.sampleRate,
69
+ noiseReductionLevel: config?.noiseReductionLevel ?? 60
70
+ };
71
+ if (config?.assetConfig) {
72
+ processorConfig.assetConfig = config.assetConfig;
79
73
  }
80
- }
81
- async function closeAudioContext() {
82
- if (sharedContext && sharedContext.state !== "closed") {
83
- await sharedContext.close();
74
+ const processor = new import_deepfilternet3_noise_filter.DeepFilterNet3Processor(processorConfig);
75
+ await processor.initialize();
76
+ const node = await processor.createAudioWorkletNode(context);
77
+ const enabled = config?.enabled ?? true;
78
+ if (!enabled) {
79
+ processor.setNoiseSuppressionEnabled(false);
84
80
  }
85
- sharedContext = null;
86
- activePipelines = 0;
87
- }
88
-
89
- // src/pipeline/audio-pipeline.ts
90
- var import_mitt = __toESM(require("mitt"));
91
-
92
- // src/noise-suppression/rnnoise-node.ts
93
- var RNNoisePlugin = class {
94
- name = "rnnoise-ns";
95
- wasmBuffer = null;
96
- async createNode(context, config) {
97
- const { loadRnnoise, RnnoiseWorkletNode } = await import("@sapphi-red/web-noise-suppressor");
98
- if (!config?.enabled) {
99
- console.log("Noise suppression disabled, using passthrough node");
100
- const pass = context.createGain();
101
- return pass;
102
- }
103
- if (!config?.wasmUrl || !config?.simdUrl || !config?.workletUrl) {
104
- const error = new Error(
105
- `RNNoisePlugin requires 'wasmUrl', 'simdUrl', and 'workletUrl' to be configured. Please download the assets from @sapphi-red/web-noise-suppressor and provide the URLs in the config. Current config: wasmUrl=${config?.wasmUrl}, simdUrl=${config?.simdUrl}, workletUrl=${config?.workletUrl}
106
- To disable noise suppression, set noiseSuppression.enabled to false.`
107
- );
108
- console.error(error.message);
109
- throw error;
110
- }
111
- try {
112
- if (!this.wasmBuffer) {
113
- console.log("Loading RNNoise WASM binary...");
114
- this.wasmBuffer = await loadRnnoise({
115
- url: config.wasmUrl,
116
- simdUrl: config.simdUrl
117
- });
118
- console.log("RNNoise WASM loaded successfully");
81
+ return {
82
+ node,
83
+ processor,
84
+ dispose: () => {
85
+ try {
86
+ processor.destroy();
87
+ } catch (error) {
88
+ console.error("Failed to dispose DeepFilterNet3 processor", error);
119
89
  }
120
- } catch (error) {
121
- const err = new Error(
122
- `Failed to load RNNoise WASM binary: ${error instanceof Error ? error.message : String(error)}`
123
- );
124
- console.error(err);
125
- throw err;
126
- }
127
- const workletUrl = config.workletUrl;
128
- try {
129
- await context.audioWorklet.addModule(workletUrl);
130
- console.log("RNNoise worklet loaded successfully");
131
- } catch (e) {
132
- const error = new Error(
133
- `Failed to load RNNoise worklet from ${workletUrl}: ${e instanceof Error ? e.message : String(e)}. Ensure the workletUrl points to a valid RNNoise worklet script.`
134
- );
135
- console.error(error.message);
136
- throw error;
137
90
  }
138
- try {
139
- const node = new RnnoiseWorkletNode(context, {
140
- wasmBinary: this.wasmBuffer,
141
- maxChannels: 1
142
- // Mono for now
143
- });
144
- console.log("RNNoise worklet node created successfully");
145
- return node;
146
- } catch (error) {
147
- const err = new Error(
148
- `Failed to create RNNoise worklet node: ${error instanceof Error ? error.message : String(error)}`
149
- );
150
- console.error(err);
151
- throw err;
152
- }
153
- }
154
- };
91
+ };
92
+ }
155
93
 
156
94
  // src/vad/vad-node.ts
157
- var createEnergyVadWorkletCode = (vadConfig) => {
158
- const energyParams = vadConfig?.energyVad || {};
159
- const smoothing = energyParams.smoothing ?? 0.95;
160
- const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
161
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
162
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
163
- const minSNR = energyParams.minSNR ?? 12;
164
- const snrRange = energyParams.snrRange ?? 10;
165
- const minEnergy = energyParams.minEnergy ?? 3e-3;
95
+ function createLevelDetectorWorkletCode(smoothing) {
166
96
  return `
167
- class EnergyVadProcessor extends AudioWorkletProcessor {
97
+ class LevelDetectorProcessor extends AudioWorkletProcessor {
168
98
  constructor() {
169
99
  super();
100
+ this.smoothed = 0;
170
101
  this.smoothing = ${smoothing};
171
- this.energy = 0;
172
- this.noiseFloor = ${initialNoiseFloor};
173
- this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
174
- this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
175
- this.minSNR = ${minSNR};
176
- this.snrRange = ${snrRange};
177
- this.minEnergy = ${minEnergy};
178
- this.isSpeaking = false;
179
-
180
- this.port.onmessage = (event) => {
181
- if (event.data && event.data.isSpeaking !== undefined) {
182
- this.isSpeaking = event.data.isSpeaking;
183
- }
184
- };
185
102
  }
186
103
 
187
- process(inputs, outputs, parameters) {
104
+ process(inputs) {
188
105
  const input = inputs[0];
189
- if (!input || !input.length) return true;
106
+ if (!input || input.length === 0) return true;
190
107
  const channel = input[0];
191
-
192
- // Calculate instantaneous RMS (Root Mean Square) energy
108
+ if (!channel || channel.length === 0) return true;
109
+
193
110
  let sum = 0;
194
- let peak = 0;
195
111
  for (let i = 0; i < channel.length; i++) {
196
- const sample = Math.abs(channel[i]);
197
- sum += channel[i] * channel[i];
198
- peak = Math.max(peak, sample);
199
- }
200
- const instantRms = Math.sqrt(sum / channel.length);
201
-
202
- // Smooth the RMS energy to reduce jitter
203
- // this.energy acts as the smoothed RMS value
204
- this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
205
-
206
- // Calculate Crest Factor (peak-to-RMS ratio)
207
- // Voice typically has crest factor of 2-4 (6-12dB)
208
- // Keyboard clicks have crest factor of 10-30+ (20-30dB)
209
- const crestFactor = peak / (instantRms + 1e-10);
210
- const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
211
-
212
- // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
213
- // This prevents sharp transients from affecting the noise floor
214
- if (this.energy < this.noiseFloor) {
215
- // Signal is quieter than noise floor, adapt downwards slowly
216
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
217
- } else {
218
- // Calculate SNR based on smoothed energy
219
- const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
220
- const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
221
-
222
- // Only adapt upwards if:
223
- // 1. SNR is low (< 10dB) - likely just background noise
224
- // 2. AND crest factor is low (< 15dB) - not a sharp transient
225
- if (smoothedSnrDb < 10 && crestFactorDb < 15) {
226
- // This is persistent background noise, adapt upwards
227
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
228
- } else {
229
- // Either high SNR (speech) or high crest factor (click) - adapt very slowly
230
- const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
231
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
232
- }
112
+ const sample = channel[i];
113
+ sum += sample * sample;
233
114
  }
234
-
235
- // Ensure noise floor doesn't drop to absolute zero
236
- this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
237
-
238
- // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
239
- const snr = this.energy / (this.noiseFloor + 1e-6);
240
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
241
-
242
- // Map SNR dB to probability (0-1)
243
- // Probability is 0 when snrDb <= minSNR
244
- // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
245
- let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
246
-
247
- // Apply absolute energy threshold with soft knee
248
- if (this.energy < this.minEnergy) {
249
- const energyRatio = this.energy / (this.minEnergy + 1e-6);
250
- probability *= Math.pow(energyRatio, 2);
251
- }
252
-
253
- // Apply crest factor penalty
254
- // Reject signals with high crest factor (sharp transients like keyboard clicks)
255
- // Voice: 6-12dB, Keyboard: 20-30dB
256
- // We penalize anything above 14dB
257
- if (crestFactorDb > 14) {
258
- const excess = crestFactorDb - 14;
259
- const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
260
- probability *= penalty;
261
- }
262
-
263
- this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
264
-
115
+ const rms = Math.sqrt(sum / channel.length);
116
+ this.smoothed = this.smoothed * this.smoothing + rms * (1 - this.smoothing);
117
+ const levelDb = 20 * Math.log10(Math.max(1e-8, this.smoothed));
118
+ this.port.postMessage({ levelDb });
265
119
  return true;
266
120
  }
267
121
  }
268
- registerProcessor('energy-vad-processor', EnergyVadProcessor);
122
+
123
+ registerProcessor('level-detector-processor', LevelDetectorProcessor);
269
124
  `;
270
- };
271
- var EnergyVADPlugin = class {
272
- name = "energy-vad";
273
- workletNode = null;
274
- async createNode(context, config, onDecision) {
275
- if (!config?.enabled) {
276
- console.log("VAD disabled, using passthrough node");
277
- const pass = context.createGain();
278
- return pass;
279
- }
280
- const workletCode = createEnergyVadWorkletCode(config);
281
- const blob = new Blob([workletCode], {
282
- type: "application/javascript"
283
- });
284
- const url = URL.createObjectURL(blob);
285
- try {
286
- await context.audioWorklet.addModule(url);
287
- console.log("Energy VAD worklet loaded successfully");
288
- } catch (e) {
289
- const error = new Error(
290
- `Failed to load Energy VAD worklet: ${e instanceof Error ? e.message : String(e)}`
291
- );
292
- console.error(error.message);
293
- URL.revokeObjectURL(url);
294
- throw error;
295
- }
125
+ }
126
+ async function createLevelDetectorNode(context, onLevel, options) {
127
+ const smoothing = options?.smoothing ?? 0.9;
128
+ const workletCode = createLevelDetectorWorkletCode(smoothing);
129
+ const blob = new Blob([workletCode], { type: "application/javascript" });
130
+ const url = URL.createObjectURL(blob);
131
+ try {
132
+ await context.audioWorklet.addModule(url);
133
+ } finally {
296
134
  URL.revokeObjectURL(url);
297
- let node;
298
- try {
299
- node = new AudioWorkletNode(context, "energy-vad-processor");
300
- this.workletNode = node;
301
- console.log("Energy VAD node created successfully");
302
- } catch (e) {
303
- const error = new Error(
304
- `Failed to create Energy VAD node: ${e instanceof Error ? e.message : String(e)}`
305
- );
306
- console.error(error.message);
307
- throw error;
135
+ }
136
+ const node = new AudioWorkletNode(context, "level-detector-processor", {
137
+ numberOfInputs: 1,
138
+ numberOfOutputs: 0
139
+ });
140
+ node.port.onmessage = (event) => {
141
+ const { levelDb } = event.data ?? {};
142
+ if (typeof levelDb === "number" && !Number.isNaN(levelDb)) {
143
+ onLevel(levelDb);
308
144
  }
309
- node.port.onmessage = (event) => {
145
+ };
146
+ node.port.onmessageerror = (event) => {
147
+ console.error("Level detector port error", event);
148
+ };
149
+ return {
150
+ node,
151
+ dispose: () => {
310
152
  try {
311
- const { probability } = event.data;
312
- if (typeof probability === "number" && !isNaN(probability)) {
313
- onDecision(probability);
314
- } else {
315
- console.warn("Invalid VAD probability received:", event.data);
316
- }
153
+ node.port.onmessage = null;
154
+ node.port.close();
317
155
  } catch (error) {
318
- console.error("Error in VAD message handler:", error);
156
+ console.error("Failed to dispose level detector node", error);
319
157
  }
320
- };
321
- node.port.onmessageerror = (event) => {
322
- console.error("VAD port message error:", event);
323
- };
324
- return node;
325
- }
326
- updateSpeakingState(isSpeaking) {
327
- if (this.workletNode) {
328
- this.workletNode.port.postMessage({ isSpeaking });
329
158
  }
330
- }
331
- };
332
-
333
- // src/extensibility/plugins.ts
334
- var nsPlugins = /* @__PURE__ */ new Map();
335
- var vadPlugins = /* @__PURE__ */ new Map();
336
- var defaultNs = new RNNoisePlugin();
337
- nsPlugins.set(defaultNs.name, defaultNs);
338
- var defaultVad = new EnergyVADPlugin();
339
- vadPlugins.set(defaultVad.name, defaultVad);
340
- function registerNoiseSuppressionPlugin(plugin) {
341
- nsPlugins.set(plugin.name, plugin);
342
- }
343
- function registerVADPlugin(plugin) {
344
- vadPlugins.set(plugin.name, plugin);
345
- }
346
- function getNoiseSuppressionPlugin(name) {
347
- if (!name) return defaultNs;
348
- const plugin = nsPlugins.get(name);
349
- if (!plugin) {
350
- console.warn(
351
- `Noise suppression plugin '${name}' not found, falling back to default.`
352
- );
353
- return defaultNs;
354
- }
355
- return plugin;
356
- }
357
- function getVADPlugin(name) {
358
- if (!name) return defaultVad;
359
- const plugin = vadPlugins.get(name);
360
- if (!plugin) {
361
- console.warn(`VAD plugin '${name}' not found, falling back to default.`);
362
- return defaultVad;
363
- }
364
- return plugin;
159
+ };
365
160
  }
366
161
 
367
162
  // src/vad/vad-state.ts
368
- var VADStateMachine = class {
163
+ var LevelBasedVAD = class {
369
164
  config;
370
- currentState = "silent";
371
- lastSpeechTime = 0;
372
- speechStartTime = 0;
373
- lastSilenceTime = 0;
374
- frameDurationMs = 20;
375
- // Assumed frame duration, updated by calls
165
+ speaking = false;
166
+ pendingSpeechSince = null;
167
+ pendingSilenceSince = null;
376
168
  constructor(config) {
377
169
  this.config = {
378
- enabled: config?.enabled ?? true,
379
- pluginName: config?.pluginName ?? "energy-vad",
380
- // Voice-optimized defaults
381
- startThreshold: config?.startThreshold ?? 0.8,
382
- // Higher threshold to avoid noise
383
- stopThreshold: config?.stopThreshold ?? 0.3,
384
- // Balanced for voice
385
- hangoverMs: config?.hangoverMs ?? 300,
386
- // Smooth for natural speech
387
- preRollMs: config?.preRollMs ?? 250,
388
- // Generous pre-roll
389
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
390
- // Aggressive transient rejection
391
- minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
392
- energyVad: {
393
- smoothing: config?.energyVad?.smoothing ?? 0.95,
394
- initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
395
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
396
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
397
- minSNR: config?.energyVad?.minSNR ?? 12,
398
- snrRange: config?.energyVad?.snrRange ?? 10,
399
- minEnergy: config?.energyVad?.minEnergy ?? 3e-3
400
- }
170
+ minDb: config.minDb,
171
+ maxDb: config.maxDb,
172
+ speakOnRatio: config.speakOnRatio ?? 0.6,
173
+ speakOffRatio: config.speakOffRatio ?? 0.3,
174
+ hangoverMs: config.hangoverMs ?? 350,
175
+ attackMs: config.attackMs ?? 50,
176
+ releaseMs: config.releaseMs ?? 120
401
177
  };
402
- this.lastSilenceTime = Date.now();
403
178
  }
404
179
  updateConfig(config) {
405
- this.config = { ...this.config, ...config };
180
+ this.config = {
181
+ ...this.config,
182
+ ...config,
183
+ speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
184
+ speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
185
+ hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
186
+ attackMs: config.attackMs ?? this.config.attackMs,
187
+ releaseMs: config.releaseMs ?? this.config.releaseMs
188
+ };
406
189
  }
407
- processFrame(probability, timestamp) {
190
+ process(levelDb, timestampMs) {
408
191
  const {
409
- startThreshold,
410
- stopThreshold,
192
+ minDb,
193
+ maxDb,
194
+ speakOnRatio,
195
+ speakOffRatio,
411
196
  hangoverMs,
412
- minSpeechDurationMs,
413
- minSilenceDurationMs
197
+ attackMs,
198
+ releaseMs
414
199
  } = this.config;
415
- let newState = this.currentState;
416
- if (this.currentState === "silent" || this.currentState === "speech_ending") {
417
- if (probability >= startThreshold) {
418
- const silenceDuration = timestamp - this.lastSilenceTime;
419
- if (silenceDuration >= minSilenceDurationMs) {
420
- newState = "speech_starting";
421
- this.speechStartTime = timestamp;
422
- this.lastSpeechTime = timestamp;
423
- } else {
424
- newState = "silent";
200
+ const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
201
+ const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
202
+ if (!this.speaking) {
203
+ if (norm >= speakOnRatio) {
204
+ this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
205
+ if (timestampMs - this.pendingSpeechSince >= attackMs) {
206
+ this.speaking = true;
207
+ this.pendingSpeechSince = null;
208
+ this.pendingSilenceSince = null;
425
209
  }
426
210
  } else {
427
- newState = "silent";
428
- this.lastSilenceTime = timestamp;
211
+ this.pendingSpeechSince = null;
429
212
  }
430
- } else if (this.currentState === "speech_starting") {
431
- if (probability >= stopThreshold) {
432
- const speechDuration = timestamp - this.speechStartTime;
433
- if (speechDuration >= minSpeechDurationMs) {
434
- newState = "speaking";
435
- } else {
436
- newState = "speech_starting";
213
+ } else {
214
+ if (norm <= speakOffRatio) {
215
+ this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
216
+ const releaseWindow = Math.max(releaseMs, hangoverMs);
217
+ if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
218
+ this.speaking = false;
219
+ this.pendingSilenceSince = null;
220
+ this.pendingSpeechSince = null;
437
221
  }
438
- this.lastSpeechTime = timestamp;
439
222
  } else {
440
- newState = "silent";
441
- this.lastSilenceTime = timestamp;
442
- }
443
- } else if (this.currentState === "speaking") {
444
- if (probability >= stopThreshold) {
445
- newState = "speaking";
446
- this.lastSpeechTime = timestamp;
447
- } else {
448
- const timeSinceSpeech = timestamp - this.lastSpeechTime;
449
- if (timeSinceSpeech < hangoverMs) {
450
- newState = "speaking";
451
- } else {
452
- newState = "speech_ending";
453
- this.lastSilenceTime = timestamp;
454
- }
223
+ this.pendingSilenceSince = null;
455
224
  }
456
225
  }
457
- if (newState === "speech_ending") newState = "silent";
458
- this.currentState = newState;
459
226
  return {
460
- isSpeaking: newState === "speaking",
461
- probability,
462
- state: newState
227
+ speaking: this.speaking,
228
+ levelDb: clamped
463
229
  };
464
230
  }
465
231
  };
@@ -468,58 +234,33 @@ var VADStateMachine = class {
468
234
  async function createAudioPipeline(sourceTrack, config = {}) {
469
235
  const context = getAudioContext();
470
236
  registerPipeline();
471
- const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
472
- config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
473
- );
474
- const vadEnabled = config.vad?.enabled !== false;
237
+ const nsConfig = {
238
+ enabled: config.noiseSuppression?.enabled ?? true,
239
+ noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
240
+ };
241
+ if (config.noiseSuppression?.assetConfig) {
242
+ nsConfig.assetConfig = config.noiseSuppression.assetConfig;
243
+ }
475
244
  const fullConfig = {
476
- noiseSuppression: {
477
- enabled: nsEnabled,
478
- ...config.noiseSuppression
479
- },
480
- vad: {
481
- enabled: vadEnabled,
482
- // Voice-optimized defaults (will be overridden by config)
483
- startThreshold: 0.6,
484
- stopThreshold: 0.45,
485
- hangoverMs: 400,
486
- preRollMs: 250,
487
- minSpeechDurationMs: 100,
488
- minSilenceDurationMs: 150,
489
- energyVad: {
490
- smoothing: 0.95,
491
- initialNoiseFloor: 1e-3,
492
- noiseFloorAdaptRateQuiet: 0.01,
493
- noiseFloorAdaptRateLoud: 1e-3,
494
- minSNR: 2,
495
- snrRange: 8
496
- },
497
- ...config.vad
245
+ noiseSuppression: nsConfig,
246
+ speaking: {
247
+ minDb: config.speaking?.minDb ?? -60,
248
+ maxDb: config.speaking?.maxDb ?? -20,
249
+ speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
250
+ speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
251
+ hangoverMs: config.speaking?.hangoverMs ?? 350,
252
+ attackMs: config.speaking?.attackMs ?? 50,
253
+ releaseMs: config.speaking?.releaseMs ?? 120
498
254
  },
499
255
  output: {
500
- speechGain: 1,
501
- silenceGain: 0,
502
- // Full mute for voice-only
503
- gainRampTime: 0.015,
504
- // Fast but smooth transitions
505
- smoothTransitions: true,
506
- maxGainDb: 6,
507
- enableCompression: false,
508
- compression: {
509
- threshold: -24,
510
- ratio: 3,
511
- attack: 3e-3,
512
- release: 0.05
513
- },
514
- ...config.output
256
+ speechGain: config.output?.speechGain ?? 1,
257
+ silenceGain: config.output?.silenceGain ?? 0,
258
+ gainRampTime: config.output?.gainRampTime ?? 0.015,
259
+ maxGainDb: config.output?.maxGainDb ?? 6,
260
+ smoothTransitions: config.output?.smoothTransitions ?? true
515
261
  },
516
- livekit: { manageTrackMute: false, ...config.livekit }
262
+ muteWhenSilent: config.muteWhenSilent ?? false
517
263
  };
518
- console.log("Audio pipeline config:", {
519
- noiseSuppression: fullConfig.noiseSuppression?.enabled,
520
- vad: fullConfig.vad?.enabled,
521
- output: fullConfig.output
522
- });
523
264
  if (!sourceTrack || sourceTrack.kind !== "audio") {
524
265
  throw new Error(
525
266
  "createAudioPipeline requires a valid audio MediaStreamTrack"
@@ -530,332 +271,196 @@ async function createAudioPipeline(sourceTrack, config = {}) {
530
271
  }
531
272
  const sourceStream = new MediaStream([sourceTrack]);
532
273
  const sourceNode = context.createMediaStreamSource(sourceStream);
533
- let nsNode;
534
- let vadNode;
535
274
  const emitter = (0, import_mitt.default)();
536
- try {
537
- const nsPlugin = getNoiseSuppressionPlugin(
538
- fullConfig.noiseSuppression?.pluginName
539
- );
540
- nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
541
- } catch (error) {
542
- const err = error instanceof Error ? error : new Error(String(error));
543
- console.error("Failed to create noise suppression node:", err);
544
- emitter.emit("error", err);
545
- throw err;
546
- }
547
- const vadStateMachine = new VADStateMachine(fullConfig.vad);
548
- let vadPlugin;
549
- try {
550
- vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
551
- vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
552
- try {
553
- const timestamp = context.currentTime * 1e3;
554
- const newState = vadStateMachine.processFrame(prob, timestamp);
555
- if (vadPlugin && typeof vadPlugin.updateSpeakingState === "function") {
556
- vadPlugin.updateSpeakingState(newState.isSpeaking);
557
- }
558
- if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
559
- emitter.emit("vadChange", newState);
560
- lastVadState = newState;
561
- updateGain(newState);
562
- }
563
- } catch (vadError) {
564
- const err = vadError instanceof Error ? vadError : new Error(String(vadError));
565
- console.error("Error in VAD callback:", err);
566
- emitter.emit("error", err);
275
+ const vad = new LevelBasedVAD(fullConfig.speaking);
276
+ let lastState = { speaking: false, levelDb: -Infinity };
277
+ const nsHandle = await createDeepFilterNet3Node(
278
+ context,
279
+ fullConfig.noiseSuppression
280
+ );
281
+ const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
282
+ try {
283
+ const timestamp = context.currentTime * 1e3;
284
+ const nextState = vad.process(levelDb, timestamp);
285
+ const speakingChanged = nextState.speaking !== lastState.speaking;
286
+ const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
287
+ if (speakingChanged || levelChanged) {
288
+ lastState = nextState;
289
+ updateGain(nextState);
290
+ emitter.emit("speakingChange", nextState);
567
291
  }
568
- });
569
- } catch (error) {
570
- const err = error instanceof Error ? error : new Error(String(error));
571
- console.error("Failed to create VAD node:", err);
572
- emitter.emit("error", err);
573
- throw err;
574
- }
575
- let lastVadState = {
576
- isSpeaking: false,
577
- probability: 0,
578
- state: "silent"
579
- };
292
+ } catch (error) {
293
+ const err = error instanceof Error ? error : new Error(String(error));
294
+ emitter.emit("error", err);
295
+ }
296
+ });
580
297
  const splitter = context.createGain();
581
- sourceNode.connect(nsNode);
582
- nsNode.connect(splitter);
583
- splitter.connect(vadNode);
584
- const delayNode = context.createDelay(1);
585
- const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
586
- delayNode.delayTime.value = preRollSeconds;
298
+ sourceNode.connect(nsHandle.node);
299
+ nsHandle.node.connect(splitter);
300
+ splitter.connect(levelHandle.node);
587
301
  const gainNode = context.createGain();
588
302
  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
589
- let compressor = null;
590
- if (fullConfig.output?.enableCompression) {
591
- compressor = context.createDynamicsCompressor();
592
- const comp = fullConfig.output.compression;
593
- compressor.threshold.value = comp.threshold ?? -24;
594
- compressor.ratio.value = comp.ratio ?? 3;
595
- compressor.attack.value = comp.attack ?? 3e-3;
596
- compressor.release.value = comp.release ?? 0.05;
597
- compressor.knee.value = 10;
598
- }
303
+ splitter.connect(gainNode);
599
304
  const destination = context.createMediaStreamDestination();
600
- try {
601
- splitter.connect(delayNode);
602
- delayNode.connect(gainNode);
603
- if (compressor) {
604
- gainNode.connect(compressor);
605
- compressor.connect(destination);
606
- console.log("Compression enabled:", fullConfig.output?.compression);
607
- } else {
608
- gainNode.connect(destination);
609
- }
610
- } catch (error) {
611
- const err = error instanceof Error ? error : new Error(String(error));
612
- console.error("Failed to wire audio pipeline:", err);
613
- emitter.emit("error", err);
614
- throw err;
615
- }
305
+ gainNode.connect(destination);
616
306
  function updateGain(state) {
617
- try {
618
- const {
619
- speechGain = 1,
620
- silenceGain = 0,
621
- gainRampTime = 0.015,
622
- smoothTransitions = true,
623
- maxGainDb = 6
624
- } = fullConfig.output;
625
- const maxGainLinear = Math.pow(10, maxGainDb / 20);
626
- const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
627
- const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
628
- const now = context.currentTime;
629
- if (smoothTransitions) {
630
- gainNode.gain.cancelScheduledValues(now);
631
- gainNode.gain.setValueAtTime(gainNode.gain.value, now);
632
- gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
633
- } else {
634
- gainNode.gain.setValueAtTime(targetGain, now);
635
- }
636
- } catch (error) {
637
- const err = error instanceof Error ? error : new Error(String(error));
638
- console.error("Failed to update gain:", err);
639
- emitter.emit("error", err);
307
+ const {
308
+ speechGain = 1,
309
+ silenceGain = 0,
310
+ gainRampTime = 0.015,
311
+ smoothTransitions = true,
312
+ maxGainDb = 6
313
+ } = fullConfig.output ?? {};
314
+ const maxGainLinear = Math.pow(10, maxGainDb / 20);
315
+ const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
316
+ const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
317
+ const now = context.currentTime;
318
+ gainNode.gain.cancelScheduledValues(now);
319
+ gainNode.gain.setValueAtTime(gainNode.gain.value, now);
320
+ if (smoothTransitions) {
321
+ gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
322
+ } else {
323
+ gainNode.gain.setValueAtTime(target, now);
640
324
  }
641
325
  }
642
326
  const audioTracks = destination.stream.getAudioTracks();
643
- console.log("Destination stream tracks:", {
644
- count: audioTracks.length,
645
- tracks: audioTracks.map((t) => ({
646
- id: t.id,
647
- label: t.label,
648
- enabled: t.enabled,
649
- readyState: t.readyState
650
- }))
651
- });
652
327
  if (audioTracks.length === 0) {
653
- const err = new Error(
654
- "Failed to create processed audio track: destination stream has no audio tracks. This may indicate an issue with the audio graph connection."
655
- );
656
- console.error(err);
657
- emitter.emit("error", err);
658
- throw err;
328
+ nsHandle.dispose();
329
+ levelHandle.dispose();
330
+ unregisterPipeline();
331
+ throw new Error("Failed to create processed audio track");
659
332
  }
660
333
  const processedTrack = audioTracks[0];
661
- if (!processedTrack || processedTrack.readyState === "ended") {
662
- const err = new Error("Processed audio track is invalid or ended");
663
- console.error(err);
664
- emitter.emit("error", err);
665
- throw err;
666
- }
667
- console.log("Audio pipeline created successfully:", {
668
- sourceTrack: {
669
- id: sourceTrack.id,
670
- label: sourceTrack.label,
671
- readyState: sourceTrack.readyState
672
- },
673
- processedTrack: {
674
- id: processedTrack.id,
675
- label: processedTrack.label,
676
- readyState: processedTrack.readyState
677
- },
678
- config: {
679
- noiseSuppression: fullConfig.noiseSuppression?.enabled,
680
- vad: fullConfig.vad?.enabled
681
- }
682
- });
683
334
  function dispose() {
684
335
  try {
685
336
  sourceNode.disconnect();
686
- nsNode.disconnect();
337
+ nsHandle.node.disconnect();
687
338
  splitter.disconnect();
688
- vadNode.disconnect();
689
- delayNode.disconnect();
339
+ levelHandle.node.disconnect();
690
340
  gainNode.disconnect();
691
- if (compressor) {
692
- compressor.disconnect();
693
- }
694
341
  destination.stream.getTracks().forEach((t) => t.stop());
695
- unregisterPipeline();
342
+ levelHandle.dispose();
343
+ nsHandle.dispose();
696
344
  } catch (error) {
697
- console.error("Error during pipeline disposal:", error);
345
+ console.error("Error during pipeline disposal", error);
346
+ } finally {
347
+ unregisterPipeline();
698
348
  }
699
349
  }
700
- return {
350
+ const handle = {
701
351
  processedTrack,
702
352
  events: emitter,
703
353
  get state() {
704
- return lastVadState;
354
+ return lastState;
705
355
  },
706
- setConfig: (newConfig) => {
356
+ setConfig: (next) => {
707
357
  try {
708
- if (newConfig.vad) {
709
- vadStateMachine.updateConfig(newConfig.vad);
710
- Object.assign(fullConfig.vad, newConfig.vad);
711
- if (newConfig.vad.preRollMs !== void 0) {
712
- const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
713
- delayNode.delayTime.setValueAtTime(
714
- preRollSeconds2,
715
- context.currentTime
716
- );
717
- }
358
+ if (next.speaking) {
359
+ vad.updateConfig(next.speaking);
360
+ fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
361
+ }
362
+ if (next.output) {
363
+ fullConfig.output = { ...fullConfig.output, ...next.output };
364
+ updateGain(lastState);
718
365
  }
719
- if (newConfig.output) {
720
- Object.assign(fullConfig.output, newConfig.output);
721
- updateGain(lastVadState);
722
- if (compressor && newConfig.output.compression) {
723
- const comp = newConfig.output.compression;
724
- if (comp.threshold !== void 0) {
725
- compressor.threshold.setValueAtTime(
726
- comp.threshold,
727
- context.currentTime
728
- );
729
- }
730
- if (comp.ratio !== void 0) {
731
- compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
732
- }
733
- if (comp.attack !== void 0) {
734
- compressor.attack.setValueAtTime(
735
- comp.attack,
736
- context.currentTime
737
- );
738
- }
739
- if (comp.release !== void 0) {
740
- compressor.release.setValueAtTime(
741
- comp.release,
742
- context.currentTime
743
- );
744
- }
366
+ if (next.noiseSuppression) {
367
+ const ns = next.noiseSuppression;
368
+ fullConfig.noiseSuppression = {
369
+ ...fullConfig.noiseSuppression,
370
+ ...ns
371
+ };
372
+ if (typeof ns.noiseReductionLevel === "number") {
373
+ nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
374
+ }
375
+ if (typeof ns.enabled === "boolean") {
376
+ nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
745
377
  }
746
378
  }
747
- if (newConfig.livekit) {
748
- Object.assign(fullConfig.livekit, newConfig.livekit);
379
+ if (typeof next.muteWhenSilent === "boolean") {
380
+ fullConfig.muteWhenSilent = next.muteWhenSilent;
749
381
  }
750
- console.log("Pipeline config updated:", newConfig);
751
382
  } catch (error) {
752
383
  const err = error instanceof Error ? error : new Error(String(error));
753
- console.error("Failed to update config:", err);
754
384
  emitter.emit("error", err);
755
385
  }
756
386
  },
757
387
  dispose
758
388
  };
389
+ return handle;
759
390
  }
760
391
 
761
392
  // src/livekit/integration.ts
762
- async function attachProcessingToTrack(track, config = {}) {
393
+ async function attachSpeakingDetectionToTrack(track, options = {}) {
763
394
  if (!track) {
764
- throw new Error("attachProcessingToTrack requires a valid LocalAudioTrack");
765
- }
766
- const originalTrack = track.mediaStreamTrack;
767
- if (!originalTrack) {
768
- throw new Error("LocalAudioTrack has no underlying MediaStreamTrack");
769
- }
770
- if (originalTrack.readyState === "ended") {
771
- throw new Error("Cannot attach processing to an ended MediaStreamTrack");
772
- }
773
- let pipeline;
774
- try {
775
- console.log("Creating audio processing pipeline...");
776
- pipeline = await createAudioPipeline(originalTrack, config);
777
- console.log("Audio processing pipeline created successfully");
778
- } catch (error) {
779
- const err = new Error(
780
- `Failed to create audio pipeline: ${error instanceof Error ? error.message : String(error)}`
781
- );
782
- console.error(err);
783
- throw err;
784
- }
785
- if (!pipeline.processedTrack) {
786
- throw new Error("Pipeline did not return a processed track");
787
- }
788
- try {
789
- console.log("Replacing LiveKit track with processed track...");
790
- await track.replaceTrack(pipeline.processedTrack);
791
- console.log("LiveKit track replaced successfully");
792
- } catch (error) {
793
- pipeline.dispose();
794
- const err = new Error(
795
- `Failed to replace LiveKit track: ${error instanceof Error ? error.message : String(error)}`
395
+ throw new Error(
396
+ "attachSpeakingDetectionToTrack requires a valid LocalAudioTrack"
796
397
  );
797
- console.error(err);
798
- throw err;
799
398
  }
800
- if (config.livekit?.manageTrackMute) {
801
- let isVadMuted = false;
802
- pipeline.events.on("vadChange", async (state) => {
803
- try {
804
- if (state.isSpeaking) {
805
- if (isVadMuted) {
806
- await track.unmute();
807
- isVadMuted = false;
808
- }
809
- } else {
810
- if (!track.isMuted) {
811
- await track.mute();
812
- isVadMuted = true;
813
- }
814
- }
815
- } catch (error) {
816
- console.error("Error handling VAD-based track muting:", error);
399
+ const originalTrack = track.mediaStreamTrack;
400
+ if (!originalTrack || originalTrack.readyState === "ended") {
401
+ throw new Error("LocalAudioTrack has no live MediaStreamTrack to process");
402
+ }
403
+ const pipeline = await createAudioPipeline(originalTrack, options);
404
+ await track.replaceTrack(pipeline.processedTrack);
405
+ const listeners = /* @__PURE__ */ new Set();
406
+ let mutedByController = false;
407
+ let currentState = pipeline.state;
408
+ const speakingHandler = (state) => {
409
+ currentState = state;
410
+ listeners.forEach((listener) => listener(state));
411
+ if (options.muteWhenSilent) {
412
+ if (!state.speaking && !track.isMuted) {
413
+ track.mute().catch((error) => console.error("mute failed", error));
414
+ mutedByController = true;
817
415
  }
818
- });
819
- }
820
- pipeline.events.on("error", (error) => {
821
- console.error("Audio pipeline error:", error);
822
- });
823
- const originalDispose = pipeline.dispose;
824
- pipeline.dispose = () => {
825
- try {
416
+ if (state.speaking && mutedByController) {
417
+ track.unmute().catch((error) => console.error("unmute failed", error));
418
+ mutedByController = false;
419
+ }
420
+ }
421
+ };
422
+ pipeline.events.on("speakingChange", speakingHandler);
423
+ const errorHandler = (error) => {
424
+ console.error("Audio pipeline error", error);
425
+ };
426
+ pipeline.events.on("error", errorHandler);
427
+ const controller = {
428
+ get speaking() {
429
+ return currentState.speaking;
430
+ },
431
+ get levelDb() {
432
+ return currentState.levelDb;
433
+ },
434
+ onChange: (listener) => {
435
+ listeners.add(listener);
436
+ listener(currentState);
437
+ return () => listeners.delete(listener);
438
+ },
439
+ setConfig: (config) => {
440
+ pipeline.setConfig(config);
441
+ if (typeof config.muteWhenSilent === "boolean") {
442
+ options.muteWhenSilent = config.muteWhenSilent;
443
+ }
444
+ },
445
+ dispose: () => {
446
+ pipeline.events.off("speakingChange", speakingHandler);
447
+ pipeline.events.off("error", errorHandler);
448
+ listeners.clear();
449
+ if (mutedByController && !track.isMuted) {
450
+ track.unmute().catch((error) => console.error("unmute failed", error));
451
+ mutedByController = false;
452
+ }
453
+ pipeline.dispose();
826
454
  if (originalTrack.readyState === "live") {
827
- console.log("Restoring original track...");
828
455
  track.replaceTrack(originalTrack).catch((error) => {
829
- console.error("Failed to restore original track:", error);
456
+ console.error("Failed to restore original track", error);
830
457
  });
831
458
  }
832
- originalDispose();
833
- } catch (error) {
834
- console.error("Error during pipeline disposal:", error);
835
- try {
836
- originalDispose();
837
- } catch (disposeError) {
838
- console.error("Error calling original dispose:", disposeError);
839
- }
840
459
  }
841
460
  };
842
- return pipeline;
461
+ return controller;
843
462
  }
844
463
  // Annotate the CommonJS export names for ESM import in node:
845
464
  0 && (module.exports = {
846
- EnergyVADPlugin,
847
- RNNoisePlugin,
848
- VADStateMachine,
849
- attachProcessingToTrack,
850
- closeAudioContext,
851
- createAudioPipeline,
852
- getAudioContext,
853
- getNoiseSuppressionPlugin,
854
- getVADPlugin,
855
- registerNoiseSuppressionPlugin,
856
- registerPipeline,
857
- registerVADPlugin,
858
- resumeAudioContext,
859
- suspendAudioContext,
860
- unregisterPipeline
465
+ attachSpeakingDetectionToTrack
861
466
  });