@tensamin/audio 0.1.6 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -101,19 +101,20 @@ vad: {
101
101
  energyVad?: {
102
102
  smoothing: number; // Default: 0.95
103
103
  initialNoiseFloor: number; // Default: 0.001
104
- noiseFloorAdaptRateQuiet: number; // Default: 0.01
105
- noiseFloorAdaptRateLoud: number; // Default: 0.001
106
- minSNR: number; // Default: 2.0
107
- snrRange: number; // Default: 8.0
104
+ noiseFloorAdaptRateQuiet: number; // Default: 0.05
105
+ noiseFloorAdaptRateLoud: number; // Default: 0.01
106
+ minSNR: number; // Default: 10.0 (dB)
107
+ snrRange: number; // Default: 10.0 (dB)
108
+ minEnergy: number; // Default: 0.0005
108
109
  };
109
110
  }
110
111
  ```
111
112
 
112
113
  **Threshold Parameters:**
113
114
 
114
- - `startThreshold`: Probability threshold to unmute audio
115
- - `stopThreshold`: Probability threshold to mute audio (after hangover)
116
- - `hangoverMs`: Delay before muting after speech stops
115
+ - `startThreshold`: Probability threshold to unmute audio (Default: 0.8, which corresponds to ~18dB SNR with the default `minSNR` and `snrRange`)
116
+ - `stopThreshold`: Probability threshold to mute audio (Default: 0.3, which corresponds to ~13dB SNR with the default `minSNR` and `snrRange`)
117
+ - `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
117
118
  - `preRollMs`: Audio buffer duration before speech onset
118
119
  - `minSpeechDurationMs`: Minimum duration to consider as valid speech
119
120
  - `minSilenceDurationMs`: Minimum silence duration between speech segments
@@ -121,8 +122,9 @@ vad: {
121
122
  **Energy VAD Parameters:**
122
123
 
123
124
  - `smoothing`: Energy calculation smoothing factor (0-1)
124
- - `minSNR`: Minimum signal-to-noise ratio for speech detection
125
- - `snrRange`: Range for probability scaling from minSNR
125
+ - `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
126
+ - `snrRange`: Width in dB of the range above `minSNR` over which the speech probability scales linearly from 0 to 1
127
+ - `minEnergy`: Minimum absolute (smoothed) RMS energy to consider as speech; below it the speech probability is forced to 0
126
128
 
127
129
  ### Output Control
128
130
 
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "./chunk-N553RHTI.mjs";
3
+ } from "./chunk-XHMNP7NC.mjs";
4
4
  import {
5
5
  getAudioContext,
6
6
  registerPipeline,
@@ -9,7 +9,7 @@ import {
9
9
  import {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin
12
- } from "./chunk-ZCC7ID7L.mjs";
12
+ } from "./chunk-FOGC2MFA.mjs";
13
13
 
14
14
  // src/pipeline/audio-pipeline.ts
15
15
  import mitt from "mitt";
@@ -3,10 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
3
3
  const energyParams = vadConfig?.energyVad || {};
4
4
  const smoothing = energyParams.smoothing ?? 0.95;
5
5
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
6
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
7
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-4;
8
- const minSNR = energyParams.minSNR ?? 2;
9
- const snrRange = energyParams.snrRange ?? 8;
6
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
7
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
8
+ const minSNR = energyParams.minSNR ?? 10;
9
+ const snrRange = energyParams.snrRange ?? 10;
10
+ const minEnergy = energyParams.minEnergy ?? 5e-4;
10
11
  return `
11
12
  class EnergyVadProcessor extends AudioWorkletProcessor {
12
13
  constructor() {
@@ -18,6 +19,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
18
19
  this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
19
20
  this.minSNR = ${minSNR};
20
21
  this.snrRange = ${snrRange};
22
+ this.minEnergy = ${minEnergy};
21
23
  this.isSpeaking = false;
22
24
 
23
25
  this.port.onmessage = (event) => {
@@ -32,44 +34,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
32
34
  if (!input || !input.length) return true;
33
35
  const channel = input[0];
34
36
 
35
- // Calculate RMS (Root Mean Square) energy
37
+ // Calculate instantaneous RMS (Root Mean Square) energy
36
38
  let sum = 0;
37
39
  for (let i = 0; i < channel.length; i++) {
38
40
  sum += channel[i] * channel[i];
39
41
  }
40
- const rms = Math.sqrt(sum / channel.length);
42
+ const instantRms = Math.sqrt(sum / channel.length);
41
43
 
42
- // Adaptive noise floor estimation - ONLY during silence
43
- // This prevents the noise floor from rising during speech
44
- if (!this.isSpeaking) {
45
- if (rms < this.noiseFloor) {
46
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
47
- } else {
48
- // Even during silence, if we detect a loud signal, adapt very slowly
49
- // This could be brief noise we haven't classified as speech yet
50
-
51
- // SAFEGUARD: If the signal is significantly louder than the noise floor (e.g. > 3x),
52
- // assume it's unclassified speech or a sudden loud noise and DO NOT adapt.
53
- // This prevents the noise floor from "chasing" the speech level during brief pauses or onsets.
54
- const instantSnr = rms / (this.noiseFloor + 1e-6);
55
-
56
- if (instantSnr < 3.0) {
57
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
58
- }
44
+ // Smooth the RMS energy to reduce jitter
45
+ // this.energy acts as the smoothed RMS value
46
+ this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
47
+
48
+ // Adaptive noise floor estimation
49
+ // We use the instantaneous RMS for noise floor tracking to react quickly to silence
50
+ if (instantRms < this.noiseFloor) {
51
+ // If signal is quieter than noise floor, adapt downwards quickly
52
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
53
+ } else {
54
+ // If signal is louder, adapt upwards
55
+ // We use a multi-stage adaptation rate:
56
+ // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
57
+ // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
58
+ // 3. Otherwise, adapt at the normal loud rate
59
+ const snr = instantRms / (this.noiseFloor + 1e-6);
60
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
61
+
62
+ let multiplier = 1.0;
63
+ if (this.isSpeaking) {
64
+ multiplier = 0.01;
65
+ } else if (snrDb > 20) {
66
+ multiplier = 0.1;
59
67
  }
68
+
69
+ const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
70
+ this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
60
71
  }
61
- // During speech, freeze the noise floor to maintain consistent detection
62
72
 
63
- // Calculate Signal-to-Noise Ratio (SNR)
64
- const snr = rms / (this.noiseFloor + 1e-6);
73
+ // Ensure noise floor doesn't drop to absolute zero
74
+ this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
75
+
76
+ // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
77
+ const snr = this.energy / (this.noiseFloor + 1e-6);
78
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
65
79
 
66
- // Map SNR to probability (0-1)
67
- // Probability is 0 when SNR <= minSNR
80
+ // Map SNR dB to probability (0-1)
81
+ // Probability is 0 when snrDb <= minSNR
68
82
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
69
- // Probability is 1 when SNR >= (minSNR + snrRange)
70
- const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
83
+ let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
84
+
85
+ // Apply absolute energy threshold
86
+ if (this.energy < this.minEnergy) {
87
+ probability = 0;
88
+ }
71
89
 
72
- this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
90
+ this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
73
91
 
74
92
  return true;
75
93
  }
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-XO6B3D4A.mjs";
4
4
  import {
5
5
  EnergyVADPlugin
6
- } from "./chunk-TLPO52HV.mjs";
6
+ } from "./chunk-3A2CTC4K.mjs";
7
7
 
8
8
  // src/extensibility/plugins.ts
9
9
  var nsPlugins = /* @__PURE__ */ new Map();
@@ -12,11 +12,11 @@ var VADStateMachine = class {
12
12
  enabled: config?.enabled ?? true,
13
13
  pluginName: config?.pluginName ?? "energy-vad",
14
14
  // Voice-optimized defaults
15
- startThreshold: config?.startThreshold ?? 0.6,
15
+ startThreshold: config?.startThreshold ?? 0.8,
16
16
  // Higher threshold to avoid noise
17
- stopThreshold: config?.stopThreshold ?? 0.45,
17
+ stopThreshold: config?.stopThreshold ?? 0.3,
18
18
  // Balanced for voice
19
- hangoverMs: config?.hangoverMs ?? 400,
19
+ hangoverMs: config?.hangoverMs ?? 300,
20
20
  // Smooth for natural speech
21
21
  preRollMs: config?.preRollMs ?? 250,
22
22
  // Generous pre-roll
@@ -25,10 +25,11 @@ var VADStateMachine = class {
25
25
  energyVad: {
26
26
  smoothing: config?.energyVad?.smoothing ?? 0.95,
27
27
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
28
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
29
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
30
- minSNR: config?.energyVad?.minSNR ?? 2,
31
- snrRange: config?.energyVad?.snrRange ?? 8
28
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
29
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
30
+ minSNR: config?.energyVad?.minSNR ?? 10,
31
+ snrRange: config?.energyVad?.snrRange ?? 10,
32
+ minEnergy: config?.energyVad?.minEnergy ?? 5e-4
32
33
  }
33
34
  };
34
35
  this.lastSilenceTime = Date.now();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "./chunk-DF4AYGHJ.mjs";
3
+ } from "./chunk-2UPI6VWY.mjs";
4
4
 
5
5
  // src/livekit/integration.ts
6
6
  async function attachProcessingToTrack(track, config = {}) {
@@ -106,10 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
106
106
  const energyParams = vadConfig?.energyVad || {};
107
107
  const smoothing = energyParams.smoothing ?? 0.95;
108
108
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
109
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
110
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-4;
111
- const minSNR = energyParams.minSNR ?? 2;
112
- const snrRange = energyParams.snrRange ?? 8;
109
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
110
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
111
+ const minSNR = energyParams.minSNR ?? 10;
112
+ const snrRange = energyParams.snrRange ?? 10;
113
+ const minEnergy = energyParams.minEnergy ?? 5e-4;
113
114
  return `
114
115
  class EnergyVadProcessor extends AudioWorkletProcessor {
115
116
  constructor() {
@@ -121,6 +122,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
121
122
  this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
122
123
  this.minSNR = ${minSNR};
123
124
  this.snrRange = ${snrRange};
125
+ this.minEnergy = ${minEnergy};
124
126
  this.isSpeaking = false;
125
127
 
126
128
  this.port.onmessage = (event) => {
@@ -135,44 +137,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
135
137
  if (!input || !input.length) return true;
136
138
  const channel = input[0];
137
139
 
138
- // Calculate RMS (Root Mean Square) energy
140
+ // Calculate instantaneous RMS (Root Mean Square) energy
139
141
  let sum = 0;
140
142
  for (let i = 0; i < channel.length; i++) {
141
143
  sum += channel[i] * channel[i];
142
144
  }
143
- const rms = Math.sqrt(sum / channel.length);
145
+ const instantRms = Math.sqrt(sum / channel.length);
144
146
 
145
- // Adaptive noise floor estimation - ONLY during silence
146
- // This prevents the noise floor from rising during speech
147
- if (!this.isSpeaking) {
148
- if (rms < this.noiseFloor) {
149
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
150
- } else {
151
- // Even during silence, if we detect a loud signal, adapt very slowly
152
- // This could be brief noise we haven't classified as speech yet
153
-
154
- // SAFEGUARD: If the signal is significantly louder than the noise floor (e.g. > 3x),
155
- // assume it's unclassified speech or a sudden loud noise and DO NOT adapt.
156
- // This prevents the noise floor from "chasing" the speech level during brief pauses or onsets.
157
- const instantSnr = rms / (this.noiseFloor + 1e-6);
158
-
159
- if (instantSnr < 3.0) {
160
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
161
- }
147
+ // Smooth the RMS energy to reduce jitter
148
+ // this.energy acts as the smoothed RMS value
149
+ this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
150
+
151
+ // Adaptive noise floor estimation
152
+ // We use the instantaneous RMS for noise floor tracking to react quickly to silence
153
+ if (instantRms < this.noiseFloor) {
154
+ // If signal is quieter than noise floor, adapt downwards quickly
155
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
156
+ } else {
157
+ // If signal is louder, adapt upwards
158
+ // We use a multi-stage adaptation rate:
159
+ // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
160
+ // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
161
+ // 3. Otherwise, adapt at the normal loud rate
162
+ const snr = instantRms / (this.noiseFloor + 1e-6);
163
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
164
+
165
+ let multiplier = 1.0;
166
+ if (this.isSpeaking) {
167
+ multiplier = 0.01;
168
+ } else if (snrDb > 20) {
169
+ multiplier = 0.1;
162
170
  }
171
+
172
+ const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
173
+ this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
163
174
  }
164
- // During speech, freeze the noise floor to maintain consistent detection
165
175
 
166
- // Calculate Signal-to-Noise Ratio (SNR)
167
- const snr = rms / (this.noiseFloor + 1e-6);
176
+ // Ensure noise floor doesn't drop to absolute zero
177
+ this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
178
+
179
+ // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
180
+ const snr = this.energy / (this.noiseFloor + 1e-6);
181
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
168
182
 
169
- // Map SNR to probability (0-1)
170
- // Probability is 0 when SNR <= minSNR
183
+ // Map SNR dB to probability (0-1)
184
+ // Probability is 0 when snrDb <= minSNR
171
185
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
172
- // Probability is 1 when SNR >= (minSNR + snrRange)
173
- const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
186
+ let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
187
+
188
+ // Apply absolute energy threshold
189
+ if (this.energy < this.minEnergy) {
190
+ probability = 0;
191
+ }
174
192
 
175
- this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
193
+ this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
176
194
 
177
195
  return true;
178
196
  }
@@ -3,9 +3,9 @@ import {
3
3
  getVADPlugin,
4
4
  registerNoiseSuppressionPlugin,
5
5
  registerVADPlugin
6
- } from "../chunk-ZCC7ID7L.mjs";
6
+ } from "../chunk-FOGC2MFA.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-TLPO52HV.mjs";
8
+ import "../chunk-3A2CTC4K.mjs";
9
9
  export {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin,
package/dist/index.js CHANGED
@@ -158,10 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
158
158
  const energyParams = vadConfig?.energyVad || {};
159
159
  const smoothing = energyParams.smoothing ?? 0.95;
160
160
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
161
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
162
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-4;
163
- const minSNR = energyParams.minSNR ?? 2;
164
- const snrRange = energyParams.snrRange ?? 8;
161
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
162
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
163
+ const minSNR = energyParams.minSNR ?? 10;
164
+ const snrRange = energyParams.snrRange ?? 10;
165
+ const minEnergy = energyParams.minEnergy ?? 5e-4;
165
166
  return `
166
167
  class EnergyVadProcessor extends AudioWorkletProcessor {
167
168
  constructor() {
@@ -173,6 +174,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
173
174
  this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
174
175
  this.minSNR = ${minSNR};
175
176
  this.snrRange = ${snrRange};
177
+ this.minEnergy = ${minEnergy};
176
178
  this.isSpeaking = false;
177
179
 
178
180
  this.port.onmessage = (event) => {
@@ -187,44 +189,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
187
189
  if (!input || !input.length) return true;
188
190
  const channel = input[0];
189
191
 
190
- // Calculate RMS (Root Mean Square) energy
192
+ // Calculate instantaneous RMS (Root Mean Square) energy
191
193
  let sum = 0;
192
194
  for (let i = 0; i < channel.length; i++) {
193
195
  sum += channel[i] * channel[i];
194
196
  }
195
- const rms = Math.sqrt(sum / channel.length);
197
+ const instantRms = Math.sqrt(sum / channel.length);
196
198
 
197
- // Adaptive noise floor estimation - ONLY during silence
198
- // This prevents the noise floor from rising during speech
199
- if (!this.isSpeaking) {
200
- if (rms < this.noiseFloor) {
201
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
202
- } else {
203
- // Even during silence, if we detect a loud signal, adapt very slowly
204
- // This could be brief noise we haven't classified as speech yet
205
-
206
- // SAFEGUARD: If the signal is significantly louder than the noise floor (e.g. > 3x),
207
- // assume it's unclassified speech or a sudden loud noise and DO NOT adapt.
208
- // This prevents the noise floor from "chasing" the speech level during brief pauses or onsets.
209
- const instantSnr = rms / (this.noiseFloor + 1e-6);
210
-
211
- if (instantSnr < 3.0) {
212
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
213
- }
199
+ // Smooth the RMS energy to reduce jitter
200
+ // this.energy acts as the smoothed RMS value
201
+ this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
202
+
203
+ // Adaptive noise floor estimation
204
+ // We use the instantaneous RMS for noise floor tracking to react quickly to silence
205
+ if (instantRms < this.noiseFloor) {
206
+ // If signal is quieter than noise floor, adapt downwards quickly
207
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
208
+ } else {
209
+ // If signal is louder, adapt upwards
210
+ // We use a multi-stage adaptation rate:
211
+ // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
212
+ // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
213
+ // 3. Otherwise, adapt at the normal loud rate
214
+ const snr = instantRms / (this.noiseFloor + 1e-6);
215
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
216
+
217
+ let multiplier = 1.0;
218
+ if (this.isSpeaking) {
219
+ multiplier = 0.01;
220
+ } else if (snrDb > 20) {
221
+ multiplier = 0.1;
214
222
  }
223
+
224
+ const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
225
+ this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
215
226
  }
216
- // During speech, freeze the noise floor to maintain consistent detection
217
227
 
218
- // Calculate Signal-to-Noise Ratio (SNR)
219
- const snr = rms / (this.noiseFloor + 1e-6);
228
+ // Ensure noise floor doesn't drop to absolute zero
229
+ this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
230
+
231
+ // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
232
+ const snr = this.energy / (this.noiseFloor + 1e-6);
233
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
220
234
 
221
- // Map SNR to probability (0-1)
222
- // Probability is 0 when SNR <= minSNR
235
+ // Map SNR dB to probability (0-1)
236
+ // Probability is 0 when snrDb <= minSNR
223
237
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
224
- // Probability is 1 when SNR >= (minSNR + snrRange)
225
- const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
238
+ let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
239
+
240
+ // Apply absolute energy threshold
241
+ if (this.energy < this.minEnergy) {
242
+ probability = 0;
243
+ }
226
244
 
227
- this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
245
+ this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
228
246
 
229
247
  return true;
230
248
  }
@@ -342,11 +360,11 @@ var VADStateMachine = class {
342
360
  enabled: config?.enabled ?? true,
343
361
  pluginName: config?.pluginName ?? "energy-vad",
344
362
  // Voice-optimized defaults
345
- startThreshold: config?.startThreshold ?? 0.6,
363
+ startThreshold: config?.startThreshold ?? 0.8,
346
364
  // Higher threshold to avoid noise
347
- stopThreshold: config?.stopThreshold ?? 0.45,
365
+ stopThreshold: config?.stopThreshold ?? 0.3,
348
366
  // Balanced for voice
349
- hangoverMs: config?.hangoverMs ?? 400,
367
+ hangoverMs: config?.hangoverMs ?? 300,
350
368
  // Smooth for natural speech
351
369
  preRollMs: config?.preRollMs ?? 250,
352
370
  // Generous pre-roll
@@ -355,10 +373,11 @@ var VADStateMachine = class {
355
373
  energyVad: {
356
374
  smoothing: config?.energyVad?.smoothing ?? 0.95,
357
375
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
358
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
359
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
360
- minSNR: config?.energyVad?.minSNR ?? 2,
361
- snrRange: config?.energyVad?.snrRange ?? 8
376
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
377
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
378
+ minSNR: config?.energyVad?.minSNR ?? 10,
379
+ snrRange: config?.energyVad?.snrRange ?? 10,
380
+ minEnergy: config?.energyVad?.minEnergy ?? 5e-4
362
381
  }
363
382
  };
364
383
  this.lastSilenceTime = Date.now();
package/dist/index.mjs CHANGED
@@ -1,13 +1,13 @@
1
1
  import "./chunk-WBQAMGXK.mjs";
2
2
  import {
3
3
  attachProcessingToTrack
4
- } from "./chunk-TWQJGBBU.mjs";
4
+ } from "./chunk-Y6IG7XGC.mjs";
5
5
  import {
6
6
  createAudioPipeline
7
- } from "./chunk-DF4AYGHJ.mjs";
7
+ } from "./chunk-2UPI6VWY.mjs";
8
8
  import {
9
9
  VADStateMachine
10
- } from "./chunk-N553RHTI.mjs";
10
+ } from "./chunk-XHMNP7NC.mjs";
11
11
  import {
12
12
  closeAudioContext,
13
13
  getAudioContext,
@@ -21,13 +21,13 @@ import {
21
21
  getVADPlugin,
22
22
  registerNoiseSuppressionPlugin,
23
23
  registerVADPlugin
24
- } from "./chunk-ZCC7ID7L.mjs";
24
+ } from "./chunk-FOGC2MFA.mjs";
25
25
  import {
26
26
  RNNoisePlugin
27
27
  } from "./chunk-XO6B3D4A.mjs";
28
28
  import {
29
29
  EnergyVADPlugin
30
- } from "./chunk-TLPO52HV.mjs";
30
+ } from "./chunk-3A2CTC4K.mjs";
31
31
  export {
32
32
  EnergyVADPlugin,
33
33
  RNNoisePlugin,
@@ -127,10 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
127
127
  const energyParams = vadConfig?.energyVad || {};
128
128
  const smoothing = energyParams.smoothing ?? 0.95;
129
129
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
130
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
131
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-4;
132
- const minSNR = energyParams.minSNR ?? 2;
133
- const snrRange = energyParams.snrRange ?? 8;
130
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
131
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
132
+ const minSNR = energyParams.minSNR ?? 10;
133
+ const snrRange = energyParams.snrRange ?? 10;
134
+ const minEnergy = energyParams.minEnergy ?? 5e-4;
134
135
  return `
135
136
  class EnergyVadProcessor extends AudioWorkletProcessor {
136
137
  constructor() {
@@ -142,6 +143,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
142
143
  this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
143
144
  this.minSNR = ${minSNR};
144
145
  this.snrRange = ${snrRange};
146
+ this.minEnergy = ${minEnergy};
145
147
  this.isSpeaking = false;
146
148
 
147
149
  this.port.onmessage = (event) => {
@@ -156,44 +158,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
156
158
  if (!input || !input.length) return true;
157
159
  const channel = input[0];
158
160
 
159
- // Calculate RMS (Root Mean Square) energy
161
+ // Calculate instantaneous RMS (Root Mean Square) energy
160
162
  let sum = 0;
161
163
  for (let i = 0; i < channel.length; i++) {
162
164
  sum += channel[i] * channel[i];
163
165
  }
164
- const rms = Math.sqrt(sum / channel.length);
166
+ const instantRms = Math.sqrt(sum / channel.length);
165
167
 
166
- // Adaptive noise floor estimation - ONLY during silence
167
- // This prevents the noise floor from rising during speech
168
- if (!this.isSpeaking) {
169
- if (rms < this.noiseFloor) {
170
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
171
- } else {
172
- // Even during silence, if we detect a loud signal, adapt very slowly
173
- // This could be brief noise we haven't classified as speech yet
174
-
175
- // SAFEGUARD: If the signal is significantly louder than the noise floor (e.g. > 3x),
176
- // assume it's unclassified speech or a sudden loud noise and DO NOT adapt.
177
- // This prevents the noise floor from "chasing" the speech level during brief pauses or onsets.
178
- const instantSnr = rms / (this.noiseFloor + 1e-6);
179
-
180
- if (instantSnr < 3.0) {
181
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
182
- }
168
+ // Smooth the RMS energy to reduce jitter
169
+ // this.energy acts as the smoothed RMS value
170
+ this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
171
+
172
+ // Adaptive noise floor estimation
173
+ // We use the instantaneous RMS for noise floor tracking to react quickly to silence
174
+ if (instantRms < this.noiseFloor) {
175
+ // If signal is quieter than noise floor, adapt downwards quickly
176
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
177
+ } else {
178
+ // If signal is louder, adapt upwards
179
+ // We use a multi-stage adaptation rate:
180
+ // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
181
+ // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
182
+ // 3. Otherwise, adapt at the normal loud rate
183
+ const snr = instantRms / (this.noiseFloor + 1e-6);
184
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
185
+
186
+ let multiplier = 1.0;
187
+ if (this.isSpeaking) {
188
+ multiplier = 0.01;
189
+ } else if (snrDb > 20) {
190
+ multiplier = 0.1;
183
191
  }
192
+
193
+ const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
194
+ this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
184
195
  }
185
- // During speech, freeze the noise floor to maintain consistent detection
186
196
 
187
- // Calculate Signal-to-Noise Ratio (SNR)
188
- const snr = rms / (this.noiseFloor + 1e-6);
197
+ // Ensure noise floor doesn't drop to absolute zero
198
+ this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
199
+
200
+ // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
201
+ const snr = this.energy / (this.noiseFloor + 1e-6);
202
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
189
203
 
190
- // Map SNR to probability (0-1)
191
- // Probability is 0 when SNR <= minSNR
204
+ // Map SNR dB to probability (0-1)
205
+ // Probability is 0 when snrDb <= minSNR
192
206
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
193
- // Probability is 1 when SNR >= (minSNR + snrRange)
194
- const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
207
+ let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
208
+
209
+ // Apply absolute energy threshold
210
+ if (this.energy < this.minEnergy) {
211
+ probability = 0;
212
+ }
195
213
 
196
- this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
214
+ this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
197
215
 
198
216
  return true;
199
217
  }
@@ -305,11 +323,11 @@ var VADStateMachine = class {
305
323
  enabled: config?.enabled ?? true,
306
324
  pluginName: config?.pluginName ?? "energy-vad",
307
325
  // Voice-optimized defaults
308
- startThreshold: config?.startThreshold ?? 0.6,
326
+ startThreshold: config?.startThreshold ?? 0.8,
309
327
  // Higher threshold to avoid noise
310
- stopThreshold: config?.stopThreshold ?? 0.45,
328
+ stopThreshold: config?.stopThreshold ?? 0.3,
311
329
  // Balanced for voice
312
- hangoverMs: config?.hangoverMs ?? 400,
330
+ hangoverMs: config?.hangoverMs ?? 300,
313
331
  // Smooth for natural speech
314
332
  preRollMs: config?.preRollMs ?? 250,
315
333
  // Generous pre-roll
@@ -318,10 +336,11 @@ var VADStateMachine = class {
318
336
  energyVad: {
319
337
  smoothing: config?.energyVad?.smoothing ?? 0.95,
320
338
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
321
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
322
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
323
- minSNR: config?.energyVad?.minSNR ?? 2,
324
- snrRange: config?.energyVad?.snrRange ?? 8
339
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
340
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
341
+ minSNR: config?.energyVad?.minSNR ?? 10,
342
+ snrRange: config?.energyVad?.snrRange ?? 10,
343
+ minEnergy: config?.energyVad?.minEnergy ?? 5e-4
325
344
  }
326
345
  };
327
346
  this.lastSilenceTime = Date.now();
@@ -1,12 +1,12 @@
1
1
  import {
2
2
  attachProcessingToTrack
3
- } from "../chunk-TWQJGBBU.mjs";
4
- import "../chunk-DF4AYGHJ.mjs";
5
- import "../chunk-N553RHTI.mjs";
3
+ } from "../chunk-Y6IG7XGC.mjs";
4
+ import "../chunk-2UPI6VWY.mjs";
5
+ import "../chunk-XHMNP7NC.mjs";
6
6
  import "../chunk-OZ7KMC4S.mjs";
7
- import "../chunk-ZCC7ID7L.mjs";
7
+ import "../chunk-FOGC2MFA.mjs";
8
8
  import "../chunk-XO6B3D4A.mjs";
9
- import "../chunk-TLPO52HV.mjs";
9
+ import "../chunk-3A2CTC4K.mjs";
10
10
  export {
11
11
  attachProcessingToTrack
12
12
  };
@@ -125,10 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
125
125
  const energyParams = vadConfig?.energyVad || {};
126
126
  const smoothing = energyParams.smoothing ?? 0.95;
127
127
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
128
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
129
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-4;
130
- const minSNR = energyParams.minSNR ?? 2;
131
- const snrRange = energyParams.snrRange ?? 8;
128
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
129
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
130
+ const minSNR = energyParams.minSNR ?? 10;
131
+ const snrRange = energyParams.snrRange ?? 10;
132
+ const minEnergy = energyParams.minEnergy ?? 5e-4;
132
133
  return `
133
134
  class EnergyVadProcessor extends AudioWorkletProcessor {
134
135
  constructor() {
@@ -140,6 +141,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
140
141
  this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
141
142
  this.minSNR = ${minSNR};
142
143
  this.snrRange = ${snrRange};
144
+ this.minEnergy = ${minEnergy};
143
145
  this.isSpeaking = false;
144
146
 
145
147
  this.port.onmessage = (event) => {
@@ -154,44 +156,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
154
156
  if (!input || !input.length) return true;
155
157
  const channel = input[0];
156
158
 
157
- // Calculate RMS (Root Mean Square) energy
159
+ // Calculate instantaneous RMS (Root Mean Square) energy
158
160
  let sum = 0;
159
161
  for (let i = 0; i < channel.length; i++) {
160
162
  sum += channel[i] * channel[i];
161
163
  }
162
- const rms = Math.sqrt(sum / channel.length);
164
+ const instantRms = Math.sqrt(sum / channel.length);
163
165
 
164
- // Adaptive noise floor estimation - ONLY during silence
165
- // This prevents the noise floor from rising during speech
166
- if (!this.isSpeaking) {
167
- if (rms < this.noiseFloor) {
168
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
169
- } else {
170
- // Even during silence, if we detect a loud signal, adapt very slowly
171
- // This could be brief noise we haven't classified as speech yet
172
-
173
- // SAFEGUARD: If the signal is significantly louder than the noise floor (e.g. > 3x),
174
- // assume it's unclassified speech or a sudden loud noise and DO NOT adapt.
175
- // This prevents the noise floor from "chasing" the speech level during brief pauses or onsets.
176
- const instantSnr = rms / (this.noiseFloor + 1e-6);
177
-
178
- if (instantSnr < 3.0) {
179
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
180
- }
166
+ // Smooth the RMS energy to reduce jitter
167
+ // this.energy acts as the smoothed RMS value
168
+ this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
169
+
170
+ // Adaptive noise floor estimation
171
+ // We use the instantaneous RMS for noise floor tracking to react quickly to silence
172
+ if (instantRms < this.noiseFloor) {
173
+ // If signal is quieter than noise floor, adapt downwards quickly
174
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
175
+ } else {
176
+ // If signal is louder, adapt upwards
177
+ // We use a multi-stage adaptation rate:
178
+ // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
179
+ // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
180
+ // 3. Otherwise, adapt at the normal loud rate
181
+ const snr = instantRms / (this.noiseFloor + 1e-6);
182
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
183
+
184
+ let multiplier = 1.0;
185
+ if (this.isSpeaking) {
186
+ multiplier = 0.01;
187
+ } else if (snrDb > 20) {
188
+ multiplier = 0.1;
181
189
  }
190
+
191
+ const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
192
+ this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
182
193
  }
183
- // During speech, freeze the noise floor to maintain consistent detection
184
194
 
185
- // Calculate Signal-to-Noise Ratio (SNR)
186
- const snr = rms / (this.noiseFloor + 1e-6);
195
+ // Ensure noise floor doesn't drop to absolute zero
196
+ this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
197
+
198
+ // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
199
+ const snr = this.energy / (this.noiseFloor + 1e-6);
200
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
187
201
 
188
- // Map SNR to probability (0-1)
189
- // Probability is 0 when SNR <= minSNR
202
+ // Map SNR dB to probability (0-1)
203
+ // Probability is 0 when snrDb <= minSNR
190
204
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
191
- // Probability is 1 when SNR >= (minSNR + snrRange)
192
- const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
205
+ let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
206
+
207
+ // Apply absolute energy threshold
208
+ if (this.energy < this.minEnergy) {
209
+ probability = 0;
210
+ }
193
211
 
194
- this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
212
+ this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
195
213
 
196
214
  return true;
197
215
  }
@@ -303,11 +321,11 @@ var VADStateMachine = class {
303
321
  enabled: config?.enabled ?? true,
304
322
  pluginName: config?.pluginName ?? "energy-vad",
305
323
  // Voice-optimized defaults
306
- startThreshold: config?.startThreshold ?? 0.6,
324
+ startThreshold: config?.startThreshold ?? 0.8,
307
325
  // Higher threshold to avoid noise
308
- stopThreshold: config?.stopThreshold ?? 0.45,
326
+ stopThreshold: config?.stopThreshold ?? 0.3,
309
327
  // Balanced for voice
310
- hangoverMs: config?.hangoverMs ?? 400,
328
+ hangoverMs: config?.hangoverMs ?? 300,
311
329
  // Smooth for natural speech
312
330
  preRollMs: config?.preRollMs ?? 250,
313
331
  // Generous pre-roll
@@ -316,10 +334,11 @@ var VADStateMachine = class {
316
334
  energyVad: {
317
335
  smoothing: config?.energyVad?.smoothing ?? 0.95,
318
336
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
319
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
320
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
321
- minSNR: config?.energyVad?.minSNR ?? 2,
322
- snrRange: config?.energyVad?.snrRange ?? 8
337
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
338
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
339
+ minSNR: config?.energyVad?.minSNR ?? 10,
340
+ snrRange: config?.energyVad?.snrRange ?? 10,
341
+ minEnergy: config?.energyVad?.minEnergy ?? 5e-4
323
342
  }
324
343
  };
325
344
  this.lastSilenceTime = Date.now();
@@ -1,11 +1,11 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "../chunk-DF4AYGHJ.mjs";
4
- import "../chunk-N553RHTI.mjs";
3
+ } from "../chunk-2UPI6VWY.mjs";
4
+ import "../chunk-XHMNP7NC.mjs";
5
5
  import "../chunk-OZ7KMC4S.mjs";
6
- import "../chunk-ZCC7ID7L.mjs";
6
+ import "../chunk-FOGC2MFA.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-TLPO52HV.mjs";
8
+ import "../chunk-3A2CTC4K.mjs";
9
9
  export {
10
10
  createAudioPipeline
11
11
  };
package/dist/types.d.mts CHANGED
@@ -43,7 +43,7 @@ interface AudioProcessingConfig {
43
43
  * When VAD probability rises above this, audio is unmuted.
44
44
  * Lower = more sensitive (catches quiet speech, may include noise)
45
45
  * Higher = less sensitive (only confident speech, may clip quiet parts)
46
- * Default: 0.6 (optimized for voice-only)
46
+ * Default: 0.8 (aggressive noise rejection)
47
47
  */
48
48
  startThreshold?: number;
49
49
  /**
@@ -51,7 +51,7 @@ interface AudioProcessingConfig {
51
51
  * When VAD probability drops below this (after hangover), audio is muted.
52
52
  * Lower = keeps audio on longer (less aggressive gating)
53
53
  * Higher = mutes faster (more aggressive noise suppression)
54
- * Default: 0.45 (balanced voice detection)
54
+ * Default: 0.3 (wide hysteresis for stability)
55
55
  */
56
56
  stopThreshold?: number;
57
57
  /**
@@ -59,7 +59,7 @@ interface AudioProcessingConfig {
59
59
  * Prevents rapid on/off toggling during pauses.
60
60
  * Lower = more aggressive gating, may clip between words
61
61
  * Higher = smoother but may let trailing noise through
62
- * Default: 400ms (optimized for natural speech)
62
+ * Default: 300ms
63
63
  */
64
64
  hangoverMs?: number;
65
65
  /**
@@ -97,24 +97,30 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.01
100
+ * Default: 0.05
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Default: 0.001 (slower adaptation for speech)
105
+ * Default: 0.01 (faster tracking of rising noise)
106
106
  */
107
107
  noiseFloorAdaptRateLoud?: number;
108
108
  /**
109
- * Minimum SNR (Signal-to-Noise Ratio) for speech detection.
110
- * Default: 2.0 (voice is 2x louder than noise floor)
109
+ * Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
110
+ * Default: 10.0 (more aggressive noise rejection)
111
111
  */
112
112
  minSNR?: number;
113
113
  /**
114
- * SNR range for probability scaling.
115
- * Default: 8.0 (probability scales from minSNR to minSNR+snrRange)
114
+ * SNR range in dB for probability scaling.
115
+ * Default: 10.0 (probability scales from minSNR to minSNR+snrRange)
116
116
  */
117
117
  snrRange?: number;
118
+ /**
119
+ * Minimum absolute RMS energy to consider as speech.
120
+ * Prevents triggering on very quiet background noise in silent rooms.
121
+ * Default: 0.0005
122
+ */
123
+ minEnergy?: number;
118
124
  };
119
125
  };
120
126
  /**
package/dist/types.d.ts CHANGED
@@ -43,7 +43,7 @@ interface AudioProcessingConfig {
43
43
  * When VAD probability rises above this, audio is unmuted.
44
44
  * Lower = more sensitive (catches quiet speech, may include noise)
45
45
  * Higher = less sensitive (only confident speech, may clip quiet parts)
46
- * Default: 0.6 (optimized for voice-only)
46
+ * Default: 0.8 (aggressive noise rejection)
47
47
  */
48
48
  startThreshold?: number;
49
49
  /**
@@ -51,7 +51,7 @@ interface AudioProcessingConfig {
51
51
  * When VAD probability drops below this (after hangover), audio is muted.
52
52
  * Lower = keeps audio on longer (less aggressive gating)
53
53
  * Higher = mutes faster (more aggressive noise suppression)
54
- * Default: 0.45 (balanced voice detection)
54
+ * Default: 0.3 (wide hysteresis for stability)
55
55
  */
56
56
  stopThreshold?: number;
57
57
  /**
@@ -59,7 +59,7 @@ interface AudioProcessingConfig {
59
59
  * Prevents rapid on/off toggling during pauses.
60
60
  * Lower = more aggressive gating, may clip between words
61
61
  * Higher = smoother but may let trailing noise through
62
- * Default: 400ms (optimized for natural speech)
62
+ * Default: 300ms
63
63
  */
64
64
  hangoverMs?: number;
65
65
  /**
@@ -97,24 +97,30 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.01
100
+ * Default: 0.05
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Default: 0.001 (slower adaptation for speech)
105
+ * Default: 0.01 (faster tracking of rising noise)
106
106
  */
107
107
  noiseFloorAdaptRateLoud?: number;
108
108
  /**
109
- * Minimum SNR (Signal-to-Noise Ratio) for speech detection.
110
- * Default: 2.0 (voice is 2x louder than noise floor)
109
+ * Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
110
+ * Default: 10.0 (more aggressive noise rejection)
111
111
  */
112
112
  minSNR?: number;
113
113
  /**
114
- * SNR range for probability scaling.
115
- * Default: 8.0 (probability scales from minSNR to minSNR+snrRange)
114
+ * SNR range in dB for probability scaling.
115
+ * Default: 10.0 (probability scales from minSNR to minSNR+snrRange)
116
116
  */
117
117
  snrRange?: number;
118
+ /**
119
+ * Minimum absolute RMS energy to consider as speech.
120
+ * Prevents triggering on very quiet background noise in silent rooms.
121
+ * Default: 0.0005
122
+ */
123
+ minEnergy?: number;
118
124
  };
119
125
  };
120
126
  /**
@@ -27,10 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
27
27
  const energyParams = vadConfig?.energyVad || {};
28
28
  const smoothing = energyParams.smoothing ?? 0.95;
29
29
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
30
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
31
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-4;
32
- const minSNR = energyParams.minSNR ?? 2;
33
- const snrRange = energyParams.snrRange ?? 8;
30
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
31
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
32
+ const minSNR = energyParams.minSNR ?? 10;
33
+ const snrRange = energyParams.snrRange ?? 10;
34
+ const minEnergy = energyParams.minEnergy ?? 5e-4;
34
35
  return `
35
36
  class EnergyVadProcessor extends AudioWorkletProcessor {
36
37
  constructor() {
@@ -42,6 +43,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
42
43
  this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
43
44
  this.minSNR = ${minSNR};
44
45
  this.snrRange = ${snrRange};
46
+ this.minEnergy = ${minEnergy};
45
47
  this.isSpeaking = false;
46
48
 
47
49
  this.port.onmessage = (event) => {
@@ -56,44 +58,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
56
58
  if (!input || !input.length) return true;
57
59
  const channel = input[0];
58
60
 
59
- // Calculate RMS (Root Mean Square) energy
61
+ // Calculate instantaneous RMS (Root Mean Square) energy
60
62
  let sum = 0;
61
63
  for (let i = 0; i < channel.length; i++) {
62
64
  sum += channel[i] * channel[i];
63
65
  }
64
- const rms = Math.sqrt(sum / channel.length);
66
+ const instantRms = Math.sqrt(sum / channel.length);
65
67
 
66
- // Adaptive noise floor estimation - ONLY during silence
67
- // This prevents the noise floor from rising during speech
68
- if (!this.isSpeaking) {
69
- if (rms < this.noiseFloor) {
70
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
71
- } else {
72
- // Even during silence, if we detect a loud signal, adapt very slowly
73
- // This could be brief noise we haven't classified as speech yet
74
-
75
- // SAFEGUARD: If the signal is significantly louder than the noise floor (e.g. > 3x),
76
- // assume it's unclassified speech or a sudden loud noise and DO NOT adapt.
77
- // This prevents the noise floor from "chasing" the speech level during brief pauses or onsets.
78
- const instantSnr = rms / (this.noiseFloor + 1e-6);
79
-
80
- if (instantSnr < 3.0) {
81
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
82
- }
68
+ // Smooth the RMS energy to reduce jitter
69
+ // this.energy acts as the smoothed RMS value
70
+ this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
71
+
72
+ // Adaptive noise floor estimation
73
+ // We use the instantaneous RMS for noise floor tracking to react quickly to silence
74
+ if (instantRms < this.noiseFloor) {
75
+ // If signal is quieter than noise floor, adapt downwards quickly
76
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
77
+ } else {
78
+ // If signal is louder, adapt upwards
79
+ // We use a multi-stage adaptation rate:
80
+ // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
81
+ // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
82
+ // 3. Otherwise, adapt at the normal loud rate
83
+ const snr = instantRms / (this.noiseFloor + 1e-6);
84
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
85
+
86
+ let multiplier = 1.0;
87
+ if (this.isSpeaking) {
88
+ multiplier = 0.01;
89
+ } else if (snrDb > 20) {
90
+ multiplier = 0.1;
83
91
  }
92
+
93
+ const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
94
+ this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
84
95
  }
85
- // During speech, freeze the noise floor to maintain consistent detection
86
96
 
87
- // Calculate Signal-to-Noise Ratio (SNR)
88
- const snr = rms / (this.noiseFloor + 1e-6);
97
+ // Ensure noise floor doesn't drop to absolute zero
98
+ this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
99
+
100
+ // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
101
+ const snr = this.energy / (this.noiseFloor + 1e-6);
102
+ const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
89
103
 
90
- // Map SNR to probability (0-1)
91
- // Probability is 0 when SNR <= minSNR
104
+ // Map SNR dB to probability (0-1)
105
+ // Probability is 0 when snrDb <= minSNR
92
106
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
93
- // Probability is 1 when SNR >= (minSNR + snrRange)
94
- const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
107
+ let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
108
+
109
+ // Apply absolute energy threshold
110
+ if (this.energy < this.minEnergy) {
111
+ probability = 0;
112
+ }
95
113
 
96
- this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
114
+ this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
97
115
 
98
116
  return true;
99
117
  }
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  EnergyVADPlugin
3
- } from "../chunk-TLPO52HV.mjs";
3
+ } from "../chunk-3A2CTC4K.mjs";
4
4
  export {
5
5
  EnergyVADPlugin
6
6
  };
@@ -36,11 +36,11 @@ var VADStateMachine = class {
36
36
  enabled: config?.enabled ?? true,
37
37
  pluginName: config?.pluginName ?? "energy-vad",
38
38
  // Voice-optimized defaults
39
- startThreshold: config?.startThreshold ?? 0.6,
39
+ startThreshold: config?.startThreshold ?? 0.8,
40
40
  // Higher threshold to avoid noise
41
- stopThreshold: config?.stopThreshold ?? 0.45,
41
+ stopThreshold: config?.stopThreshold ?? 0.3,
42
42
  // Balanced for voice
43
- hangoverMs: config?.hangoverMs ?? 400,
43
+ hangoverMs: config?.hangoverMs ?? 300,
44
44
  // Smooth for natural speech
45
45
  preRollMs: config?.preRollMs ?? 250,
46
46
  // Generous pre-roll
@@ -49,10 +49,11 @@ var VADStateMachine = class {
49
49
  energyVad: {
50
50
  smoothing: config?.energyVad?.smoothing ?? 0.95,
51
51
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
52
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
53
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
54
- minSNR: config?.energyVad?.minSNR ?? 2,
55
- snrRange: config?.energyVad?.snrRange ?? 8
52
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
53
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
54
+ minSNR: config?.energyVad?.minSNR ?? 10,
55
+ snrRange: config?.energyVad?.snrRange ?? 10,
56
+ minEnergy: config?.energyVad?.minEnergy ?? 5e-4
56
57
  }
57
58
  };
58
59
  this.lastSilenceTime = Date.now();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "../chunk-N553RHTI.mjs";
3
+ } from "../chunk-XHMNP7NC.mjs";
4
4
  export {
5
5
  VADStateMachine
6
6
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tensamin/audio",
3
- "version": "0.1.6",
3
+ "version": "0.1.8",
4
4
  "main": "dist/index.js",
5
5
  "module": "dist/index.mjs",
6
6
  "types": "dist/index.d.ts",