@tensamin/audio 0.1.12 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -101,11 +101,11 @@ vad: {
101
101
  energyVad?: {
102
102
  smoothing: number; // Default: 0.95
103
103
  initialNoiseFloor: number; // Default: 0.001
104
- noiseFloorAdaptRateQuiet: number; // Default: 0.01
105
- noiseFloorAdaptRateLoud: number; // Default: 0.05
106
- minSNR: number; // Default: 10.0 (dB)
104
+ noiseFloorAdaptRateQuiet: number; // Default: 0.002
105
+ noiseFloorAdaptRateLoud: number; // Default: 0.02
106
+ minSNR: number; // Default: 12.0 (dB)
107
107
  snrRange: number; // Default: 10.0 (dB)
108
- minEnergy: number; // Default: 0.001
108
+ minEnergy: number; // Default: 0.003
109
109
  };
110
110
  }
111
111
  ```
@@ -116,7 +116,7 @@ vad: {
116
116
  - `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~15dB SNR with the default `minSNR` of 12 and `snrRange` of 10)
117
117
  - `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
118
118
  - `preRollMs`: Audio buffer duration before speech onset
119
- - `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 150ms)
119
+ - `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
120
120
  - `minSilenceDurationMs`: Minimum silence duration between speech segments
121
121
 
122
122
  **Energy VAD Parameters:**
@@ -124,7 +124,7 @@ vad: {
124
124
  - `smoothing`: Energy calculation smoothing factor (0-1)
125
125
  - `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
126
126
  - `snrRange`: Range in dB for probability scaling from minSNR
127
- - `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.001, ~-60dB)
127
+ - `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.003, ~-50dB)
128
128
 
129
129
  ### Output Control
130
130
 
@@ -3,11 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
3
3
  const energyParams = vadConfig?.energyVad || {};
4
4
  const smoothing = energyParams.smoothing ?? 0.95;
5
5
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
6
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
7
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.05;
8
- const minSNR = energyParams.minSNR ?? 10;
6
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
7
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
8
+ const minSNR = energyParams.minSNR ?? 12;
9
9
  const snrRange = energyParams.snrRange ?? 10;
10
- const minEnergy = energyParams.minEnergy ?? 1e-3;
10
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
11
11
  return `
12
12
  class EnergyVadProcessor extends AudioWorkletProcessor {
13
13
  constructor() {
@@ -36,8 +36,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
36
36
 
37
37
  // Calculate instantaneous RMS (Root Mean Square) energy
38
38
  let sum = 0;
39
+ let peak = 0;
39
40
  for (let i = 0; i < channel.length; i++) {
41
+ const sample = Math.abs(channel[i]);
40
42
  sum += channel[i] * channel[i];
43
+ peak = Math.max(peak, sample);
41
44
  }
42
45
  const instantRms = Math.sqrt(sum / channel.length);
43
46
 
@@ -45,36 +48,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
45
48
  // this.energy acts as the smoothed RMS value
46
49
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
47
50
 
48
- // Adaptive noise floor estimation
49
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
50
- if (instantRms < this.noiseFloor) {
51
- // If signal is quieter than noise floor, adapt downwards quickly
52
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
51
+ // Calculate Crest Factor (peak-to-RMS ratio)
52
+ // Voice typically has crest factor of 2-4 (6-12dB)
53
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
54
+ const crestFactor = peak / (instantRms + 1e-10);
55
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
56
+
57
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
58
+ // This prevents sharp transients from affecting the noise floor
59
+ if (this.energy < this.noiseFloor) {
60
+ // Signal is quieter than noise floor, adapt downwards slowly
61
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
53
62
  } else {
54
- // If signal is louder, adapt upwards
55
- // We use a multi-stage adaptation rate:
56
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
57
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
58
- // 3. Otherwise, adapt at the normal loud rate
59
- const snr = instantRms / (this.noiseFloor + 1e-6);
60
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
63
+ // Calculate SNR based on smoothed energy
64
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
65
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
61
66
 
62
- let multiplier = 1.0;
63
- if (this.isSpeaking) {
64
- multiplier = 0.05;
65
- } else if (snrDb > 20) {
66
- multiplier = 0.2;
67
+ // Only adapt upwards if:
68
+ // 1. SNR is low (< 10dB) - likely just background noise
69
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
70
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
71
+ // This is persistent background noise, adapt upwards
72
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
73
+ } else {
74
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
75
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
76
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
67
77
  }
68
-
69
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
70
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
71
78
  }
72
79
 
73
80
  // Ensure noise floor doesn't drop to absolute zero
74
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
75
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
81
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
76
82
 
77
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
83
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
78
84
  const snr = this.energy / (this.noiseFloor + 1e-6);
79
85
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
80
86
 
@@ -83,11 +89,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
83
89
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
84
90
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
85
91
 
86
- // Apply absolute energy threshold
87
- // We use a soft threshold to avoid abrupt cutting
92
+ // Apply absolute energy threshold with soft knee
88
93
  if (this.energy < this.minEnergy) {
89
94
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
90
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
95
+ probability *= Math.pow(energyRatio, 2);
96
+ }
97
+
98
+ // Apply crest factor penalty
99
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
100
+ // Voice: 6-12dB, Keyboard: 20-30dB
101
+ // We penalize anything above 14dB
102
+ if (crestFactorDb > 14) {
103
+ const excess = crestFactorDb - 14;
104
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
105
+ probability *= penalty;
91
106
  }
92
107
 
93
108
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "./chunk-SMZJFNRU.mjs";
3
+ } from "./chunk-R5M2DGAQ.mjs";
4
4
 
5
5
  // src/livekit/integration.ts
6
6
  async function attachProcessingToTrack(track, config = {}) {
@@ -20,17 +20,17 @@ var VADStateMachine = class {
20
20
  // Smooth for natural speech
21
21
  preRollMs: config?.preRollMs ?? 250,
22
22
  // Generous pre-roll
23
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
24
- // Increased to filter keyboard clicks
23
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
24
+ // Aggressive transient rejection
25
25
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
26
26
  energyVad: {
27
27
  smoothing: config?.energyVad?.smoothing ?? 0.95,
28
28
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
29
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
30
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
31
- minSNR: config?.energyVad?.minSNR ?? 10,
29
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
30
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
31
+ minSNR: config?.energyVad?.minSNR ?? 12,
32
32
  snrRange: config?.energyVad?.snrRange ?? 10,
33
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
33
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
34
34
  }
35
35
  };
36
36
  this.lastSilenceTime = Date.now();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "./chunk-DYY2MXMU.mjs";
3
+ } from "./chunk-K4YLH73B.mjs";
4
4
  import {
5
5
  getAudioContext,
6
6
  registerPipeline,
@@ -9,7 +9,7 @@ import {
9
9
  import {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin
12
- } from "./chunk-XZSFQJW4.mjs";
12
+ } from "./chunk-UFKIAMG3.mjs";
13
13
 
14
14
  // src/pipeline/audio-pipeline.ts
15
15
  import mitt from "mitt";
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-XO6B3D4A.mjs";
4
4
  import {
5
5
  EnergyVADPlugin
6
- } from "./chunk-KEWK2OKV.mjs";
6
+ } from "./chunk-2G2JFHJY.mjs";
7
7
 
8
8
  // src/extensibility/plugins.ts
9
9
  var nsPlugins = /* @__PURE__ */ new Map();
@@ -106,11 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
106
106
  const energyParams = vadConfig?.energyVad || {};
107
107
  const smoothing = energyParams.smoothing ?? 0.95;
108
108
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
109
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
110
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.05;
111
- const minSNR = energyParams.minSNR ?? 10;
109
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
110
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
111
+ const minSNR = energyParams.minSNR ?? 12;
112
112
  const snrRange = energyParams.snrRange ?? 10;
113
- const minEnergy = energyParams.minEnergy ?? 1e-3;
113
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
114
114
  return `
115
115
  class EnergyVadProcessor extends AudioWorkletProcessor {
116
116
  constructor() {
@@ -139,8 +139,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
139
139
 
140
140
  // Calculate instantaneous RMS (Root Mean Square) energy
141
141
  let sum = 0;
142
+ let peak = 0;
142
143
  for (let i = 0; i < channel.length; i++) {
144
+ const sample = Math.abs(channel[i]);
143
145
  sum += channel[i] * channel[i];
146
+ peak = Math.max(peak, sample);
144
147
  }
145
148
  const instantRms = Math.sqrt(sum / channel.length);
146
149
 
@@ -148,36 +151,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
148
151
  // this.energy acts as the smoothed RMS value
149
152
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
150
153
 
151
- // Adaptive noise floor estimation
152
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
153
- if (instantRms < this.noiseFloor) {
154
- // If signal is quieter than noise floor, adapt downwards quickly
155
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
154
+ // Calculate Crest Factor (peak-to-RMS ratio)
155
+ // Voice typically has crest factor of 2-4 (6-12dB)
156
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
157
+ const crestFactor = peak / (instantRms + 1e-10);
158
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
159
+
160
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
161
+ // This prevents sharp transients from affecting the noise floor
162
+ if (this.energy < this.noiseFloor) {
163
+ // Signal is quieter than noise floor, adapt downwards slowly
164
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
156
165
  } else {
157
- // If signal is louder, adapt upwards
158
- // We use a multi-stage adaptation rate:
159
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
160
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
161
- // 3. Otherwise, adapt at the normal loud rate
162
- const snr = instantRms / (this.noiseFloor + 1e-6);
163
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
166
+ // Calculate SNR based on smoothed energy
167
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
168
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
164
169
 
165
- let multiplier = 1.0;
166
- if (this.isSpeaking) {
167
- multiplier = 0.05;
168
- } else if (snrDb > 20) {
169
- multiplier = 0.2;
170
+ // Only adapt upwards if:
171
+ // 1. SNR is low (< 10dB) - likely just background noise
172
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
173
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
174
+ // This is persistent background noise, adapt upwards
175
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
176
+ } else {
177
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
178
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
179
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
170
180
  }
171
-
172
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
173
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
174
181
  }
175
182
 
176
183
  // Ensure noise floor doesn't drop to absolute zero
177
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
178
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
184
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
179
185
 
180
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
186
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
181
187
  const snr = this.energy / (this.noiseFloor + 1e-6);
182
188
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
183
189
 
@@ -186,11 +192,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
186
192
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
187
193
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
188
194
 
189
- // Apply absolute energy threshold
190
- // We use a soft threshold to avoid abrupt cutting
195
+ // Apply absolute energy threshold with soft knee
191
196
  if (this.energy < this.minEnergy) {
192
197
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
193
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
198
+ probability *= Math.pow(energyRatio, 2);
199
+ }
200
+
201
+ // Apply crest factor penalty
202
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
203
+ // Voice: 6-12dB, Keyboard: 20-30dB
204
+ // We penalize anything above 14dB
205
+ if (crestFactorDb > 14) {
206
+ const excess = crestFactorDb - 14;
207
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
208
+ probability *= penalty;
194
209
  }
195
210
 
196
211
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -3,9 +3,9 @@ import {
3
3
  getVADPlugin,
4
4
  registerNoiseSuppressionPlugin,
5
5
  registerVADPlugin
6
- } from "../chunk-XZSFQJW4.mjs";
6
+ } from "../chunk-UFKIAMG3.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-KEWK2OKV.mjs";
8
+ import "../chunk-2G2JFHJY.mjs";
9
9
  export {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin,
package/dist/index.js CHANGED
@@ -158,11 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
158
158
  const energyParams = vadConfig?.energyVad || {};
159
159
  const smoothing = energyParams.smoothing ?? 0.95;
160
160
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
161
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
162
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.05;
163
- const minSNR = energyParams.minSNR ?? 10;
161
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
162
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
163
+ const minSNR = energyParams.minSNR ?? 12;
164
164
  const snrRange = energyParams.snrRange ?? 10;
165
- const minEnergy = energyParams.minEnergy ?? 1e-3;
165
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
166
166
  return `
167
167
  class EnergyVadProcessor extends AudioWorkletProcessor {
168
168
  constructor() {
@@ -191,8 +191,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
191
191
 
192
192
  // Calculate instantaneous RMS (Root Mean Square) energy
193
193
  let sum = 0;
194
+ let peak = 0;
194
195
  for (let i = 0; i < channel.length; i++) {
196
+ const sample = Math.abs(channel[i]);
195
197
  sum += channel[i] * channel[i];
198
+ peak = Math.max(peak, sample);
196
199
  }
197
200
  const instantRms = Math.sqrt(sum / channel.length);
198
201
 
@@ -200,36 +203,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
200
203
  // this.energy acts as the smoothed RMS value
201
204
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
202
205
 
203
- // Adaptive noise floor estimation
204
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
205
- if (instantRms < this.noiseFloor) {
206
- // If signal is quieter than noise floor, adapt downwards quickly
207
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
206
+ // Calculate Crest Factor (peak-to-RMS ratio)
207
+ // Voice typically has crest factor of 2-4 (6-12dB)
208
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
209
+ const crestFactor = peak / (instantRms + 1e-10);
210
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
211
+
212
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
213
+ // This prevents sharp transients from affecting the noise floor
214
+ if (this.energy < this.noiseFloor) {
215
+ // Signal is quieter than noise floor, adapt downwards slowly
216
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
208
217
  } else {
209
- // If signal is louder, adapt upwards
210
- // We use a multi-stage adaptation rate:
211
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
212
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
213
- // 3. Otherwise, adapt at the normal loud rate
214
- const snr = instantRms / (this.noiseFloor + 1e-6);
215
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
218
+ // Calculate SNR based on smoothed energy
219
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
220
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
216
221
 
217
- let multiplier = 1.0;
218
- if (this.isSpeaking) {
219
- multiplier = 0.05;
220
- } else if (snrDb > 20) {
221
- multiplier = 0.2;
222
+ // Only adapt upwards if:
223
+ // 1. SNR is low (< 10dB) - likely just background noise
224
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
225
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
226
+ // This is persistent background noise, adapt upwards
227
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
228
+ } else {
229
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
230
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
231
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
222
232
  }
223
-
224
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
225
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
226
233
  }
227
234
 
228
235
  // Ensure noise floor doesn't drop to absolute zero
229
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
230
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
236
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
231
237
 
232
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
238
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
233
239
  const snr = this.energy / (this.noiseFloor + 1e-6);
234
240
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
235
241
 
@@ -238,11 +244,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
238
244
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
239
245
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
240
246
 
241
- // Apply absolute energy threshold
242
- // We use a soft threshold to avoid abrupt cutting
247
+ // Apply absolute energy threshold with soft knee
243
248
  if (this.energy < this.minEnergy) {
244
249
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
245
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
250
+ probability *= Math.pow(energyRatio, 2);
251
+ }
252
+
253
+ // Apply crest factor penalty
254
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
255
+ // Voice: 6-12dB, Keyboard: 20-30dB
256
+ // We penalize anything above 14dB
257
+ if (crestFactorDb > 14) {
258
+ const excess = crestFactorDb - 14;
259
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
260
+ probability *= penalty;
246
261
  }
247
262
 
248
263
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -371,17 +386,17 @@ var VADStateMachine = class {
371
386
  // Smooth for natural speech
372
387
  preRollMs: config?.preRollMs ?? 250,
373
388
  // Generous pre-roll
374
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
375
- // Increased to filter keyboard clicks
389
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
390
+ // Aggressive transient rejection
376
391
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
377
392
  energyVad: {
378
393
  smoothing: config?.energyVad?.smoothing ?? 0.95,
379
394
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
380
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
381
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
382
- minSNR: config?.energyVad?.minSNR ?? 10,
395
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
396
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
397
+ minSNR: config?.energyVad?.minSNR ?? 12,
383
398
  snrRange: config?.energyVad?.snrRange ?? 10,
384
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
399
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
385
400
  }
386
401
  };
387
402
  this.lastSilenceTime = Date.now();
package/dist/index.mjs CHANGED
@@ -1,13 +1,13 @@
1
1
  import "./chunk-WBQAMGXK.mjs";
2
2
  import {
3
3
  attachProcessingToTrack
4
- } from "./chunk-Q2I22TJG.mjs";
4
+ } from "./chunk-6F2HZUYO.mjs";
5
5
  import {
6
6
  createAudioPipeline
7
- } from "./chunk-SMZJFNRU.mjs";
7
+ } from "./chunk-R5M2DGAQ.mjs";
8
8
  import {
9
9
  VADStateMachine
10
- } from "./chunk-DYY2MXMU.mjs";
10
+ } from "./chunk-K4YLH73B.mjs";
11
11
  import {
12
12
  closeAudioContext,
13
13
  getAudioContext,
@@ -21,13 +21,13 @@ import {
21
21
  getVADPlugin,
22
22
  registerNoiseSuppressionPlugin,
23
23
  registerVADPlugin
24
- } from "./chunk-XZSFQJW4.mjs";
24
+ } from "./chunk-UFKIAMG3.mjs";
25
25
  import {
26
26
  RNNoisePlugin
27
27
  } from "./chunk-XO6B3D4A.mjs";
28
28
  import {
29
29
  EnergyVADPlugin
30
- } from "./chunk-KEWK2OKV.mjs";
30
+ } from "./chunk-2G2JFHJY.mjs";
31
31
  export {
32
32
  EnergyVADPlugin,
33
33
  RNNoisePlugin,
@@ -127,11 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
127
127
  const energyParams = vadConfig?.energyVad || {};
128
128
  const smoothing = energyParams.smoothing ?? 0.95;
129
129
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
130
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
131
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.05;
132
- const minSNR = energyParams.minSNR ?? 10;
130
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
131
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
132
+ const minSNR = energyParams.minSNR ?? 12;
133
133
  const snrRange = energyParams.snrRange ?? 10;
134
- const minEnergy = energyParams.minEnergy ?? 1e-3;
134
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
135
135
  return `
136
136
  class EnergyVadProcessor extends AudioWorkletProcessor {
137
137
  constructor() {
@@ -160,8 +160,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
160
160
 
161
161
  // Calculate instantaneous RMS (Root Mean Square) energy
162
162
  let sum = 0;
163
+ let peak = 0;
163
164
  for (let i = 0; i < channel.length; i++) {
165
+ const sample = Math.abs(channel[i]);
164
166
  sum += channel[i] * channel[i];
167
+ peak = Math.max(peak, sample);
165
168
  }
166
169
  const instantRms = Math.sqrt(sum / channel.length);
167
170
 
@@ -169,36 +172,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
169
172
  // this.energy acts as the smoothed RMS value
170
173
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
171
174
 
172
- // Adaptive noise floor estimation
173
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
174
- if (instantRms < this.noiseFloor) {
175
- // If signal is quieter than noise floor, adapt downwards quickly
176
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
175
+ // Calculate Crest Factor (peak-to-RMS ratio)
176
+ // Voice typically has crest factor of 2-4 (6-12dB)
177
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
178
+ const crestFactor = peak / (instantRms + 1e-10);
179
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
180
+
181
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
182
+ // This prevents sharp transients from affecting the noise floor
183
+ if (this.energy < this.noiseFloor) {
184
+ // Signal is quieter than noise floor, adapt downwards slowly
185
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
177
186
  } else {
178
- // If signal is louder, adapt upwards
179
- // We use a multi-stage adaptation rate:
180
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
181
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
182
- // 3. Otherwise, adapt at the normal loud rate
183
- const snr = instantRms / (this.noiseFloor + 1e-6);
184
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
187
+ // Calculate SNR based on smoothed energy
188
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
189
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
185
190
 
186
- let multiplier = 1.0;
187
- if (this.isSpeaking) {
188
- multiplier = 0.05;
189
- } else if (snrDb > 20) {
190
- multiplier = 0.2;
191
+ // Only adapt upwards if:
192
+ // 1. SNR is low (< 10dB) - likely just background noise
193
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
194
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
195
+ // This is persistent background noise, adapt upwards
196
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
197
+ } else {
198
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
199
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
200
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
191
201
  }
192
-
193
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
194
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
195
202
  }
196
203
 
197
204
  // Ensure noise floor doesn't drop to absolute zero
198
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
199
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
205
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
200
206
 
201
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
207
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
202
208
  const snr = this.energy / (this.noiseFloor + 1e-6);
203
209
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
204
210
 
@@ -207,11 +213,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
207
213
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
208
214
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
209
215
 
210
- // Apply absolute energy threshold
211
- // We use a soft threshold to avoid abrupt cutting
216
+ // Apply absolute energy threshold with soft knee
212
217
  if (this.energy < this.minEnergy) {
213
218
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
214
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
219
+ probability *= Math.pow(energyRatio, 2);
220
+ }
221
+
222
+ // Apply crest factor penalty
223
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
224
+ // Voice: 6-12dB, Keyboard: 20-30dB
225
+ // We penalize anything above 14dB
226
+ if (crestFactorDb > 14) {
227
+ const excess = crestFactorDb - 14;
228
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
229
+ probability *= penalty;
215
230
  }
216
231
 
217
232
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -334,17 +349,17 @@ var VADStateMachine = class {
334
349
  // Smooth for natural speech
335
350
  preRollMs: config?.preRollMs ?? 250,
336
351
  // Generous pre-roll
337
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
338
- // Increased to filter keyboard clicks
352
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
353
+ // Aggressive transient rejection
339
354
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
340
355
  energyVad: {
341
356
  smoothing: config?.energyVad?.smoothing ?? 0.95,
342
357
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
343
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
344
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
345
- minSNR: config?.energyVad?.minSNR ?? 10,
358
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
359
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
360
+ minSNR: config?.energyVad?.minSNR ?? 12,
346
361
  snrRange: config?.energyVad?.snrRange ?? 10,
347
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
362
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
348
363
  }
349
364
  };
350
365
  this.lastSilenceTime = Date.now();
@@ -1,12 +1,12 @@
1
1
  import {
2
2
  attachProcessingToTrack
3
- } from "../chunk-Q2I22TJG.mjs";
4
- import "../chunk-SMZJFNRU.mjs";
5
- import "../chunk-DYY2MXMU.mjs";
3
+ } from "../chunk-6F2HZUYO.mjs";
4
+ import "../chunk-R5M2DGAQ.mjs";
5
+ import "../chunk-K4YLH73B.mjs";
6
6
  import "../chunk-OZ7KMC4S.mjs";
7
- import "../chunk-XZSFQJW4.mjs";
7
+ import "../chunk-UFKIAMG3.mjs";
8
8
  import "../chunk-XO6B3D4A.mjs";
9
- import "../chunk-KEWK2OKV.mjs";
9
+ import "../chunk-2G2JFHJY.mjs";
10
10
  export {
11
11
  attachProcessingToTrack
12
12
  };
@@ -125,11 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
125
125
  const energyParams = vadConfig?.energyVad || {};
126
126
  const smoothing = energyParams.smoothing ?? 0.95;
127
127
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
128
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
129
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.05;
130
- const minSNR = energyParams.minSNR ?? 10;
128
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
129
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
130
+ const minSNR = energyParams.minSNR ?? 12;
131
131
  const snrRange = energyParams.snrRange ?? 10;
132
- const minEnergy = energyParams.minEnergy ?? 1e-3;
132
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
133
133
  return `
134
134
  class EnergyVadProcessor extends AudioWorkletProcessor {
135
135
  constructor() {
@@ -158,8 +158,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
158
158
 
159
159
  // Calculate instantaneous RMS (Root Mean Square) energy
160
160
  let sum = 0;
161
+ let peak = 0;
161
162
  for (let i = 0; i < channel.length; i++) {
163
+ const sample = Math.abs(channel[i]);
162
164
  sum += channel[i] * channel[i];
165
+ peak = Math.max(peak, sample);
163
166
  }
164
167
  const instantRms = Math.sqrt(sum / channel.length);
165
168
 
@@ -167,36 +170,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
167
170
  // this.energy acts as the smoothed RMS value
168
171
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
169
172
 
170
- // Adaptive noise floor estimation
171
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
172
- if (instantRms < this.noiseFloor) {
173
- // If signal is quieter than noise floor, adapt downwards quickly
174
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
173
+ // Calculate Crest Factor (peak-to-RMS ratio)
174
+ // Voice typically has crest factor of 2-4 (6-12dB)
175
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
176
+ const crestFactor = peak / (instantRms + 1e-10);
177
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
178
+
179
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
180
+ // This prevents sharp transients from affecting the noise floor
181
+ if (this.energy < this.noiseFloor) {
182
+ // Signal is quieter than noise floor, adapt downwards slowly
183
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
175
184
  } else {
176
- // If signal is louder, adapt upwards
177
- // We use a multi-stage adaptation rate:
178
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
179
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
180
- // 3. Otherwise, adapt at the normal loud rate
181
- const snr = instantRms / (this.noiseFloor + 1e-6);
182
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
185
+ // Calculate SNR based on smoothed energy
186
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
187
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
183
188
 
184
- let multiplier = 1.0;
185
- if (this.isSpeaking) {
186
- multiplier = 0.05;
187
- } else if (snrDb > 20) {
188
- multiplier = 0.2;
189
+ // Only adapt upwards if:
190
+ // 1. SNR is low (< 10dB) - likely just background noise
191
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
192
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
193
+ // This is persistent background noise, adapt upwards
194
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
195
+ } else {
196
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
197
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
198
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
189
199
  }
190
-
191
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
192
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
193
200
  }
194
201
 
195
202
  // Ensure noise floor doesn't drop to absolute zero
196
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
197
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
203
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
198
204
 
199
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
205
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
200
206
  const snr = this.energy / (this.noiseFloor + 1e-6);
201
207
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
202
208
 
@@ -205,11 +211,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
205
211
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
206
212
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
207
213
 
208
- // Apply absolute energy threshold
209
- // We use a soft threshold to avoid abrupt cutting
214
+ // Apply absolute energy threshold with soft knee
210
215
  if (this.energy < this.minEnergy) {
211
216
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
212
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
217
+ probability *= Math.pow(energyRatio, 2);
218
+ }
219
+
220
+ // Apply crest factor penalty
221
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
222
+ // Voice: 6-12dB, Keyboard: 20-30dB
223
+ // We penalize anything above 14dB
224
+ if (crestFactorDb > 14) {
225
+ const excess = crestFactorDb - 14;
226
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
227
+ probability *= penalty;
213
228
  }
214
229
 
215
230
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -332,17 +347,17 @@ var VADStateMachine = class {
332
347
  // Smooth for natural speech
333
348
  preRollMs: config?.preRollMs ?? 250,
334
349
  // Generous pre-roll
335
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
336
- // Increased to filter keyboard clicks
350
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
351
+ // Aggressive transient rejection
337
352
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
338
353
  energyVad: {
339
354
  smoothing: config?.energyVad?.smoothing ?? 0.95,
340
355
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
341
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
342
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
343
- minSNR: config?.energyVad?.minSNR ?? 10,
356
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
357
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
358
+ minSNR: config?.energyVad?.minSNR ?? 12,
344
359
  snrRange: config?.energyVad?.snrRange ?? 10,
345
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
360
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
346
361
  }
347
362
  };
348
363
  this.lastSilenceTime = Date.now();
@@ -1,11 +1,11 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "../chunk-SMZJFNRU.mjs";
4
- import "../chunk-DYY2MXMU.mjs";
3
+ } from "../chunk-R5M2DGAQ.mjs";
4
+ import "../chunk-K4YLH73B.mjs";
5
5
  import "../chunk-OZ7KMC4S.mjs";
6
- import "../chunk-XZSFQJW4.mjs";
6
+ import "../chunk-UFKIAMG3.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-KEWK2OKV.mjs";
8
+ import "../chunk-2G2JFHJY.mjs";
9
9
  export {
10
10
  createAudioPipeline
11
11
  };
package/dist/types.d.mts CHANGED
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
70
70
  preRollMs?: number;
71
71
  /**
72
72
  * Minimum speech duration in ms to consider it valid speech.
73
- * Filters out very brief noise spikes like keyboard clicks.
74
- * Default: 150ms
73
+ * Filters out brief transients like keyboard clicks.
74
+ * Default: 250ms (aggressive transient rejection)
75
75
  */
76
76
  minSpeechDurationMs?: number;
77
77
  /**
@@ -97,17 +97,18 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.01
100
+ * Default: 0.002 (very slow downward drift)
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Default: 0.05 (faster tracking of rising noise)
105
+ * Applied to low-energy, low-crest-factor signals (background noise).
106
+ * Default: 0.02
106
107
  */
107
108
  noiseFloorAdaptRateLoud?: number;
108
109
  /**
109
110
  * Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
110
- * Default: 10.0 (more aggressive noise rejection)
111
+ * Default: 12.0 (aggressive noise rejection)
111
112
  */
112
113
  minSNR?: number;
113
114
  /**
@@ -117,8 +118,8 @@ interface AudioProcessingConfig {
117
118
  snrRange?: number;
118
119
  /**
119
120
  * Minimum absolute RMS energy to consider as speech.
120
- * Prevents triggering on very quiet background noise in silent rooms.
121
- * Default: 0.001 (approx -60dB)
121
+ * Prevents triggering on very quiet background noise.
122
+ * Default: 0.003 (approx -50dB, voice-appropriate level)
122
123
  */
123
124
  minEnergy?: number;
124
125
  };
package/dist/types.d.ts CHANGED
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
70
70
  preRollMs?: number;
71
71
  /**
72
72
  * Minimum speech duration in ms to consider it valid speech.
73
- * Filters out very brief noise spikes like keyboard clicks.
74
- * Default: 150ms
73
+ * Filters out brief transients like keyboard clicks.
74
+ * Default: 250ms (aggressive transient rejection)
75
75
  */
76
76
  minSpeechDurationMs?: number;
77
77
  /**
@@ -97,17 +97,18 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.01
100
+ * Default: 0.002 (very slow downward drift)
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Default: 0.05 (faster tracking of rising noise)
105
+ * Applied to low-energy, low-crest-factor signals (background noise).
106
+ * Default: 0.02
106
107
  */
107
108
  noiseFloorAdaptRateLoud?: number;
108
109
  /**
109
110
  * Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
110
- * Default: 10.0 (more aggressive noise rejection)
111
+ * Default: 12.0 (aggressive noise rejection)
111
112
  */
112
113
  minSNR?: number;
113
114
  /**
@@ -117,8 +118,8 @@ interface AudioProcessingConfig {
117
118
  snrRange?: number;
118
119
  /**
119
120
  * Minimum absolute RMS energy to consider as speech.
120
- * Prevents triggering on very quiet background noise in silent rooms.
121
- * Default: 0.001 (approx -60dB)
121
+ * Prevents triggering on very quiet background noise.
122
+ * Default: 0.003 (approx -50dB, voice-appropriate level)
122
123
  */
123
124
  minEnergy?: number;
124
125
  };
@@ -27,11 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
27
27
  const energyParams = vadConfig?.energyVad || {};
28
28
  const smoothing = energyParams.smoothing ?? 0.95;
29
29
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
30
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
31
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.05;
32
- const minSNR = energyParams.minSNR ?? 10;
30
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
31
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
32
+ const minSNR = energyParams.minSNR ?? 12;
33
33
  const snrRange = energyParams.snrRange ?? 10;
34
- const minEnergy = energyParams.minEnergy ?? 1e-3;
34
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
35
35
  return `
36
36
  class EnergyVadProcessor extends AudioWorkletProcessor {
37
37
  constructor() {
@@ -60,8 +60,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
60
60
 
61
61
  // Calculate instantaneous RMS (Root Mean Square) energy
62
62
  let sum = 0;
63
+ let peak = 0;
63
64
  for (let i = 0; i < channel.length; i++) {
65
+ const sample = Math.abs(channel[i]);
64
66
  sum += channel[i] * channel[i];
67
+ peak = Math.max(peak, sample);
65
68
  }
66
69
  const instantRms = Math.sqrt(sum / channel.length);
67
70
 
@@ -69,36 +72,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
69
72
  // this.energy acts as the smoothed RMS value
70
73
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
71
74
 
72
- // Adaptive noise floor estimation
73
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
74
- if (instantRms < this.noiseFloor) {
75
- // If signal is quieter than noise floor, adapt downwards quickly
76
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
75
+ // Calculate Crest Factor (peak-to-RMS ratio)
76
+ // Voice typically has crest factor of 2-4 (6-12dB)
77
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
78
+ const crestFactor = peak / (instantRms + 1e-10);
79
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
80
+
81
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
82
+ // This prevents sharp transients from affecting the noise floor
83
+ if (this.energy < this.noiseFloor) {
84
+ // Signal is quieter than noise floor, adapt downwards slowly
85
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
77
86
  } else {
78
- // If signal is louder, adapt upwards
79
- // We use a multi-stage adaptation rate:
80
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
81
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
82
- // 3. Otherwise, adapt at the normal loud rate
83
- const snr = instantRms / (this.noiseFloor + 1e-6);
84
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
87
+ // Calculate SNR based on smoothed energy
88
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
89
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
85
90
 
86
- let multiplier = 1.0;
87
- if (this.isSpeaking) {
88
- multiplier = 0.05;
89
- } else if (snrDb > 20) {
90
- multiplier = 0.2;
91
+ // Only adapt upwards if:
92
+ // 1. SNR is low (< 10dB) - likely just background noise
93
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
94
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
95
+ // This is persistent background noise, adapt upwards
96
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
97
+ } else {
98
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
99
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
100
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
91
101
  }
92
-
93
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
94
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
95
102
  }
96
103
 
97
104
  // Ensure noise floor doesn't drop to absolute zero
98
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
99
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
105
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
100
106
 
101
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
107
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
102
108
  const snr = this.energy / (this.noiseFloor + 1e-6);
103
109
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
104
110
 
@@ -107,11 +113,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
107
113
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
108
114
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
109
115
 
110
- // Apply absolute energy threshold
111
- // We use a soft threshold to avoid abrupt cutting
116
+ // Apply absolute energy threshold with soft knee
112
117
  if (this.energy < this.minEnergy) {
113
118
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
114
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
119
+ probability *= Math.pow(energyRatio, 2);
120
+ }
121
+
122
+ // Apply crest factor penalty
123
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
124
+ // Voice: 6-12dB, Keyboard: 20-30dB
125
+ // We penalize anything above 14dB
126
+ if (crestFactorDb > 14) {
127
+ const excess = crestFactorDb - 14;
128
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
129
+ probability *= penalty;
115
130
  }
116
131
 
117
132
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  EnergyVADPlugin
3
- } from "../chunk-KEWK2OKV.mjs";
3
+ } from "../chunk-2G2JFHJY.mjs";
4
4
  export {
5
5
  EnergyVADPlugin
6
6
  };
@@ -44,17 +44,17 @@ var VADStateMachine = class {
44
44
  // Smooth for natural speech
45
45
  preRollMs: config?.preRollMs ?? 250,
46
46
  // Generous pre-roll
47
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
48
- // Increased to filter keyboard clicks
47
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
48
+ // Aggressive transient rejection
49
49
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
50
50
  energyVad: {
51
51
  smoothing: config?.energyVad?.smoothing ?? 0.95,
52
52
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
53
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
54
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
55
- minSNR: config?.energyVad?.minSNR ?? 10,
53
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
54
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
55
+ minSNR: config?.energyVad?.minSNR ?? 12,
56
56
  snrRange: config?.energyVad?.snrRange ?? 10,
57
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
57
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
58
58
  }
59
59
  };
60
60
  this.lastSilenceTime = Date.now();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "../chunk-DYY2MXMU.mjs";
3
+ } from "../chunk-K4YLH73B.mjs";
4
4
  export {
5
5
  VADStateMachine
6
6
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tensamin/audio",
3
- "version": "0.1.12",
3
+ "version": "0.1.14",
4
4
  "main": "dist/index.js",
5
5
  "module": "dist/index.mjs",
6
6
  "types": "dist/index.d.ts",