@tensamin/audio 0.1.13 → 0.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -101,11 +101,11 @@ vad: {
101
101
  energyVad?: {
102
102
  smoothing: number; // Default: 0.95
103
103
  initialNoiseFloor: number; // Default: 0.001
104
- noiseFloorAdaptRateQuiet: number; // Default: 0.01
105
- noiseFloorAdaptRateLoud: number; // Default: 0.1
106
- minSNR: number; // Default: 10.0 (dB)
104
+ noiseFloorAdaptRateQuiet: number; // Default: 0.002
105
+ noiseFloorAdaptRateLoud: number; // Default: 0.02
106
+ minSNR: number; // Default: 12.0 (dB)
107
107
  snrRange: number; // Default: 10.0 (dB)
108
- minEnergy: number; // Default: 0.001
108
+ minEnergy: number; // Default: 0.003
109
109
  };
110
110
  }
111
111
  ```
@@ -116,7 +116,7 @@ vad: {
116
116
  - `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~15dB SNR with the default `minSNR` of 12 and `snrRange` of 10)
117
117
  - `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
118
118
  - `preRollMs`: Audio buffer duration before speech onset (Default: 250ms)
119
- - `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 150ms)
119
+ - `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
120
120
  - `minSilenceDurationMs`: Minimum silence duration between speech segments (Default: 150ms)
121
121
 
122
122
  **Energy VAD Parameters:**
@@ -124,7 +124,7 @@ vad: {
124
124
  - `smoothing`: Energy calculation smoothing factor (0-1)
125
125
  - `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
126
126
  - `snrRange`: Range in dB for probability scaling from minSNR
127
- - `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.001, ~-60dB)
127
+ - `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.003, ~-50dB)
128
128
 
129
129
  ### Output Control
130
130
 
@@ -3,11 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
3
3
  const energyParams = vadConfig?.energyVad || {};
4
4
  const smoothing = energyParams.smoothing ?? 0.95;
5
5
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
6
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
7
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
8
- const minSNR = energyParams.minSNR ?? 10;
6
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
7
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
8
+ const minSNR = energyParams.minSNR ?? 12;
9
9
  const snrRange = energyParams.snrRange ?? 10;
10
- const minEnergy = energyParams.minEnergy ?? 1e-3;
10
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
11
11
  return `
12
12
  class EnergyVadProcessor extends AudioWorkletProcessor {
13
13
  constructor() {
@@ -36,8 +36,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
36
36
 
37
37
  // Calculate instantaneous RMS (Root Mean Square) energy
38
38
  let sum = 0;
39
+ let peak = 0;
39
40
  for (let i = 0; i < channel.length; i++) {
41
+ const sample = Math.abs(channel[i]);
40
42
  sum += channel[i] * channel[i];
43
+ peak = Math.max(peak, sample);
41
44
  }
42
45
  const instantRms = Math.sqrt(sum / channel.length);
43
46
 
@@ -45,30 +48,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
45
48
  // this.energy acts as the smoothed RMS value
46
49
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
47
50
 
48
- // Adaptive noise floor estimation
49
- // We use a TWO-PASS approach to avoid circular dependencies:
50
- // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
51
- const instantSnr = instantRms / (this.noiseFloor + 1e-6);
52
- const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
53
-
54
- // Adapt the noise floor based on instantaneous SNR
55
- if (instantRms < this.noiseFloor) {
56
- // Signal is quieter than noise floor, adapt downwards quickly
57
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
58
- } else if (instantSnrDb < 12) {
59
- // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
60
- // Adapt upwards at normal rate to track rising noise
61
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
51
+ // Calculate Crest Factor (peak-to-RMS ratio)
52
+ // Voice typically has crest factor of 2-4 (6-12dB)
53
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
54
+ const crestFactor = peak / (instantRms + 1e-10);
55
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
56
+
57
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
58
+ // This prevents sharp transients from affecting the noise floor
59
+ if (this.energy < this.noiseFloor) {
60
+ // Signal is quieter than noise floor, adapt downwards slowly
61
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
62
62
  } else {
63
- // Signal has high SNR (>= 12dB) - likely speech or transient
64
- // Adapt VERY slowly to avoid "chasing" speech
65
- const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
66
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
63
+ // Calculate SNR based on smoothed energy
64
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
65
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
66
+
67
+ // Only adapt upwards if:
68
+ // 1. SNR is low (< 10dB) - likely just background noise
69
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
70
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
71
+ // This is persistent background noise, adapt upwards
72
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
73
+ } else {
74
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
75
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
76
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
77
+ }
67
78
  }
68
79
 
69
80
  // Ensure noise floor doesn't drop to absolute zero
70
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
71
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
81
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
72
82
 
73
83
  // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
74
84
  const snr = this.energy / (this.noiseFloor + 1e-6);
@@ -79,11 +89,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
79
89
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
80
90
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
81
91
 
82
- // Apply absolute energy threshold
83
- // We use a soft threshold to avoid abrupt cutting
92
+ // Apply absolute energy threshold with soft knee
84
93
  if (this.energy < this.minEnergy) {
85
94
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
86
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
95
+ probability *= Math.pow(energyRatio, 2);
96
+ }
97
+
98
+ // Apply crest factor penalty
99
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
100
+ // Voice: 6-12dB, Keyboard: 20-30dB
101
+ // We penalize anything above 14dB
102
+ if (crestFactorDb > 14) {
103
+ const excess = crestFactorDb - 14;
104
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
105
+ probability *= penalty;
87
106
  }
88
107
 
89
108
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "./chunk-RD4GDIPO.mjs";
3
+ } from "./chunk-R5M2DGAQ.mjs";
4
4
 
5
5
  // src/livekit/integration.ts
6
6
  async function attachProcessingToTrack(track, config = {}) {
@@ -20,17 +20,17 @@ var VADStateMachine = class {
20
20
  // Smooth for natural speech
21
21
  preRollMs: config?.preRollMs ?? 250,
22
22
  // Generous pre-roll
23
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
24
- // Increased to filter keyboard clicks
23
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
24
+ // Aggressive transient rejection
25
25
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
26
26
  energyVad: {
27
27
  smoothing: config?.energyVad?.smoothing ?? 0.95,
28
28
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
29
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
30
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
31
- minSNR: config?.energyVad?.minSNR ?? 10,
29
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
30
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
31
+ minSNR: config?.energyVad?.minSNR ?? 12,
32
32
  snrRange: config?.energyVad?.snrRange ?? 10,
33
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
33
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
34
34
  }
35
35
  };
36
36
  this.lastSilenceTime = Date.now();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "./chunk-DLLK6K76.mjs";
3
+ } from "./chunk-K4YLH73B.mjs";
4
4
  import {
5
5
  getAudioContext,
6
6
  registerPipeline,
@@ -9,7 +9,7 @@ import {
9
9
  import {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin
12
- } from "./chunk-OXV7BHX5.mjs";
12
+ } from "./chunk-UFKIAMG3.mjs";
13
13
 
14
14
  // src/pipeline/audio-pipeline.ts
15
15
  import mitt from "mitt";
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-XO6B3D4A.mjs";
4
4
  import {
5
5
  EnergyVADPlugin
6
- } from "./chunk-FKR6NWZF.mjs";
6
+ } from "./chunk-2G2JFHJY.mjs";
7
7
 
8
8
  // src/extensibility/plugins.ts
9
9
  var nsPlugins = /* @__PURE__ */ new Map();
@@ -106,11 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
106
106
  const energyParams = vadConfig?.energyVad || {};
107
107
  const smoothing = energyParams.smoothing ?? 0.95;
108
108
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
109
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
110
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
111
- const minSNR = energyParams.minSNR ?? 10;
109
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
110
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
111
+ const minSNR = energyParams.minSNR ?? 12;
112
112
  const snrRange = energyParams.snrRange ?? 10;
113
- const minEnergy = energyParams.minEnergy ?? 1e-3;
113
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
114
114
  return `
115
115
  class EnergyVadProcessor extends AudioWorkletProcessor {
116
116
  constructor() {
@@ -139,8 +139,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
139
139
 
140
140
  // Calculate instantaneous RMS (Root Mean Square) energy
141
141
  let sum = 0;
142
+ let peak = 0;
142
143
  for (let i = 0; i < channel.length; i++) {
144
+ const sample = Math.abs(channel[i]);
143
145
  sum += channel[i] * channel[i];
146
+ peak = Math.max(peak, sample);
144
147
  }
145
148
  const instantRms = Math.sqrt(sum / channel.length);
146
149
 
@@ -148,30 +151,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
148
151
  // this.energy acts as the smoothed RMS value
149
152
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
150
153
 
151
- // Adaptive noise floor estimation
152
- // We use a TWO-PASS approach to avoid circular dependencies:
153
- // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
154
- const instantSnr = instantRms / (this.noiseFloor + 1e-6);
155
- const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
156
-
157
- // Adapt the noise floor based on instantaneous SNR
158
- if (instantRms < this.noiseFloor) {
159
- // Signal is quieter than noise floor, adapt downwards quickly
160
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
161
- } else if (instantSnrDb < 12) {
162
- // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
163
- // Adapt upwards at normal rate to track rising noise
164
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
154
+ // Calculate Crest Factor (peak-to-RMS ratio)
155
+ // Voice typically has crest factor of 2-4 (6-12dB)
156
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
157
+ const crestFactor = peak / (instantRms + 1e-10);
158
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
159
+
160
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
161
+ // This prevents sharp transients from affecting the noise floor
162
+ if (this.energy < this.noiseFloor) {
163
+ // Signal is quieter than noise floor, adapt downwards slowly
164
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
165
165
  } else {
166
- // Signal has high SNR (>= 12dB) - likely speech or transient
167
- // Adapt VERY slowly to avoid "chasing" speech
168
- const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
169
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
166
+ // Calculate SNR based on smoothed energy
167
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
168
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
169
+
170
+ // Only adapt upwards if:
171
+ // 1. SNR is low (< 10dB) - likely just background noise
172
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
173
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
174
+ // This is persistent background noise, adapt upwards
175
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
176
+ } else {
177
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
178
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
179
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
180
+ }
170
181
  }
171
182
 
172
183
  // Ensure noise floor doesn't drop to absolute zero
173
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
174
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
184
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
175
185
 
176
186
  // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
177
187
  const snr = this.energy / (this.noiseFloor + 1e-6);
@@ -182,11 +192,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
182
192
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
183
193
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
184
194
 
185
- // Apply absolute energy threshold
186
- // We use a soft threshold to avoid abrupt cutting
195
+ // Apply absolute energy threshold with soft knee
187
196
  if (this.energy < this.minEnergy) {
188
197
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
189
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
198
+ probability *= Math.pow(energyRatio, 2);
199
+ }
200
+
201
+ // Apply crest factor penalty
202
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
203
+ // Voice: 6-12dB, Keyboard: 20-30dB
204
+ // We penalize anything above 14dB
205
+ if (crestFactorDb > 14) {
206
+ const excess = crestFactorDb - 14;
207
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
208
+ probability *= penalty;
190
209
  }
191
210
 
192
211
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -3,9 +3,9 @@ import {
3
3
  getVADPlugin,
4
4
  registerNoiseSuppressionPlugin,
5
5
  registerVADPlugin
6
- } from "../chunk-OXV7BHX5.mjs";
6
+ } from "../chunk-UFKIAMG3.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-FKR6NWZF.mjs";
8
+ import "../chunk-2G2JFHJY.mjs";
9
9
  export {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin,
package/dist/index.js CHANGED
@@ -158,11 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
158
158
  const energyParams = vadConfig?.energyVad || {};
159
159
  const smoothing = energyParams.smoothing ?? 0.95;
160
160
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
161
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
162
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
163
- const minSNR = energyParams.minSNR ?? 10;
161
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
162
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
163
+ const minSNR = energyParams.minSNR ?? 12;
164
164
  const snrRange = energyParams.snrRange ?? 10;
165
- const minEnergy = energyParams.minEnergy ?? 1e-3;
165
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
166
166
  return `
167
167
  class EnergyVadProcessor extends AudioWorkletProcessor {
168
168
  constructor() {
@@ -191,8 +191,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
191
191
 
192
192
  // Calculate instantaneous RMS (Root Mean Square) energy
193
193
  let sum = 0;
194
+ let peak = 0;
194
195
  for (let i = 0; i < channel.length; i++) {
196
+ const sample = Math.abs(channel[i]);
195
197
  sum += channel[i] * channel[i];
198
+ peak = Math.max(peak, sample);
196
199
  }
197
200
  const instantRms = Math.sqrt(sum / channel.length);
198
201
 
@@ -200,30 +203,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
200
203
  // this.energy acts as the smoothed RMS value
201
204
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
202
205
 
203
- // Adaptive noise floor estimation
204
- // We use a TWO-PASS approach to avoid circular dependencies:
205
- // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
206
- const instantSnr = instantRms / (this.noiseFloor + 1e-6);
207
- const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
208
-
209
- // Adapt the noise floor based on instantaneous SNR
210
- if (instantRms < this.noiseFloor) {
211
- // Signal is quieter than noise floor, adapt downwards quickly
212
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
213
- } else if (instantSnrDb < 12) {
214
- // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
215
- // Adapt upwards at normal rate to track rising noise
216
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
206
+ // Calculate Crest Factor (peak-to-RMS ratio)
207
+ // Voice typically has crest factor of 2-4 (6-12dB)
208
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
209
+ const crestFactor = peak / (instantRms + 1e-10);
210
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
211
+
212
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
213
+ // This prevents sharp transients from affecting the noise floor
214
+ if (this.energy < this.noiseFloor) {
215
+ // Signal is quieter than noise floor, adapt downwards slowly
216
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
217
217
  } else {
218
- // Signal has high SNR (>= 12dB) - likely speech or transient
219
- // Adapt VERY slowly to avoid "chasing" speech
220
- const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
221
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
218
+ // Calculate SNR based on smoothed energy
219
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
220
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
221
+
222
+ // Only adapt upwards if:
223
+ // 1. SNR is low (< 10dB) - likely just background noise
224
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
225
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
226
+ // This is persistent background noise, adapt upwards
227
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
228
+ } else {
229
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
230
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
231
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
232
+ }
222
233
  }
223
234
 
224
235
  // Ensure noise floor doesn't drop to absolute zero
225
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
226
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
236
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
227
237
 
228
238
  // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
229
239
  const snr = this.energy / (this.noiseFloor + 1e-6);
@@ -234,11 +244,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
234
244
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
235
245
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
236
246
 
237
- // Apply absolute energy threshold
238
- // We use a soft threshold to avoid abrupt cutting
247
+ // Apply absolute energy threshold with soft knee
239
248
  if (this.energy < this.minEnergy) {
240
249
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
241
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
250
+ probability *= Math.pow(energyRatio, 2);
251
+ }
252
+
253
+ // Apply crest factor penalty
254
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
255
+ // Voice: 6-12dB, Keyboard: 20-30dB
256
+ // We penalize anything above 14dB
257
+ if (crestFactorDb > 14) {
258
+ const excess = crestFactorDb - 14;
259
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
260
+ probability *= penalty;
242
261
  }
243
262
 
244
263
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -367,17 +386,17 @@ var VADStateMachine = class {
367
386
  // Smooth for natural speech
368
387
  preRollMs: config?.preRollMs ?? 250,
369
388
  // Generous pre-roll
370
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
371
- // Increased to filter keyboard clicks
389
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
390
+ // Aggressive transient rejection
372
391
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
373
392
  energyVad: {
374
393
  smoothing: config?.energyVad?.smoothing ?? 0.95,
375
394
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
376
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
377
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
378
- minSNR: config?.energyVad?.minSNR ?? 10,
395
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
396
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
397
+ minSNR: config?.energyVad?.minSNR ?? 12,
379
398
  snrRange: config?.energyVad?.snrRange ?? 10,
380
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
399
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
381
400
  }
382
401
  };
383
402
  this.lastSilenceTime = Date.now();
package/dist/index.mjs CHANGED
@@ -1,13 +1,13 @@
1
1
  import "./chunk-WBQAMGXK.mjs";
2
2
  import {
3
3
  attachProcessingToTrack
4
- } from "./chunk-K6X52R7N.mjs";
4
+ } from "./chunk-6F2HZUYO.mjs";
5
5
  import {
6
6
  createAudioPipeline
7
- } from "./chunk-RD4GDIPO.mjs";
7
+ } from "./chunk-R5M2DGAQ.mjs";
8
8
  import {
9
9
  VADStateMachine
10
- } from "./chunk-DLLK6K76.mjs";
10
+ } from "./chunk-K4YLH73B.mjs";
11
11
  import {
12
12
  closeAudioContext,
13
13
  getAudioContext,
@@ -21,13 +21,13 @@ import {
21
21
  getVADPlugin,
22
22
  registerNoiseSuppressionPlugin,
23
23
  registerVADPlugin
24
- } from "./chunk-OXV7BHX5.mjs";
24
+ } from "./chunk-UFKIAMG3.mjs";
25
25
  import {
26
26
  RNNoisePlugin
27
27
  } from "./chunk-XO6B3D4A.mjs";
28
28
  import {
29
29
  EnergyVADPlugin
30
- } from "./chunk-FKR6NWZF.mjs";
30
+ } from "./chunk-2G2JFHJY.mjs";
31
31
  export {
32
32
  EnergyVADPlugin,
33
33
  RNNoisePlugin,
@@ -127,11 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
127
127
  const energyParams = vadConfig?.energyVad || {};
128
128
  const smoothing = energyParams.smoothing ?? 0.95;
129
129
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
130
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
131
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
132
- const minSNR = energyParams.minSNR ?? 10;
130
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
131
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
132
+ const minSNR = energyParams.minSNR ?? 12;
133
133
  const snrRange = energyParams.snrRange ?? 10;
134
- const minEnergy = energyParams.minEnergy ?? 1e-3;
134
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
135
135
  return `
136
136
  class EnergyVadProcessor extends AudioWorkletProcessor {
137
137
  constructor() {
@@ -160,8 +160,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
160
160
 
161
161
  // Calculate instantaneous RMS (Root Mean Square) energy
162
162
  let sum = 0;
163
+ let peak = 0;
163
164
  for (let i = 0; i < channel.length; i++) {
165
+ const sample = Math.abs(channel[i]);
164
166
  sum += channel[i] * channel[i];
167
+ peak = Math.max(peak, sample);
165
168
  }
166
169
  const instantRms = Math.sqrt(sum / channel.length);
167
170
 
@@ -169,30 +172,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
169
172
  // this.energy acts as the smoothed RMS value
170
173
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
171
174
 
172
- // Adaptive noise floor estimation
173
- // We use a TWO-PASS approach to avoid circular dependencies:
174
- // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
175
- const instantSnr = instantRms / (this.noiseFloor + 1e-6);
176
- const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
177
-
178
- // Adapt the noise floor based on instantaneous SNR
179
- if (instantRms < this.noiseFloor) {
180
- // Signal is quieter than noise floor, adapt downwards quickly
181
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
182
- } else if (instantSnrDb < 12) {
183
- // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
184
- // Adapt upwards at normal rate to track rising noise
185
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
175
+ // Calculate Crest Factor (peak-to-RMS ratio)
176
+ // Voice typically has crest factor of 2-4 (6-12dB)
177
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
178
+ const crestFactor = peak / (instantRms + 1e-10);
179
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
180
+
181
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
182
+ // This prevents sharp transients from affecting the noise floor
183
+ if (this.energy < this.noiseFloor) {
184
+ // Signal is quieter than noise floor, adapt downwards slowly
185
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
186
186
  } else {
187
- // Signal has high SNR (>= 12dB) - likely speech or transient
188
- // Adapt VERY slowly to avoid "chasing" speech
189
- const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
190
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
187
+ // Calculate SNR based on smoothed energy
188
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
189
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
190
+
191
+ // Only adapt upwards if:
192
+ // 1. SNR is low (< 10dB) - likely just background noise
193
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
194
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
195
+ // This is persistent background noise, adapt upwards
196
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
197
+ } else {
198
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
199
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
200
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
201
+ }
191
202
  }
192
203
 
193
204
  // Ensure noise floor doesn't drop to absolute zero
194
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
195
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
205
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
196
206
 
197
207
  // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
198
208
  const snr = this.energy / (this.noiseFloor + 1e-6);
@@ -203,11 +213,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
203
213
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
204
214
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
205
215
 
206
- // Apply absolute energy threshold
207
- // We use a soft threshold to avoid abrupt cutting
216
+ // Apply absolute energy threshold with soft knee
208
217
  if (this.energy < this.minEnergy) {
209
218
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
210
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
219
+ probability *= Math.pow(energyRatio, 2);
220
+ }
221
+
222
+ // Apply crest factor penalty
223
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
224
+ // Voice: 6-12dB, Keyboard: 20-30dB
225
+ // We penalize anything above 14dB
226
+ if (crestFactorDb > 14) {
227
+ const excess = crestFactorDb - 14;
228
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
229
+ probability *= penalty;
211
230
  }
212
231
 
213
232
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -330,17 +349,17 @@ var VADStateMachine = class {
330
349
  // Smooth for natural speech
331
350
  preRollMs: config?.preRollMs ?? 250,
332
351
  // Generous pre-roll
333
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
334
- // Increased to filter keyboard clicks
352
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
353
+ // Aggressive transient rejection
335
354
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
336
355
  energyVad: {
337
356
  smoothing: config?.energyVad?.smoothing ?? 0.95,
338
357
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
339
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
340
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
341
- minSNR: config?.energyVad?.minSNR ?? 10,
358
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
359
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
360
+ minSNR: config?.energyVad?.minSNR ?? 12,
342
361
  snrRange: config?.energyVad?.snrRange ?? 10,
343
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
362
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
344
363
  }
345
364
  };
346
365
  this.lastSilenceTime = Date.now();
@@ -1,12 +1,12 @@
1
1
  import {
2
2
  attachProcessingToTrack
3
- } from "../chunk-K6X52R7N.mjs";
4
- import "../chunk-RD4GDIPO.mjs";
5
- import "../chunk-DLLK6K76.mjs";
3
+ } from "../chunk-6F2HZUYO.mjs";
4
+ import "../chunk-R5M2DGAQ.mjs";
5
+ import "../chunk-K4YLH73B.mjs";
6
6
  import "../chunk-OZ7KMC4S.mjs";
7
- import "../chunk-OXV7BHX5.mjs";
7
+ import "../chunk-UFKIAMG3.mjs";
8
8
  import "../chunk-XO6B3D4A.mjs";
9
- import "../chunk-FKR6NWZF.mjs";
9
+ import "../chunk-2G2JFHJY.mjs";
10
10
  export {
11
11
  attachProcessingToTrack
12
12
  };
@@ -125,11 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
125
125
  const energyParams = vadConfig?.energyVad || {};
126
126
  const smoothing = energyParams.smoothing ?? 0.95;
127
127
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
128
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
129
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
130
- const minSNR = energyParams.minSNR ?? 10;
128
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
129
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
130
+ const minSNR = energyParams.minSNR ?? 12;
131
131
  const snrRange = energyParams.snrRange ?? 10;
132
- const minEnergy = energyParams.minEnergy ?? 1e-3;
132
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
133
133
  return `
134
134
  class EnergyVadProcessor extends AudioWorkletProcessor {
135
135
  constructor() {
@@ -158,8 +158,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
158
158
 
159
159
  // Calculate instantaneous RMS (Root Mean Square) energy
160
160
  let sum = 0;
161
+ let peak = 0;
161
162
  for (let i = 0; i < channel.length; i++) {
163
+ const sample = Math.abs(channel[i]);
162
164
  sum += channel[i] * channel[i];
165
+ peak = Math.max(peak, sample);
163
166
  }
164
167
  const instantRms = Math.sqrt(sum / channel.length);
165
168
 
@@ -167,30 +170,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
167
170
  // this.energy acts as the smoothed RMS value
168
171
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
169
172
 
170
- // Adaptive noise floor estimation
171
- // We use a TWO-PASS approach to avoid circular dependencies:
172
- // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
173
- const instantSnr = instantRms / (this.noiseFloor + 1e-6);
174
- const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
175
-
176
- // Adapt the noise floor based on instantaneous SNR
177
- if (instantRms < this.noiseFloor) {
178
- // Signal is quieter than noise floor, adapt downwards quickly
179
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
180
- } else if (instantSnrDb < 12) {
181
- // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
182
- // Adapt upwards at normal rate to track rising noise
183
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
173
+ // Calculate Crest Factor (peak-to-RMS ratio)
174
+ // Voice typically has crest factor of 2-4 (6-12dB)
175
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
176
+ const crestFactor = peak / (instantRms + 1e-10);
177
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
178
+
179
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
180
+ // This prevents sharp transients from affecting the noise floor
181
+ if (this.energy < this.noiseFloor) {
182
+ // Signal is quieter than noise floor, adapt downwards slowly
183
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
184
184
  } else {
185
- // Signal has high SNR (>= 12dB) - likely speech or transient
186
- // Adapt VERY slowly to avoid "chasing" speech
187
- const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
188
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
185
+ // Calculate SNR based on smoothed energy
186
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
187
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
188
+
189
+ // Only adapt upwards if:
190
+ // 1. SNR is low (< 10dB) - likely just background noise
191
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
192
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
193
+ // This is persistent background noise, adapt upwards
194
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
195
+ } else {
196
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
197
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
198
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
199
+ }
189
200
  }
190
201
 
191
202
  // Ensure noise floor doesn't drop to absolute zero
192
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
193
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
203
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
194
204
 
195
205
  // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
196
206
  const snr = this.energy / (this.noiseFloor + 1e-6);
@@ -201,11 +211,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
201
211
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
202
212
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
203
213
 
204
- // Apply absolute energy threshold
205
- // We use a soft threshold to avoid abrupt cutting
214
+ // Apply absolute energy threshold with soft knee
206
215
  if (this.energy < this.minEnergy) {
207
216
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
208
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
217
+ probability *= Math.pow(energyRatio, 2);
218
+ }
219
+
220
+ // Apply crest factor penalty
221
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
222
+ // Voice: 6-12dB, Keyboard: 20-30dB
223
+ // We penalize anything above 14dB
224
+ if (crestFactorDb > 14) {
225
+ const excess = crestFactorDb - 14;
226
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
227
+ probability *= penalty;
209
228
  }
210
229
 
211
230
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -328,17 +347,17 @@ var VADStateMachine = class {
328
347
  // Smooth for natural speech
329
348
  preRollMs: config?.preRollMs ?? 250,
330
349
  // Generous pre-roll
331
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
332
- // Increased to filter keyboard clicks
350
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
351
+ // Aggressive transient rejection
333
352
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
334
353
  energyVad: {
335
354
  smoothing: config?.energyVad?.smoothing ?? 0.95,
336
355
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
337
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
338
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
339
- minSNR: config?.energyVad?.minSNR ?? 10,
356
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
357
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
358
+ minSNR: config?.energyVad?.minSNR ?? 12,
340
359
  snrRange: config?.energyVad?.snrRange ?? 10,
341
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
360
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
342
361
  }
343
362
  };
344
363
  this.lastSilenceTime = Date.now();
@@ -1,11 +1,11 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "../chunk-RD4GDIPO.mjs";
4
- import "../chunk-DLLK6K76.mjs";
3
+ } from "../chunk-R5M2DGAQ.mjs";
4
+ import "../chunk-K4YLH73B.mjs";
5
5
  import "../chunk-OZ7KMC4S.mjs";
6
- import "../chunk-OXV7BHX5.mjs";
6
+ import "../chunk-UFKIAMG3.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-FKR6NWZF.mjs";
8
+ import "../chunk-2G2JFHJY.mjs";
9
9
  export {
10
10
  createAudioPipeline
11
11
  };
package/dist/types.d.mts CHANGED
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
70
70
  preRollMs?: number;
71
71
  /**
72
72
  * Minimum speech duration in ms to consider it valid speech.
73
- * Filters out very brief noise spikes like keyboard clicks.
74
- * Default: 150ms
73
+ * Filters out brief transients like keyboard clicks.
74
+ * Default: 250ms (aggressive transient rejection)
75
75
  */
76
76
  minSpeechDurationMs?: number;
77
77
  /**
@@ -97,18 +97,18 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.01
100
+ * Default: 0.002 (very slow downward drift)
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Applied when instantaneous SNR < 12dB (background noise).
106
- * Default: 0.1 (fast tracking of rising noise)
105
+ * Applied when the smoothed SNR is low (< 10dB) and the crest factor is low (< 15dB), i.e. persistent background noise.
106
+ * Default: 0.02
107
107
  */
108
108
  noiseFloorAdaptRateLoud?: number;
109
109
  /**
110
110
  * Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
111
- * Default: 10.0 (more aggressive noise rejection)
111
+ * Default: 12.0 (aggressive noise rejection)
112
112
  */
113
113
  minSNR?: number;
114
114
  /**
@@ -118,8 +118,8 @@ interface AudioProcessingConfig {
118
118
  snrRange?: number;
119
119
  /**
120
120
  * Minimum absolute RMS energy to consider as speech.
121
- * Prevents triggering on very quiet background noise in silent rooms.
122
- * Default: 0.001 (approx -60dB)
121
+ * Prevents triggering on very quiet background noise.
122
+ * Default: 0.003 (approx -50dB, voice-appropriate level)
123
123
  */
124
124
  minEnergy?: number;
125
125
  };
package/dist/types.d.ts CHANGED
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
70
70
  preRollMs?: number;
71
71
  /**
72
72
  * Minimum speech duration in ms to consider it valid speech.
73
- * Filters out very brief noise spikes like keyboard clicks.
74
- * Default: 150ms
73
+ * Filters out brief transients like keyboard clicks.
74
+ * Default: 250ms (aggressive transient rejection)
75
75
  */
76
76
  minSpeechDurationMs?: number;
77
77
  /**
@@ -97,18 +97,18 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.01
100
+ * Default: 0.002 (very slow downward drift)
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Applied when instantaneous SNR < 12dB (background noise).
106
- * Default: 0.1 (fast tracking of rising noise)
105
+ * Applied when the smoothed SNR is low (< 10dB) and the crest factor is low (< 15dB), i.e. persistent background noise.
106
+ * Default: 0.02
107
107
  */
108
108
  noiseFloorAdaptRateLoud?: number;
109
109
  /**
110
110
  * Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
111
- * Default: 10.0 (more aggressive noise rejection)
111
+ * Default: 12.0 (aggressive noise rejection)
112
112
  */
113
113
  minSNR?: number;
114
114
  /**
@@ -118,8 +118,8 @@ interface AudioProcessingConfig {
118
118
  snrRange?: number;
119
119
  /**
120
120
  * Minimum absolute RMS energy to consider as speech.
121
- * Prevents triggering on very quiet background noise in silent rooms.
122
- * Default: 0.001 (approx -60dB)
121
+ * Prevents triggering on very quiet background noise.
122
+ * Default: 0.003 (approx -50dB, voice-appropriate level)
123
123
  */
124
124
  minEnergy?: number;
125
125
  };
@@ -27,11 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
27
27
  const energyParams = vadConfig?.energyVad || {};
28
28
  const smoothing = energyParams.smoothing ?? 0.95;
29
29
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
30
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
31
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
32
- const minSNR = energyParams.minSNR ?? 10;
30
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
31
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
32
+ const minSNR = energyParams.minSNR ?? 12;
33
33
  const snrRange = energyParams.snrRange ?? 10;
34
- const minEnergy = energyParams.minEnergy ?? 1e-3;
34
+ const minEnergy = energyParams.minEnergy ?? 3e-3;
35
35
  return `
36
36
  class EnergyVadProcessor extends AudioWorkletProcessor {
37
37
  constructor() {
@@ -60,8 +60,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
60
60
 
61
61
  // Calculate instantaneous RMS (Root Mean Square) energy
62
62
  let sum = 0;
63
+ let peak = 0;
63
64
  for (let i = 0; i < channel.length; i++) {
65
+ const sample = Math.abs(channel[i]);
64
66
  sum += channel[i] * channel[i];
67
+ peak = Math.max(peak, sample);
65
68
  }
66
69
  const instantRms = Math.sqrt(sum / channel.length);
67
70
 
@@ -69,30 +72,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
69
72
  // this.energy acts as the smoothed RMS value
70
73
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
71
74
 
72
- // Adaptive noise floor estimation
73
- // We use a TWO-PASS approach to avoid circular dependencies:
74
- // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
75
- const instantSnr = instantRms / (this.noiseFloor + 1e-6);
76
- const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
77
-
78
- // Adapt the noise floor based on instantaneous SNR
79
- if (instantRms < this.noiseFloor) {
80
- // Signal is quieter than noise floor, adapt downwards quickly
81
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
82
- } else if (instantSnrDb < 12) {
83
- // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
84
- // Adapt upwards at normal rate to track rising noise
85
- this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
75
+ // Calculate Crest Factor (peak-to-RMS ratio)
76
+ // Voice typically has crest factor of 2-4 (6-12dB)
77
+ // Keyboard clicks have crest factor of 10-30+ (20-30dB)
78
+ const crestFactor = peak / (instantRms + 1e-10);
79
+ const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
80
+
81
+ // Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
82
+ // This prevents sharp transients from affecting the noise floor
83
+ if (this.energy < this.noiseFloor) {
84
+ // Signal is quieter than noise floor, adapt downwards slowly
85
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
86
86
  } else {
87
- // Signal has high SNR (>= 12dB) - likely speech or transient
88
- // Adapt VERY slowly to avoid "chasing" speech
89
- const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
90
- this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
87
+ // Calculate SNR based on smoothed energy
88
+ const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
89
+ const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
90
+
91
+ // Only adapt upwards if:
92
+ // 1. SNR is low (< 10dB) - likely just background noise
93
+ // 2. AND crest factor is low (< 15dB) - not a sharp transient
94
+ if (smoothedSnrDb < 10 && crestFactorDb < 15) {
95
+ // This is persistent background noise, adapt upwards
96
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
97
+ } else {
98
+ // Either high SNR (speech) or high crest factor (click) - adapt very slowly
99
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
100
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
101
+ }
91
102
  }
92
103
 
93
104
  // Ensure noise floor doesn't drop to absolute zero
94
- // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
95
- this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
105
+ this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
96
106
 
97
107
  // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
98
108
  const snr = this.energy / (this.noiseFloor + 1e-6);
@@ -103,11 +113,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
103
113
  // Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
104
114
  let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
105
115
 
106
- // Apply absolute energy threshold
107
- // We use a soft threshold to avoid abrupt cutting
116
+ // Apply absolute energy threshold with soft knee
108
117
  if (this.energy < this.minEnergy) {
109
118
  const energyRatio = this.energy / (this.minEnergy + 1e-6);
110
- probability *= Math.pow(energyRatio, 2); // Quadratic falloff
119
+ probability *= Math.pow(energyRatio, 2);
120
+ }
121
+
122
+ // Apply crest factor penalty
123
+ // Reject signals with high crest factor (sharp transients like keyboard clicks)
124
+ // Voice: 6-12dB, Keyboard: 20-30dB
125
+ // We penalize anything above 14dB
126
+ if (crestFactorDb > 14) {
127
+ const excess = crestFactorDb - 14;
128
+ const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
129
+ probability *= penalty;
111
130
  }
112
131
 
113
132
  this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  EnergyVADPlugin
3
- } from "../chunk-FKR6NWZF.mjs";
3
+ } from "../chunk-2G2JFHJY.mjs";
4
4
  export {
5
5
  EnergyVADPlugin
6
6
  };
@@ -44,17 +44,17 @@ var VADStateMachine = class {
44
44
  // Smooth for natural speech
45
45
  preRollMs: config?.preRollMs ?? 250,
46
46
  // Generous pre-roll
47
- minSpeechDurationMs: config?.minSpeechDurationMs ?? 150,
48
- // Increased to filter keyboard clicks
47
+ minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
48
+ // Aggressive transient rejection
49
49
  minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
50
50
  energyVad: {
51
51
  smoothing: config?.energyVad?.smoothing ?? 0.95,
52
52
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
53
- noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
54
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
55
- minSNR: config?.energyVad?.minSNR ?? 10,
53
+ noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
54
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
55
+ minSNR: config?.energyVad?.minSNR ?? 12,
56
56
  snrRange: config?.energyVad?.snrRange ?? 10,
57
- minEnergy: config?.energyVad?.minEnergy ?? 1e-3
57
+ minEnergy: config?.energyVad?.minEnergy ?? 3e-3
58
58
  }
59
59
  };
60
60
  this.lastSilenceTime = Date.now();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "../chunk-DLLK6K76.mjs";
3
+ } from "../chunk-K4YLH73B.mjs";
4
4
  export {
5
5
  VADStateMachine
6
6
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tensamin/audio",
3
- "version": "0.1.13",
3
+ "version": "0.1.14",
4
4
  "main": "dist/index.js",
5
5
  "module": "dist/index.mjs",
6
6
  "types": "dist/index.d.ts",