@tensamin/audio 0.1.11 → 0.1.13

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -101,8 +101,8 @@ vad: {
101
101
  energyVad?: {
102
102
  smoothing: number; // Default: 0.95
103
103
  initialNoiseFloor: number; // Default: 0.001
104
- noiseFloorAdaptRateQuiet: number; // Default: 0.005
105
- noiseFloorAdaptRateLoud: number; // Default: 0.01
104
+ noiseFloorAdaptRateQuiet: number; // Default: 0.01
105
+ noiseFloorAdaptRateLoud: number; // Default: 0.1
106
106
  minSNR: number; // Default: 10.0 (dB)
107
107
  snrRange: number; // Default: 10.0 (dB)
108
108
  minEnergy: number; // Default: 0.001
@@ -27,7 +27,7 @@ var VADStateMachine = class {
27
27
  smoothing: config?.energyVad?.smoothing ?? 0.95,
28
28
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
29
29
  noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
30
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
30
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
31
31
  minSNR: config?.energyVad?.minSNR ?? 10,
32
32
  snrRange: config?.energyVad?.snrRange ?? 10,
33
33
  minEnergy: config?.energyVad?.minEnergy ?? 1e-3
@@ -61,25 +61,33 @@ var VADStateMachine = class {
61
61
  newState = "silent";
62
62
  this.lastSilenceTime = timestamp;
63
63
  }
64
- } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
64
+ } else if (this.currentState === "speech_starting") {
65
+ if (probability >= stopThreshold) {
66
+ const speechDuration = timestamp - this.speechStartTime;
67
+ if (speechDuration >= minSpeechDurationMs) {
68
+ newState = "speaking";
69
+ } else {
70
+ newState = "speech_starting";
71
+ }
72
+ this.lastSpeechTime = timestamp;
73
+ } else {
74
+ newState = "silent";
75
+ this.lastSilenceTime = timestamp;
76
+ }
77
+ } else if (this.currentState === "speaking") {
65
78
  if (probability >= stopThreshold) {
66
79
  newState = "speaking";
67
80
  this.lastSpeechTime = timestamp;
68
81
  } else {
69
82
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
70
- const speechDuration = timestamp - this.speechStartTime;
71
83
  if (timeSinceSpeech < hangoverMs) {
72
84
  newState = "speaking";
73
- } else if (speechDuration < minSpeechDurationMs) {
74
- newState = "silent";
75
- this.lastSilenceTime = timestamp;
76
85
  } else {
77
86
  newState = "speech_ending";
78
87
  this.lastSilenceTime = timestamp;
79
88
  }
80
89
  }
81
90
  }
82
- if (newState === "speech_starting") newState = "speaking";
83
91
  if (newState === "speech_ending") newState = "silent";
84
92
  this.currentState = newState;
85
93
  return {
@@ -3,8 +3,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
3
3
  const energyParams = vadConfig?.energyVad || {};
4
4
  const smoothing = energyParams.smoothing ?? 0.95;
5
5
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
6
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 5e-3;
7
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
6
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
7
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
8
8
  const minSNR = energyParams.minSNR ?? 10;
9
9
  const snrRange = energyParams.snrRange ?? 10;
10
10
  const minEnergy = energyParams.minEnergy ?? 1e-3;
@@ -46,35 +46,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
46
46
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
47
47
 
48
48
  // Adaptive noise floor estimation
49
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
49
+ // We use a TWO-PASS approach to avoid circular dependencies:
50
+ // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
51
+ const instantSnr = instantRms / (this.noiseFloor + 1e-6);
52
+ const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
53
+
54
+ // Adapt the noise floor based on instantaneous SNR
50
55
  if (instantRms < this.noiseFloor) {
51
- // If signal is quieter than noise floor, adapt downwards quickly
56
+ // Signal is quieter than noise floor, adapt downwards quickly
52
57
  this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
58
+ } else if (instantSnrDb < 12) {
59
+ // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
60
+ // Adapt upwards at normal rate to track rising noise
61
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
53
62
  } else {
54
- // If signal is louder, adapt upwards
55
- // We use a multi-stage adaptation rate:
56
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
57
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
58
- // 3. Otherwise, adapt at the normal loud rate
59
- const snr = instantRms / (this.noiseFloor + 1e-6);
60
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
61
-
62
- let multiplier = 1.0;
63
- if (this.isSpeaking) {
64
- multiplier = 0.01;
65
- } else if (snrDb > 20) {
66
- multiplier = 0.1;
67
- }
68
-
69
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
70
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
63
+ // Signal has high SNR (>= 12dB) - likely speech or transient
64
+ // Adapt VERY slowly to avoid "chasing" speech
65
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
66
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
71
67
  }
72
68
 
73
69
  // Ensure noise floor doesn't drop to absolute zero
74
70
  // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
75
71
  this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
76
72
 
77
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
73
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
78
74
  const snr = this.energy / (this.noiseFloor + 1e-6);
79
75
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
80
76
 
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "./chunk-I5AR7XQD.mjs";
3
+ } from "./chunk-RD4GDIPO.mjs";
4
4
 
5
5
  // src/livekit/integration.ts
6
6
  async function attachProcessingToTrack(track, config = {}) {
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-XO6B3D4A.mjs";
4
4
  import {
5
5
  EnergyVADPlugin
6
- } from "./chunk-B36JBXOK.mjs";
6
+ } from "./chunk-FKR6NWZF.mjs";
7
7
 
8
8
  // src/extensibility/plugins.ts
9
9
  var nsPlugins = /* @__PURE__ */ new Map();
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "./chunk-GFLVGUTU.mjs";
3
+ } from "./chunk-DLLK6K76.mjs";
4
4
  import {
5
5
  getAudioContext,
6
6
  registerPipeline,
@@ -9,7 +9,7 @@ import {
9
9
  import {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin
12
- } from "./chunk-3I4OQD2L.mjs";
12
+ } from "./chunk-OXV7BHX5.mjs";
13
13
 
14
14
  // src/pipeline/audio-pipeline.ts
15
15
  import mitt from "mitt";
@@ -106,8 +106,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
106
106
  const energyParams = vadConfig?.energyVad || {};
107
107
  const smoothing = energyParams.smoothing ?? 0.95;
108
108
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
109
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 5e-3;
110
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
109
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
110
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
111
111
  const minSNR = energyParams.minSNR ?? 10;
112
112
  const snrRange = energyParams.snrRange ?? 10;
113
113
  const minEnergy = energyParams.minEnergy ?? 1e-3;
@@ -149,35 +149,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
149
149
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
150
150
 
151
151
  // Adaptive noise floor estimation
152
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
152
+ // We use a TWO-PASS approach to avoid circular dependencies:
153
+ // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
154
+ const instantSnr = instantRms / (this.noiseFloor + 1e-6);
155
+ const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
156
+
157
+ // Adapt the noise floor based on instantaneous SNR
153
158
  if (instantRms < this.noiseFloor) {
154
- // If signal is quieter than noise floor, adapt downwards quickly
159
+ // Signal is quieter than noise floor, adapt downwards quickly
155
160
  this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
161
+ } else if (instantSnrDb < 12) {
162
+ // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
163
+ // Adapt upwards at normal rate to track rising noise
164
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
156
165
  } else {
157
- // If signal is louder, adapt upwards
158
- // We use a multi-stage adaptation rate:
159
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
160
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
161
- // 3. Otherwise, adapt at the normal loud rate
162
- const snr = instantRms / (this.noiseFloor + 1e-6);
163
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
164
-
165
- let multiplier = 1.0;
166
- if (this.isSpeaking) {
167
- multiplier = 0.01;
168
- } else if (snrDb > 20) {
169
- multiplier = 0.1;
170
- }
171
-
172
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
173
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
166
+ // Signal has high SNR (>= 12dB) - likely speech or transient
167
+ // Adapt VERY slowly to avoid "chasing" speech
168
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
169
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
174
170
  }
175
171
 
176
172
  // Ensure noise floor doesn't drop to absolute zero
177
173
  // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
178
174
  this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
179
175
 
180
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
176
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
181
177
  const snr = this.energy / (this.noiseFloor + 1e-6);
182
178
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
183
179
 
@@ -3,9 +3,9 @@ import {
3
3
  getVADPlugin,
4
4
  registerNoiseSuppressionPlugin,
5
5
  registerVADPlugin
6
- } from "../chunk-3I4OQD2L.mjs";
6
+ } from "../chunk-OXV7BHX5.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-B36JBXOK.mjs";
8
+ import "../chunk-FKR6NWZF.mjs";
9
9
  export {
10
10
  getNoiseSuppressionPlugin,
11
11
  getVADPlugin,
package/dist/index.js CHANGED
@@ -158,8 +158,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
158
158
  const energyParams = vadConfig?.energyVad || {};
159
159
  const smoothing = energyParams.smoothing ?? 0.95;
160
160
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
161
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 5e-3;
162
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
161
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
162
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
163
163
  const minSNR = energyParams.minSNR ?? 10;
164
164
  const snrRange = energyParams.snrRange ?? 10;
165
165
  const minEnergy = energyParams.minEnergy ?? 1e-3;
@@ -201,35 +201,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
201
201
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
202
202
 
203
203
  // Adaptive noise floor estimation
204
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
204
+ // We use a TWO-PASS approach to avoid circular dependencies:
205
+ // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
206
+ const instantSnr = instantRms / (this.noiseFloor + 1e-6);
207
+ const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
208
+
209
+ // Adapt the noise floor based on instantaneous SNR
205
210
  if (instantRms < this.noiseFloor) {
206
- // If signal is quieter than noise floor, adapt downwards quickly
211
+ // Signal is quieter than noise floor, adapt downwards quickly
207
212
  this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
213
+ } else if (instantSnrDb < 12) {
214
+ // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
215
+ // Adapt upwards at normal rate to track rising noise
216
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
208
217
  } else {
209
- // If signal is louder, adapt upwards
210
- // We use a multi-stage adaptation rate:
211
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
212
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
213
- // 3. Otherwise, adapt at the normal loud rate
214
- const snr = instantRms / (this.noiseFloor + 1e-6);
215
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
216
-
217
- let multiplier = 1.0;
218
- if (this.isSpeaking) {
219
- multiplier = 0.01;
220
- } else if (snrDb > 20) {
221
- multiplier = 0.1;
222
- }
223
-
224
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
225
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
218
+ // Signal has high SNR (>= 12dB) - likely speech or transient
219
+ // Adapt VERY slowly to avoid "chasing" speech
220
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
221
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
226
222
  }
227
223
 
228
224
  // Ensure noise floor doesn't drop to absolute zero
229
225
  // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
230
226
  this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
231
227
 
232
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
228
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
233
229
  const snr = this.energy / (this.noiseFloor + 1e-6);
234
230
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
235
231
 
@@ -378,7 +374,7 @@ var VADStateMachine = class {
378
374
  smoothing: config?.energyVad?.smoothing ?? 0.95,
379
375
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
380
376
  noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
381
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
377
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
382
378
  minSNR: config?.energyVad?.minSNR ?? 10,
383
379
  snrRange: config?.energyVad?.snrRange ?? 10,
384
380
  minEnergy: config?.energyVad?.minEnergy ?? 1e-3
@@ -412,25 +408,33 @@ var VADStateMachine = class {
412
408
  newState = "silent";
413
409
  this.lastSilenceTime = timestamp;
414
410
  }
415
- } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
411
+ } else if (this.currentState === "speech_starting") {
412
+ if (probability >= stopThreshold) {
413
+ const speechDuration = timestamp - this.speechStartTime;
414
+ if (speechDuration >= minSpeechDurationMs) {
415
+ newState = "speaking";
416
+ } else {
417
+ newState = "speech_starting";
418
+ }
419
+ this.lastSpeechTime = timestamp;
420
+ } else {
421
+ newState = "silent";
422
+ this.lastSilenceTime = timestamp;
423
+ }
424
+ } else if (this.currentState === "speaking") {
416
425
  if (probability >= stopThreshold) {
417
426
  newState = "speaking";
418
427
  this.lastSpeechTime = timestamp;
419
428
  } else {
420
429
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
421
- const speechDuration = timestamp - this.speechStartTime;
422
430
  if (timeSinceSpeech < hangoverMs) {
423
431
  newState = "speaking";
424
- } else if (speechDuration < minSpeechDurationMs) {
425
- newState = "silent";
426
- this.lastSilenceTime = timestamp;
427
432
  } else {
428
433
  newState = "speech_ending";
429
434
  this.lastSilenceTime = timestamp;
430
435
  }
431
436
  }
432
437
  }
433
- if (newState === "speech_starting") newState = "speaking";
434
438
  if (newState === "speech_ending") newState = "silent";
435
439
  this.currentState = newState;
436
440
  return {
package/dist/index.mjs CHANGED
@@ -1,13 +1,13 @@
1
1
  import "./chunk-WBQAMGXK.mjs";
2
2
  import {
3
3
  attachProcessingToTrack
4
- } from "./chunk-RLZVZ6D6.mjs";
4
+ } from "./chunk-K6X52R7N.mjs";
5
5
  import {
6
6
  createAudioPipeline
7
- } from "./chunk-I5AR7XQD.mjs";
7
+ } from "./chunk-RD4GDIPO.mjs";
8
8
  import {
9
9
  VADStateMachine
10
- } from "./chunk-GFLVGUTU.mjs";
10
+ } from "./chunk-DLLK6K76.mjs";
11
11
  import {
12
12
  closeAudioContext,
13
13
  getAudioContext,
@@ -21,13 +21,13 @@ import {
21
21
  getVADPlugin,
22
22
  registerNoiseSuppressionPlugin,
23
23
  registerVADPlugin
24
- } from "./chunk-3I4OQD2L.mjs";
24
+ } from "./chunk-OXV7BHX5.mjs";
25
25
  import {
26
26
  RNNoisePlugin
27
27
  } from "./chunk-XO6B3D4A.mjs";
28
28
  import {
29
29
  EnergyVADPlugin
30
- } from "./chunk-B36JBXOK.mjs";
30
+ } from "./chunk-FKR6NWZF.mjs";
31
31
  export {
32
32
  EnergyVADPlugin,
33
33
  RNNoisePlugin,
@@ -127,8 +127,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
127
127
  const energyParams = vadConfig?.energyVad || {};
128
128
  const smoothing = energyParams.smoothing ?? 0.95;
129
129
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
130
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 5e-3;
131
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
130
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
131
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
132
132
  const minSNR = energyParams.minSNR ?? 10;
133
133
  const snrRange = energyParams.snrRange ?? 10;
134
134
  const minEnergy = energyParams.minEnergy ?? 1e-3;
@@ -170,35 +170,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
170
170
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
171
171
 
172
172
  // Adaptive noise floor estimation
173
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
173
+ // We use a TWO-PASS approach to avoid circular dependencies:
174
+ // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
175
+ const instantSnr = instantRms / (this.noiseFloor + 1e-6);
176
+ const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
177
+
178
+ // Adapt the noise floor based on instantaneous SNR
174
179
  if (instantRms < this.noiseFloor) {
175
- // If signal is quieter than noise floor, adapt downwards quickly
180
+ // Signal is quieter than noise floor, adapt downwards quickly
176
181
  this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
182
+ } else if (instantSnrDb < 12) {
183
+ // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
184
+ // Adapt upwards at normal rate to track rising noise
185
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
177
186
  } else {
178
- // If signal is louder, adapt upwards
179
- // We use a multi-stage adaptation rate:
180
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
181
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
182
- // 3. Otherwise, adapt at the normal loud rate
183
- const snr = instantRms / (this.noiseFloor + 1e-6);
184
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
185
-
186
- let multiplier = 1.0;
187
- if (this.isSpeaking) {
188
- multiplier = 0.01;
189
- } else if (snrDb > 20) {
190
- multiplier = 0.1;
191
- }
192
-
193
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
194
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
187
+ // Signal has high SNR (>= 12dB) - likely speech or transient
188
+ // Adapt VERY slowly to avoid "chasing" speech
189
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
190
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
195
191
  }
196
192
 
197
193
  // Ensure noise floor doesn't drop to absolute zero
198
194
  // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
199
195
  this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
200
196
 
201
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
197
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
202
198
  const snr = this.energy / (this.noiseFloor + 1e-6);
203
199
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
204
200
 
@@ -341,7 +337,7 @@ var VADStateMachine = class {
341
337
  smoothing: config?.energyVad?.smoothing ?? 0.95,
342
338
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
343
339
  noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
344
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
340
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
345
341
  minSNR: config?.energyVad?.minSNR ?? 10,
346
342
  snrRange: config?.energyVad?.snrRange ?? 10,
347
343
  minEnergy: config?.energyVad?.minEnergy ?? 1e-3
@@ -375,25 +371,33 @@ var VADStateMachine = class {
375
371
  newState = "silent";
376
372
  this.lastSilenceTime = timestamp;
377
373
  }
378
- } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
374
+ } else if (this.currentState === "speech_starting") {
375
+ if (probability >= stopThreshold) {
376
+ const speechDuration = timestamp - this.speechStartTime;
377
+ if (speechDuration >= minSpeechDurationMs) {
378
+ newState = "speaking";
379
+ } else {
380
+ newState = "speech_starting";
381
+ }
382
+ this.lastSpeechTime = timestamp;
383
+ } else {
384
+ newState = "silent";
385
+ this.lastSilenceTime = timestamp;
386
+ }
387
+ } else if (this.currentState === "speaking") {
379
388
  if (probability >= stopThreshold) {
380
389
  newState = "speaking";
381
390
  this.lastSpeechTime = timestamp;
382
391
  } else {
383
392
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
384
- const speechDuration = timestamp - this.speechStartTime;
385
393
  if (timeSinceSpeech < hangoverMs) {
386
394
  newState = "speaking";
387
- } else if (speechDuration < minSpeechDurationMs) {
388
- newState = "silent";
389
- this.lastSilenceTime = timestamp;
390
395
  } else {
391
396
  newState = "speech_ending";
392
397
  this.lastSilenceTime = timestamp;
393
398
  }
394
399
  }
395
400
  }
396
- if (newState === "speech_starting") newState = "speaking";
397
401
  if (newState === "speech_ending") newState = "silent";
398
402
  this.currentState = newState;
399
403
  return {
@@ -1,12 +1,12 @@
1
1
  import {
2
2
  attachProcessingToTrack
3
- } from "../chunk-RLZVZ6D6.mjs";
4
- import "../chunk-I5AR7XQD.mjs";
5
- import "../chunk-GFLVGUTU.mjs";
3
+ } from "../chunk-K6X52R7N.mjs";
4
+ import "../chunk-RD4GDIPO.mjs";
5
+ import "../chunk-DLLK6K76.mjs";
6
6
  import "../chunk-OZ7KMC4S.mjs";
7
- import "../chunk-3I4OQD2L.mjs";
7
+ import "../chunk-OXV7BHX5.mjs";
8
8
  import "../chunk-XO6B3D4A.mjs";
9
- import "../chunk-B36JBXOK.mjs";
9
+ import "../chunk-FKR6NWZF.mjs";
10
10
  export {
11
11
  attachProcessingToTrack
12
12
  };
@@ -125,8 +125,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
125
125
  const energyParams = vadConfig?.energyVad || {};
126
126
  const smoothing = energyParams.smoothing ?? 0.95;
127
127
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
128
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 5e-3;
129
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
128
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
129
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
130
130
  const minSNR = energyParams.minSNR ?? 10;
131
131
  const snrRange = energyParams.snrRange ?? 10;
132
132
  const minEnergy = energyParams.minEnergy ?? 1e-3;
@@ -168,35 +168,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
168
168
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
169
169
 
170
170
  // Adaptive noise floor estimation
171
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
171
+ // We use a TWO-PASS approach to avoid circular dependencies:
172
+ // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
173
+ const instantSnr = instantRms / (this.noiseFloor + 1e-6);
174
+ const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
175
+
176
+ // Adapt the noise floor based on instantaneous SNR
172
177
  if (instantRms < this.noiseFloor) {
173
- // If signal is quieter than noise floor, adapt downwards quickly
178
+ // Signal is quieter than noise floor, adapt downwards quickly
174
179
  this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
180
+ } else if (instantSnrDb < 12) {
181
+ // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
182
+ // Adapt upwards at normal rate to track rising noise
183
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
175
184
  } else {
176
- // If signal is louder, adapt upwards
177
- // We use a multi-stage adaptation rate:
178
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
179
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
180
- // 3. Otherwise, adapt at the normal loud rate
181
- const snr = instantRms / (this.noiseFloor + 1e-6);
182
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
183
-
184
- let multiplier = 1.0;
185
- if (this.isSpeaking) {
186
- multiplier = 0.01;
187
- } else if (snrDb > 20) {
188
- multiplier = 0.1;
189
- }
190
-
191
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
192
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
185
+ // Signal has high SNR (>= 12dB) - likely speech or transient
186
+ // Adapt VERY slowly to avoid "chasing" speech
187
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
188
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
193
189
  }
194
190
 
195
191
  // Ensure noise floor doesn't drop to absolute zero
196
192
  // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
197
193
  this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
198
194
 
199
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
195
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
200
196
  const snr = this.energy / (this.noiseFloor + 1e-6);
201
197
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
202
198
 
@@ -339,7 +335,7 @@ var VADStateMachine = class {
339
335
  smoothing: config?.energyVad?.smoothing ?? 0.95,
340
336
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
341
337
  noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
342
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
338
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
343
339
  minSNR: config?.energyVad?.minSNR ?? 10,
344
340
  snrRange: config?.energyVad?.snrRange ?? 10,
345
341
  minEnergy: config?.energyVad?.minEnergy ?? 1e-3
@@ -373,25 +369,33 @@ var VADStateMachine = class {
373
369
  newState = "silent";
374
370
  this.lastSilenceTime = timestamp;
375
371
  }
376
- } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
372
+ } else if (this.currentState === "speech_starting") {
373
+ if (probability >= stopThreshold) {
374
+ const speechDuration = timestamp - this.speechStartTime;
375
+ if (speechDuration >= minSpeechDurationMs) {
376
+ newState = "speaking";
377
+ } else {
378
+ newState = "speech_starting";
379
+ }
380
+ this.lastSpeechTime = timestamp;
381
+ } else {
382
+ newState = "silent";
383
+ this.lastSilenceTime = timestamp;
384
+ }
385
+ } else if (this.currentState === "speaking") {
377
386
  if (probability >= stopThreshold) {
378
387
  newState = "speaking";
379
388
  this.lastSpeechTime = timestamp;
380
389
  } else {
381
390
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
382
- const speechDuration = timestamp - this.speechStartTime;
383
391
  if (timeSinceSpeech < hangoverMs) {
384
392
  newState = "speaking";
385
- } else if (speechDuration < minSpeechDurationMs) {
386
- newState = "silent";
387
- this.lastSilenceTime = timestamp;
388
393
  } else {
389
394
  newState = "speech_ending";
390
395
  this.lastSilenceTime = timestamp;
391
396
  }
392
397
  }
393
398
  }
394
- if (newState === "speech_starting") newState = "speaking";
395
399
  if (newState === "speech_ending") newState = "silent";
396
400
  this.currentState = newState;
397
401
  return {
@@ -1,11 +1,11 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "../chunk-I5AR7XQD.mjs";
4
- import "../chunk-GFLVGUTU.mjs";
3
+ } from "../chunk-RD4GDIPO.mjs";
4
+ import "../chunk-DLLK6K76.mjs";
5
5
  import "../chunk-OZ7KMC4S.mjs";
6
- import "../chunk-3I4OQD2L.mjs";
6
+ import "../chunk-OXV7BHX5.mjs";
7
7
  import "../chunk-XO6B3D4A.mjs";
8
- import "../chunk-B36JBXOK.mjs";
8
+ import "../chunk-FKR6NWZF.mjs";
9
9
  export {
10
10
  createAudioPipeline
11
11
  };
package/dist/types.d.mts CHANGED
@@ -97,12 +97,13 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.005 (slower downward drift)
100
+ * Default: 0.01
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Default: 0.01
105
+ * Applied when instantaneous SNR < 12dB (background noise).
106
+ * Default: 0.1 (fast tracking of rising noise)
106
107
  */
107
108
  noiseFloorAdaptRateLoud?: number;
108
109
  /**
package/dist/types.d.ts CHANGED
@@ -97,12 +97,13 @@ interface AudioProcessingConfig {
97
97
  initialNoiseFloor?: number;
98
98
  /**
99
99
  * Rate at which noise floor adapts to quiet signals (0-1).
100
- * Default: 0.005 (slower downward drift)
100
+ * Default: 0.01
101
101
  */
102
102
  noiseFloorAdaptRateQuiet?: number;
103
103
  /**
104
104
  * Rate at which noise floor adapts to loud signals (0-1).
105
- * Default: 0.01
105
+ * Applied when instantaneous SNR < 12dB (background noise).
106
+ * Default: 0.1 (fast tracking of rising noise)
106
107
  */
107
108
  noiseFloorAdaptRateLoud?: number;
108
109
  /**
@@ -27,8 +27,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
27
27
  const energyParams = vadConfig?.energyVad || {};
28
28
  const smoothing = energyParams.smoothing ?? 0.95;
29
29
  const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
30
- const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 5e-3;
31
- const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
30
+ const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
31
+ const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
32
32
  const minSNR = energyParams.minSNR ?? 10;
33
33
  const snrRange = energyParams.snrRange ?? 10;
34
34
  const minEnergy = energyParams.minEnergy ?? 1e-3;
@@ -70,35 +70,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
70
70
  this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
71
71
 
72
72
  // Adaptive noise floor estimation
73
- // We use the instantaneous RMS for noise floor tracking to react quickly to silence
73
+ // We use a TWO-PASS approach to avoid circular dependencies:
74
+ // FIRST PASS: Calculate instantaneous SNR to decide how to adapt
75
+ const instantSnr = instantRms / (this.noiseFloor + 1e-6);
76
+ const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
77
+
78
+ // Adapt the noise floor based on instantaneous SNR
74
79
  if (instantRms < this.noiseFloor) {
75
- // If signal is quieter than noise floor, adapt downwards quickly
80
+ // Signal is quieter than noise floor, adapt downwards quickly
76
81
  this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
82
+ } else if (instantSnrDb < 12) {
83
+ // Signal is louder but SNR is low (< 12dB) - likely just louder background noise
84
+ // Adapt upwards at normal rate to track rising noise
85
+ this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
77
86
  } else {
78
- // If signal is louder, adapt upwards
79
- // We use a multi-stage adaptation rate:
80
- // 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
81
- // 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
82
- // 3. Otherwise, adapt at the normal loud rate
83
- const snr = instantRms / (this.noiseFloor + 1e-6);
84
- const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
85
-
86
- let multiplier = 1.0;
87
- if (this.isSpeaking) {
88
- multiplier = 0.01;
89
- } else if (snrDb > 20) {
90
- multiplier = 0.1;
91
- }
92
-
93
- const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
94
- this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
87
+ // Signal has high SNR (>= 12dB) - likely speech or transient
88
+ // Adapt VERY slowly to avoid "chasing" speech
89
+ const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
90
+ this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
95
91
  }
96
92
 
97
93
  // Ensure noise floor doesn't drop to absolute zero
98
94
  // 0.00005 is approx -86dB, very quiet but prevents SNR explosion
99
95
  this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
100
96
 
101
- // Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
97
+ // SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
102
98
  const snr = this.energy / (this.noiseFloor + 1e-6);
103
99
  const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
104
100
 
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  EnergyVADPlugin
3
- } from "../chunk-B36JBXOK.mjs";
3
+ } from "../chunk-FKR6NWZF.mjs";
4
4
  export {
5
5
  EnergyVADPlugin
6
6
  };
@@ -51,7 +51,7 @@ var VADStateMachine = class {
51
51
  smoothing: config?.energyVad?.smoothing ?? 0.95,
52
52
  initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
53
53
  noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
54
- noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
54
+ noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
55
55
  minSNR: config?.energyVad?.minSNR ?? 10,
56
56
  snrRange: config?.energyVad?.snrRange ?? 10,
57
57
  minEnergy: config?.energyVad?.minEnergy ?? 1e-3
@@ -85,25 +85,33 @@ var VADStateMachine = class {
85
85
  newState = "silent";
86
86
  this.lastSilenceTime = timestamp;
87
87
  }
88
- } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
88
+ } else if (this.currentState === "speech_starting") {
89
+ if (probability >= stopThreshold) {
90
+ const speechDuration = timestamp - this.speechStartTime;
91
+ if (speechDuration >= minSpeechDurationMs) {
92
+ newState = "speaking";
93
+ } else {
94
+ newState = "speech_starting";
95
+ }
96
+ this.lastSpeechTime = timestamp;
97
+ } else {
98
+ newState = "silent";
99
+ this.lastSilenceTime = timestamp;
100
+ }
101
+ } else if (this.currentState === "speaking") {
89
102
  if (probability >= stopThreshold) {
90
103
  newState = "speaking";
91
104
  this.lastSpeechTime = timestamp;
92
105
  } else {
93
106
  const timeSinceSpeech = timestamp - this.lastSpeechTime;
94
- const speechDuration = timestamp - this.speechStartTime;
95
107
  if (timeSinceSpeech < hangoverMs) {
96
108
  newState = "speaking";
97
- } else if (speechDuration < minSpeechDurationMs) {
98
- newState = "silent";
99
- this.lastSilenceTime = timestamp;
100
109
  } else {
101
110
  newState = "speech_ending";
102
111
  this.lastSilenceTime = timestamp;
103
112
  }
104
113
  }
105
114
  }
106
- if (newState === "speech_starting") newState = "speaking";
107
115
  if (newState === "speech_ending") newState = "silent";
108
116
  this.currentState = newState;
109
117
  return {
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  VADStateMachine
3
- } from "../chunk-GFLVGUTU.mjs";
3
+ } from "../chunk-DLLK6K76.mjs";
4
4
  export {
5
5
  VADStateMachine
6
6
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tensamin/audio",
3
- "version": "0.1.11",
3
+ "version": "0.1.13",
4
4
  "main": "dist/index.js",
5
5
  "module": "dist/index.mjs",
6
6
  "types": "dist/index.d.ts",