@tensamin/audio 0.1.11 → 0.1.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/{chunk-GFLVGUTU.mjs → chunk-DLLK6K76.mjs} +15 -7
- package/dist/{chunk-B36JBXOK.mjs → chunk-FKR6NWZF.mjs} +18 -22
- package/dist/{chunk-RLZVZ6D6.mjs → chunk-K6X52R7N.mjs} +1 -1
- package/dist/{chunk-3I4OQD2L.mjs → chunk-OXV7BHX5.mjs} +1 -1
- package/dist/{chunk-I5AR7XQD.mjs → chunk-RD4GDIPO.mjs} +2 -2
- package/dist/extensibility/plugins.js +18 -22
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +33 -29
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +33 -29
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +33 -29
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +3 -2
- package/dist/types.d.ts +3 -2
- package/dist/vad/vad-node.js +18 -22
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.js +15 -7
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -101,8 +101,8 @@ vad: {
|
|
|
101
101
|
energyVad?: {
|
|
102
102
|
smoothing: number; // Default: 0.95
|
|
103
103
|
initialNoiseFloor: number; // Default: 0.001
|
|
104
|
-
noiseFloorAdaptRateQuiet: number; // Default: 0.
|
|
105
|
-
noiseFloorAdaptRateLoud: number; // Default: 0.
|
|
104
|
+
noiseFloorAdaptRateQuiet: number; // Default: 0.01
|
|
105
|
+
noiseFloorAdaptRateLoud: number; // Default: 0.1
|
|
106
106
|
minSNR: number; // Default: 10.0 (dB)
|
|
107
107
|
snrRange: number; // Default: 10.0 (dB)
|
|
108
108
|
minEnergy: number; // Default: 0.001
|
|
@@ -27,7 +27,7 @@ var VADStateMachine = class {
|
|
|
27
27
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
28
28
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
29
29
|
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
|
|
30
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
30
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
|
|
31
31
|
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
32
32
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
33
33
|
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
@@ -61,25 +61,33 @@ var VADStateMachine = class {
|
|
|
61
61
|
newState = "silent";
|
|
62
62
|
this.lastSilenceTime = timestamp;
|
|
63
63
|
}
|
|
64
|
-
} else if (this.currentState === "speech_starting"
|
|
64
|
+
} else if (this.currentState === "speech_starting") {
|
|
65
|
+
if (probability >= stopThreshold) {
|
|
66
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
67
|
+
if (speechDuration >= minSpeechDurationMs) {
|
|
68
|
+
newState = "speaking";
|
|
69
|
+
} else {
|
|
70
|
+
newState = "speech_starting";
|
|
71
|
+
}
|
|
72
|
+
this.lastSpeechTime = timestamp;
|
|
73
|
+
} else {
|
|
74
|
+
newState = "silent";
|
|
75
|
+
this.lastSilenceTime = timestamp;
|
|
76
|
+
}
|
|
77
|
+
} else if (this.currentState === "speaking") {
|
|
65
78
|
if (probability >= stopThreshold) {
|
|
66
79
|
newState = "speaking";
|
|
67
80
|
this.lastSpeechTime = timestamp;
|
|
68
81
|
} else {
|
|
69
82
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
70
|
-
const speechDuration = timestamp - this.speechStartTime;
|
|
71
83
|
if (timeSinceSpeech < hangoverMs) {
|
|
72
84
|
newState = "speaking";
|
|
73
|
-
} else if (speechDuration < minSpeechDurationMs) {
|
|
74
|
-
newState = "silent";
|
|
75
|
-
this.lastSilenceTime = timestamp;
|
|
76
85
|
} else {
|
|
77
86
|
newState = "speech_ending";
|
|
78
87
|
this.lastSilenceTime = timestamp;
|
|
79
88
|
}
|
|
80
89
|
}
|
|
81
90
|
}
|
|
82
|
-
if (newState === "speech_starting") newState = "speaking";
|
|
83
91
|
if (newState === "speech_ending") newState = "silent";
|
|
84
92
|
this.currentState = newState;
|
|
85
93
|
return {
|
|
@@ -3,8 +3,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
3
3
|
const energyParams = vadConfig?.energyVad || {};
|
|
4
4
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
5
5
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
6
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
7
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
6
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
7
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
|
|
8
8
|
const minSNR = energyParams.minSNR ?? 10;
|
|
9
9
|
const snrRange = energyParams.snrRange ?? 10;
|
|
10
10
|
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
@@ -46,35 +46,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
46
46
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
47
47
|
|
|
48
48
|
// Adaptive noise floor estimation
|
|
49
|
-
// We use
|
|
49
|
+
// We use a TWO-PASS approach to avoid circular dependencies:
|
|
50
|
+
// FIRST PASS: Calculate instantaneous SNR to decide how to adapt
|
|
51
|
+
const instantSnr = instantRms / (this.noiseFloor + 1e-6);
|
|
52
|
+
const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
|
|
53
|
+
|
|
54
|
+
// Adapt the noise floor based on instantaneous SNR
|
|
50
55
|
if (instantRms < this.noiseFloor) {
|
|
51
|
-
//
|
|
56
|
+
// Signal is quieter than noise floor, adapt downwards quickly
|
|
52
57
|
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
58
|
+
} else if (instantSnrDb < 12) {
|
|
59
|
+
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
60
|
+
// Adapt upwards at normal rate to track rising noise
|
|
61
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
53
62
|
} else {
|
|
54
|
-
//
|
|
55
|
-
//
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
59
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
60
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
61
|
-
|
|
62
|
-
let multiplier = 1.0;
|
|
63
|
-
if (this.isSpeaking) {
|
|
64
|
-
multiplier = 0.01;
|
|
65
|
-
} else if (snrDb > 20) {
|
|
66
|
-
multiplier = 0.1;
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
70
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
63
|
+
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
64
|
+
// Adapt VERY slowly to avoid "chasing" speech
|
|
65
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
66
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
71
67
|
}
|
|
72
68
|
|
|
73
69
|
// Ensure noise floor doesn't drop to absolute zero
|
|
74
70
|
// 0.00005 is approx -86dB, very quiet but prevents SNR explosion
|
|
75
71
|
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
76
72
|
|
|
77
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
73
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
78
74
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
79
75
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
80
76
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
VADStateMachine
|
|
3
|
-
} from "./chunk-GFLVGUTU.mjs";
|
|
3
|
+
} from "./chunk-DLLK6K76.mjs";
|
|
4
4
|
import {
|
|
5
5
|
getAudioContext,
|
|
6
6
|
registerPipeline,
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin
|
|
12
|
-
} from "./chunk-3I4OQD2L.mjs";
|
|
12
|
+
} from "./chunk-OXV7BHX5.mjs";
|
|
13
13
|
|
|
14
14
|
// src/pipeline/audio-pipeline.ts
|
|
15
15
|
import mitt from "mitt";
|
|
@@ -106,8 +106,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
106
106
|
const energyParams = vadConfig?.energyVad || {};
|
|
107
107
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
108
108
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
109
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
110
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
109
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
110
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
|
|
111
111
|
const minSNR = energyParams.minSNR ?? 10;
|
|
112
112
|
const snrRange = energyParams.snrRange ?? 10;
|
|
113
113
|
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
@@ -149,35 +149,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
149
149
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
150
150
|
|
|
151
151
|
// Adaptive noise floor estimation
|
|
152
|
-
// We use
|
|
152
|
+
// We use a TWO-PASS approach to avoid circular dependencies:
|
|
153
|
+
// FIRST PASS: Calculate instantaneous SNR to decide how to adapt
|
|
154
|
+
const instantSnr = instantRms / (this.noiseFloor + 1e-6);
|
|
155
|
+
const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
|
|
156
|
+
|
|
157
|
+
// Adapt the noise floor based on instantaneous SNR
|
|
153
158
|
if (instantRms < this.noiseFloor) {
|
|
154
|
-
//
|
|
159
|
+
// Signal is quieter than noise floor, adapt downwards quickly
|
|
155
160
|
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
161
|
+
} else if (instantSnrDb < 12) {
|
|
162
|
+
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
163
|
+
// Adapt upwards at normal rate to track rising noise
|
|
164
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
156
165
|
} else {
|
|
157
|
-
//
|
|
158
|
-
//
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
162
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
163
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
164
|
-
|
|
165
|
-
let multiplier = 1.0;
|
|
166
|
-
if (this.isSpeaking) {
|
|
167
|
-
multiplier = 0.01;
|
|
168
|
-
} else if (snrDb > 20) {
|
|
169
|
-
multiplier = 0.1;
|
|
170
|
-
}
|
|
171
|
-
|
|
172
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
173
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
166
|
+
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
167
|
+
// Adapt VERY slowly to avoid "chasing" speech
|
|
168
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
169
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
174
170
|
}
|
|
175
171
|
|
|
176
172
|
// Ensure noise floor doesn't drop to absolute zero
|
|
177
173
|
// 0.00005 is approx -86dB, very quiet but prevents SNR explosion
|
|
178
174
|
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
179
175
|
|
|
180
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
176
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
181
177
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
182
178
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
183
179
|
|
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
getVADPlugin,
|
|
4
4
|
registerNoiseSuppressionPlugin,
|
|
5
5
|
registerVADPlugin
|
|
6
|
-
} from "../chunk-3I4OQD2L.mjs";
|
|
6
|
+
} from "../chunk-OXV7BHX5.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-B36JBXOK.mjs";
|
|
8
|
+
import "../chunk-FKR6NWZF.mjs";
|
|
9
9
|
export {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin,
|
package/dist/index.js
CHANGED
|
@@ -158,8 +158,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
158
158
|
const energyParams = vadConfig?.energyVad || {};
|
|
159
159
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
160
160
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
161
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
162
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
161
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
162
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
|
|
163
163
|
const minSNR = energyParams.minSNR ?? 10;
|
|
164
164
|
const snrRange = energyParams.snrRange ?? 10;
|
|
165
165
|
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
@@ -201,35 +201,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
201
201
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
202
202
|
|
|
203
203
|
// Adaptive noise floor estimation
|
|
204
|
-
// We use
|
|
204
|
+
// We use a TWO-PASS approach to avoid circular dependencies:
|
|
205
|
+
// FIRST PASS: Calculate instantaneous SNR to decide how to adapt
|
|
206
|
+
const instantSnr = instantRms / (this.noiseFloor + 1e-6);
|
|
207
|
+
const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
|
|
208
|
+
|
|
209
|
+
// Adapt the noise floor based on instantaneous SNR
|
|
205
210
|
if (instantRms < this.noiseFloor) {
|
|
206
|
-
//
|
|
211
|
+
// Signal is quieter than noise floor, adapt downwards quickly
|
|
207
212
|
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
213
|
+
} else if (instantSnrDb < 12) {
|
|
214
|
+
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
215
|
+
// Adapt upwards at normal rate to track rising noise
|
|
216
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
208
217
|
} else {
|
|
209
|
-
//
|
|
210
|
-
//
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
214
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
215
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
216
|
-
|
|
217
|
-
let multiplier = 1.0;
|
|
218
|
-
if (this.isSpeaking) {
|
|
219
|
-
multiplier = 0.01;
|
|
220
|
-
} else if (snrDb > 20) {
|
|
221
|
-
multiplier = 0.1;
|
|
222
|
-
}
|
|
223
|
-
|
|
224
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
225
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
218
|
+
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
219
|
+
// Adapt VERY slowly to avoid "chasing" speech
|
|
220
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
221
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
226
222
|
}
|
|
227
223
|
|
|
228
224
|
// Ensure noise floor doesn't drop to absolute zero
|
|
229
225
|
// 0.00005 is approx -86dB, very quiet but prevents SNR explosion
|
|
230
226
|
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
231
227
|
|
|
232
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
228
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
233
229
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
234
230
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
235
231
|
|
|
@@ -378,7 +374,7 @@ var VADStateMachine = class {
|
|
|
378
374
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
379
375
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
380
376
|
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
|
|
381
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
377
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
|
|
382
378
|
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
383
379
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
384
380
|
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
@@ -412,25 +408,33 @@ var VADStateMachine = class {
|
|
|
412
408
|
newState = "silent";
|
|
413
409
|
this.lastSilenceTime = timestamp;
|
|
414
410
|
}
|
|
415
|
-
} else if (this.currentState === "speech_starting"
|
|
411
|
+
} else if (this.currentState === "speech_starting") {
|
|
412
|
+
if (probability >= stopThreshold) {
|
|
413
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
414
|
+
if (speechDuration >= minSpeechDurationMs) {
|
|
415
|
+
newState = "speaking";
|
|
416
|
+
} else {
|
|
417
|
+
newState = "speech_starting";
|
|
418
|
+
}
|
|
419
|
+
this.lastSpeechTime = timestamp;
|
|
420
|
+
} else {
|
|
421
|
+
newState = "silent";
|
|
422
|
+
this.lastSilenceTime = timestamp;
|
|
423
|
+
}
|
|
424
|
+
} else if (this.currentState === "speaking") {
|
|
416
425
|
if (probability >= stopThreshold) {
|
|
417
426
|
newState = "speaking";
|
|
418
427
|
this.lastSpeechTime = timestamp;
|
|
419
428
|
} else {
|
|
420
429
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
421
|
-
const speechDuration = timestamp - this.speechStartTime;
|
|
422
430
|
if (timeSinceSpeech < hangoverMs) {
|
|
423
431
|
newState = "speaking";
|
|
424
|
-
} else if (speechDuration < minSpeechDurationMs) {
|
|
425
|
-
newState = "silent";
|
|
426
|
-
this.lastSilenceTime = timestamp;
|
|
427
432
|
} else {
|
|
428
433
|
newState = "speech_ending";
|
|
429
434
|
this.lastSilenceTime = timestamp;
|
|
430
435
|
}
|
|
431
436
|
}
|
|
432
437
|
}
|
|
433
|
-
if (newState === "speech_starting") newState = "speaking";
|
|
434
438
|
if (newState === "speech_ending") newState = "silent";
|
|
435
439
|
this.currentState = newState;
|
|
436
440
|
return {
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import "./chunk-WBQAMGXK.mjs";
|
|
2
2
|
import {
|
|
3
3
|
attachProcessingToTrack
|
|
4
|
-
} from "./chunk-RLZVZ6D6.mjs";
|
|
4
|
+
} from "./chunk-K6X52R7N.mjs";
|
|
5
5
|
import {
|
|
6
6
|
createAudioPipeline
|
|
7
|
-
} from "./chunk-I5AR7XQD.mjs";
|
|
7
|
+
} from "./chunk-RD4GDIPO.mjs";
|
|
8
8
|
import {
|
|
9
9
|
VADStateMachine
|
|
10
|
-
} from "./chunk-GFLVGUTU.mjs";
|
|
10
|
+
} from "./chunk-DLLK6K76.mjs";
|
|
11
11
|
import {
|
|
12
12
|
closeAudioContext,
|
|
13
13
|
getAudioContext,
|
|
@@ -21,13 +21,13 @@ import {
|
|
|
21
21
|
getVADPlugin,
|
|
22
22
|
registerNoiseSuppressionPlugin,
|
|
23
23
|
registerVADPlugin
|
|
24
|
-
} from "./chunk-3I4OQD2L.mjs";
|
|
24
|
+
} from "./chunk-OXV7BHX5.mjs";
|
|
25
25
|
import {
|
|
26
26
|
RNNoisePlugin
|
|
27
27
|
} from "./chunk-XO6B3D4A.mjs";
|
|
28
28
|
import {
|
|
29
29
|
EnergyVADPlugin
|
|
30
|
-
} from "./chunk-B36JBXOK.mjs";
|
|
30
|
+
} from "./chunk-FKR6NWZF.mjs";
|
|
31
31
|
export {
|
|
32
32
|
EnergyVADPlugin,
|
|
33
33
|
RNNoisePlugin,
|
|
@@ -127,8 +127,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
127
127
|
const energyParams = vadConfig?.energyVad || {};
|
|
128
128
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
129
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
131
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
130
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
131
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
|
|
132
132
|
const minSNR = energyParams.minSNR ?? 10;
|
|
133
133
|
const snrRange = energyParams.snrRange ?? 10;
|
|
134
134
|
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
@@ -170,35 +170,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
170
170
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
171
171
|
|
|
172
172
|
// Adaptive noise floor estimation
|
|
173
|
-
// We use
|
|
173
|
+
// We use a TWO-PASS approach to avoid circular dependencies:
|
|
174
|
+
// FIRST PASS: Calculate instantaneous SNR to decide how to adapt
|
|
175
|
+
const instantSnr = instantRms / (this.noiseFloor + 1e-6);
|
|
176
|
+
const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
|
|
177
|
+
|
|
178
|
+
// Adapt the noise floor based on instantaneous SNR
|
|
174
179
|
if (instantRms < this.noiseFloor) {
|
|
175
|
-
//
|
|
180
|
+
// Signal is quieter than noise floor, adapt downwards quickly
|
|
176
181
|
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
182
|
+
} else if (instantSnrDb < 12) {
|
|
183
|
+
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
184
|
+
// Adapt upwards at normal rate to track rising noise
|
|
185
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
177
186
|
} else {
|
|
178
|
-
//
|
|
179
|
-
//
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
183
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
184
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
185
|
-
|
|
186
|
-
let multiplier = 1.0;
|
|
187
|
-
if (this.isSpeaking) {
|
|
188
|
-
multiplier = 0.01;
|
|
189
|
-
} else if (snrDb > 20) {
|
|
190
|
-
multiplier = 0.1;
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
194
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
187
|
+
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
188
|
+
// Adapt VERY slowly to avoid "chasing" speech
|
|
189
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
190
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
195
191
|
}
|
|
196
192
|
|
|
197
193
|
// Ensure noise floor doesn't drop to absolute zero
|
|
198
194
|
// 0.00005 is approx -86dB, very quiet but prevents SNR explosion
|
|
199
195
|
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
200
196
|
|
|
201
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
197
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
202
198
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
203
199
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
204
200
|
|
|
@@ -341,7 +337,7 @@ var VADStateMachine = class {
|
|
|
341
337
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
342
338
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
343
339
|
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
|
|
344
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
340
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
|
|
345
341
|
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
346
342
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
347
343
|
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
@@ -375,25 +371,33 @@ var VADStateMachine = class {
|
|
|
375
371
|
newState = "silent";
|
|
376
372
|
this.lastSilenceTime = timestamp;
|
|
377
373
|
}
|
|
378
|
-
} else if (this.currentState === "speech_starting"
|
|
374
|
+
} else if (this.currentState === "speech_starting") {
|
|
375
|
+
if (probability >= stopThreshold) {
|
|
376
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
377
|
+
if (speechDuration >= minSpeechDurationMs) {
|
|
378
|
+
newState = "speaking";
|
|
379
|
+
} else {
|
|
380
|
+
newState = "speech_starting";
|
|
381
|
+
}
|
|
382
|
+
this.lastSpeechTime = timestamp;
|
|
383
|
+
} else {
|
|
384
|
+
newState = "silent";
|
|
385
|
+
this.lastSilenceTime = timestamp;
|
|
386
|
+
}
|
|
387
|
+
} else if (this.currentState === "speaking") {
|
|
379
388
|
if (probability >= stopThreshold) {
|
|
380
389
|
newState = "speaking";
|
|
381
390
|
this.lastSpeechTime = timestamp;
|
|
382
391
|
} else {
|
|
383
392
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
384
|
-
const speechDuration = timestamp - this.speechStartTime;
|
|
385
393
|
if (timeSinceSpeech < hangoverMs) {
|
|
386
394
|
newState = "speaking";
|
|
387
|
-
} else if (speechDuration < minSpeechDurationMs) {
|
|
388
|
-
newState = "silent";
|
|
389
|
-
this.lastSilenceTime = timestamp;
|
|
390
395
|
} else {
|
|
391
396
|
newState = "speech_ending";
|
|
392
397
|
this.lastSilenceTime = timestamp;
|
|
393
398
|
}
|
|
394
399
|
}
|
|
395
400
|
}
|
|
396
|
-
if (newState === "speech_starting") newState = "speaking";
|
|
397
401
|
if (newState === "speech_ending") newState = "silent";
|
|
398
402
|
this.currentState = newState;
|
|
399
403
|
return {
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import {
|
|
2
2
|
attachProcessingToTrack
|
|
3
|
-
} from "../chunk-RLZVZ6D6.mjs";
|
|
4
|
-
import "../chunk-I5AR7XQD.mjs";
|
|
5
|
-
import "../chunk-GFLVGUTU.mjs";
|
|
3
|
+
} from "../chunk-K6X52R7N.mjs";
|
|
4
|
+
import "../chunk-RD4GDIPO.mjs";
|
|
5
|
+
import "../chunk-DLLK6K76.mjs";
|
|
6
6
|
import "../chunk-OZ7KMC4S.mjs";
|
|
7
|
-
import "../chunk-3I4OQD2L.mjs";
|
|
7
|
+
import "../chunk-OXV7BHX5.mjs";
|
|
8
8
|
import "../chunk-XO6B3D4A.mjs";
|
|
9
|
-
import "../chunk-B36JBXOK.mjs";
|
|
9
|
+
import "../chunk-FKR6NWZF.mjs";
|
|
10
10
|
export {
|
|
11
11
|
attachProcessingToTrack
|
|
12
12
|
};
|
|
@@ -125,8 +125,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
125
125
|
const energyParams = vadConfig?.energyVad || {};
|
|
126
126
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
127
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
129
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
128
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
129
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
|
|
130
130
|
const minSNR = energyParams.minSNR ?? 10;
|
|
131
131
|
const snrRange = energyParams.snrRange ?? 10;
|
|
132
132
|
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
@@ -168,35 +168,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
168
168
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
169
169
|
|
|
170
170
|
// Adaptive noise floor estimation
|
|
171
|
-
// We use
|
|
171
|
+
// We use a TWO-PASS approach to avoid circular dependencies:
|
|
172
|
+
// FIRST PASS: Calculate instantaneous SNR to decide how to adapt
|
|
173
|
+
const instantSnr = instantRms / (this.noiseFloor + 1e-6);
|
|
174
|
+
const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
|
|
175
|
+
|
|
176
|
+
// Adapt the noise floor based on instantaneous SNR
|
|
172
177
|
if (instantRms < this.noiseFloor) {
|
|
173
|
-
//
|
|
178
|
+
// Signal is quieter than noise floor, adapt downwards quickly
|
|
174
179
|
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
180
|
+
} else if (instantSnrDb < 12) {
|
|
181
|
+
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
182
|
+
// Adapt upwards at normal rate to track rising noise
|
|
183
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
175
184
|
} else {
|
|
176
|
-
//
|
|
177
|
-
//
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
181
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
182
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
183
|
-
|
|
184
|
-
let multiplier = 1.0;
|
|
185
|
-
if (this.isSpeaking) {
|
|
186
|
-
multiplier = 0.01;
|
|
187
|
-
} else if (snrDb > 20) {
|
|
188
|
-
multiplier = 0.1;
|
|
189
|
-
}
|
|
190
|
-
|
|
191
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
192
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
185
|
+
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
186
|
+
// Adapt VERY slowly to avoid "chasing" speech
|
|
187
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
188
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
193
189
|
}
|
|
194
190
|
|
|
195
191
|
// Ensure noise floor doesn't drop to absolute zero
|
|
196
192
|
// 0.00005 is approx -86dB, very quiet but prevents SNR explosion
|
|
197
193
|
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
198
194
|
|
|
199
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
195
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
200
196
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
201
197
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
202
198
|
|
|
@@ -339,7 +335,7 @@ var VADStateMachine = class {
|
|
|
339
335
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
340
336
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
341
337
|
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
|
|
342
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
338
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
|
|
343
339
|
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
344
340
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
345
341
|
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
@@ -373,25 +369,33 @@ var VADStateMachine = class {
|
|
|
373
369
|
newState = "silent";
|
|
374
370
|
this.lastSilenceTime = timestamp;
|
|
375
371
|
}
|
|
376
|
-
} else if (this.currentState === "speech_starting"
|
|
372
|
+
} else if (this.currentState === "speech_starting") {
|
|
373
|
+
if (probability >= stopThreshold) {
|
|
374
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
375
|
+
if (speechDuration >= minSpeechDurationMs) {
|
|
376
|
+
newState = "speaking";
|
|
377
|
+
} else {
|
|
378
|
+
newState = "speech_starting";
|
|
379
|
+
}
|
|
380
|
+
this.lastSpeechTime = timestamp;
|
|
381
|
+
} else {
|
|
382
|
+
newState = "silent";
|
|
383
|
+
this.lastSilenceTime = timestamp;
|
|
384
|
+
}
|
|
385
|
+
} else if (this.currentState === "speaking") {
|
|
377
386
|
if (probability >= stopThreshold) {
|
|
378
387
|
newState = "speaking";
|
|
379
388
|
this.lastSpeechTime = timestamp;
|
|
380
389
|
} else {
|
|
381
390
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
382
|
-
const speechDuration = timestamp - this.speechStartTime;
|
|
383
391
|
if (timeSinceSpeech < hangoverMs) {
|
|
384
392
|
newState = "speaking";
|
|
385
|
-
} else if (speechDuration < minSpeechDurationMs) {
|
|
386
|
-
newState = "silent";
|
|
387
|
-
this.lastSilenceTime = timestamp;
|
|
388
393
|
} else {
|
|
389
394
|
newState = "speech_ending";
|
|
390
395
|
this.lastSilenceTime = timestamp;
|
|
391
396
|
}
|
|
392
397
|
}
|
|
393
398
|
}
|
|
394
|
-
if (newState === "speech_starting") newState = "speaking";
|
|
395
399
|
if (newState === "speech_ending") newState = "silent";
|
|
396
400
|
this.currentState = newState;
|
|
397
401
|
return {
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-I5AR7XQD.mjs";
|
|
4
|
-
import "../chunk-GFLVGUTU.mjs";
|
|
3
|
+
} from "../chunk-RD4GDIPO.mjs";
|
|
4
|
+
import "../chunk-DLLK6K76.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-3I4OQD2L.mjs";
|
|
6
|
+
import "../chunk-OXV7BHX5.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-B36JBXOK.mjs";
|
|
8
|
+
import "../chunk-FKR6NWZF.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -97,12 +97,13 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.01
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
*
|
|
105
|
+
* Applied when instantaneous SNR < 12dB (background noise).
|
|
106
|
+
* Default: 0.1 (fast tracking of rising noise)
|
|
106
107
|
*/
|
|
107
108
|
noiseFloorAdaptRateLoud?: number;
|
|
108
109
|
/**
|
package/dist/types.d.ts
CHANGED
|
@@ -97,12 +97,13 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.01
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
*
|
|
105
|
+
* Applied when instantaneous SNR < 12dB (background noise).
|
|
106
|
+
* Default: 0.1 (fast tracking of rising noise)
|
|
106
107
|
*/
|
|
107
108
|
noiseFloorAdaptRateLoud?: number;
|
|
108
109
|
/**
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -27,8 +27,8 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
27
27
|
const energyParams = vadConfig?.energyVad || {};
|
|
28
28
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
29
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
31
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
30
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
31
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.1;
|
|
32
32
|
const minSNR = energyParams.minSNR ?? 10;
|
|
33
33
|
const snrRange = energyParams.snrRange ?? 10;
|
|
34
34
|
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
@@ -70,35 +70,31 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
70
70
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
71
71
|
|
|
72
72
|
// Adaptive noise floor estimation
|
|
73
|
-
// We use
|
|
73
|
+
// We use a TWO-PASS approach to avoid circular dependencies:
|
|
74
|
+
// FIRST PASS: Calculate instantaneous SNR to decide how to adapt
|
|
75
|
+
const instantSnr = instantRms / (this.noiseFloor + 1e-6);
|
|
76
|
+
const instantSnrDb = 20 * Math.log10(Math.max(1e-6, instantSnr));
|
|
77
|
+
|
|
78
|
+
// Adapt the noise floor based on instantaneous SNR
|
|
74
79
|
if (instantRms < this.noiseFloor) {
|
|
75
|
-
//
|
|
80
|
+
// Signal is quieter than noise floor, adapt downwards quickly
|
|
76
81
|
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
82
|
+
} else if (instantSnrDb < 12) {
|
|
83
|
+
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
84
|
+
// Adapt upwards at normal rate to track rising noise
|
|
85
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
77
86
|
} else {
|
|
78
|
-
//
|
|
79
|
-
//
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
83
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
84
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
85
|
-
|
|
86
|
-
let multiplier = 1.0;
|
|
87
|
-
if (this.isSpeaking) {
|
|
88
|
-
multiplier = 0.01;
|
|
89
|
-
} else if (snrDb > 20) {
|
|
90
|
-
multiplier = 0.1;
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
94
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
87
|
+
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
88
|
+
// Adapt VERY slowly to avoid "chasing" speech
|
|
89
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
90
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
95
91
|
}
|
|
96
92
|
|
|
97
93
|
// Ensure noise floor doesn't drop to absolute zero
|
|
98
94
|
// 0.00005 is approx -86dB, very quiet but prevents SNR explosion
|
|
99
95
|
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
100
96
|
|
|
101
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
97
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
102
98
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
103
99
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
104
100
|
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.js
CHANGED
|
@@ -51,7 +51,7 @@ var VADStateMachine = class {
|
|
|
51
51
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
52
52
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
53
53
|
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 5e-3,
|
|
54
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
54
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.1,
|
|
55
55
|
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
56
56
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
57
57
|
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
@@ -85,25 +85,33 @@ var VADStateMachine = class {
|
|
|
85
85
|
newState = "silent";
|
|
86
86
|
this.lastSilenceTime = timestamp;
|
|
87
87
|
}
|
|
88
|
-
} else if (this.currentState === "speech_starting"
|
|
88
|
+
} else if (this.currentState === "speech_starting") {
|
|
89
|
+
if (probability >= stopThreshold) {
|
|
90
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
91
|
+
if (speechDuration >= minSpeechDurationMs) {
|
|
92
|
+
newState = "speaking";
|
|
93
|
+
} else {
|
|
94
|
+
newState = "speech_starting";
|
|
95
|
+
}
|
|
96
|
+
this.lastSpeechTime = timestamp;
|
|
97
|
+
} else {
|
|
98
|
+
newState = "silent";
|
|
99
|
+
this.lastSilenceTime = timestamp;
|
|
100
|
+
}
|
|
101
|
+
} else if (this.currentState === "speaking") {
|
|
89
102
|
if (probability >= stopThreshold) {
|
|
90
103
|
newState = "speaking";
|
|
91
104
|
this.lastSpeechTime = timestamp;
|
|
92
105
|
} else {
|
|
93
106
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
94
|
-
const speechDuration = timestamp - this.speechStartTime;
|
|
95
107
|
if (timeSinceSpeech < hangoverMs) {
|
|
96
108
|
newState = "speaking";
|
|
97
|
-
} else if (speechDuration < minSpeechDurationMs) {
|
|
98
|
-
newState = "silent";
|
|
99
|
-
this.lastSilenceTime = timestamp;
|
|
100
109
|
} else {
|
|
101
110
|
newState = "speech_ending";
|
|
102
111
|
this.lastSilenceTime = timestamp;
|
|
103
112
|
}
|
|
104
113
|
}
|
|
105
114
|
}
|
|
106
|
-
if (newState === "speech_starting") newState = "speaking";
|
|
107
115
|
if (newState === "speech_ending") newState = "silent";
|
|
108
116
|
this.currentState = newState;
|
|
109
117
|
return {
|
package/dist/vad/vad-state.mjs
CHANGED