@tensamin/audio 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -8
- package/dist/{chunk-N553RHTI.mjs → chunk-2EX3FXSF.mjs} +5 -4
- package/dist/{chunk-VEJXAEMM.mjs → chunk-2TKYGFMC.mjs} +38 -24
- package/dist/{chunk-H5UKZU2Y.mjs → chunk-BMVZ3KKG.mjs} +1 -1
- package/dist/{chunk-XXTNAUYX.mjs → chunk-JP6DA62Y.mjs} +2 -2
- package/dist/{chunk-GVKCBKW6.mjs → chunk-UQG6Z5W3.mjs} +1 -1
- package/dist/extensibility/plugins.js +38 -24
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +43 -28
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +43 -28
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +43 -28
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +12 -6
- package/dist/types.d.ts +12 -6
- package/dist/vad/vad-node.js +38 -24
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.js +5 -4
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -101,18 +101,19 @@ vad: {
|
|
|
101
101
|
energyVad?: {
|
|
102
102
|
smoothing: number; // Default: 0.95
|
|
103
103
|
initialNoiseFloor: number; // Default: 0.001
|
|
104
|
-
noiseFloorAdaptRateQuiet: number; // Default: 0.
|
|
105
|
-
noiseFloorAdaptRateLoud: number; // Default: 0.
|
|
106
|
-
minSNR: number; // Default:
|
|
107
|
-
snrRange: number; // Default:
|
|
104
|
+
noiseFloorAdaptRateQuiet: number; // Default: 0.05
|
|
105
|
+
noiseFloorAdaptRateLoud: number; // Default: 0.005
|
|
106
|
+
minSNR: number; // Default: 6.0 (dB)
|
|
107
|
+
snrRange: number; // Default: 12.0 (dB)
|
|
108
|
+
minEnergy: number; // Default: 0.0005
|
|
108
109
|
};
|
|
109
110
|
}
|
|
110
111
|
```
|
|
111
112
|
|
|
112
113
|
**Threshold Parameters:**
|
|
113
114
|
|
|
114
|
-
- `startThreshold`: Probability threshold to unmute audio
|
|
115
|
-
- `stopThreshold`: Probability threshold to mute audio (
|
|
115
|
+
- `startThreshold`: Probability threshold to unmute audio (Default: 0.6, ~13.2dB SNR)
|
|
116
|
+
- `stopThreshold`: Probability threshold to mute audio (Default: 0.45, ~11.4dB SNR)
|
|
116
117
|
- `hangoverMs`: Delay before muting after speech stops
|
|
117
118
|
- `preRollMs`: Audio buffer duration before speech onset
|
|
118
119
|
- `minSpeechDurationMs`: Minimum duration to consider as valid speech
|
|
@@ -121,8 +122,9 @@ vad: {
|
|
|
121
122
|
**Energy VAD Parameters:**
|
|
122
123
|
|
|
123
124
|
- `smoothing`: Energy calculation smoothing factor (0-1)
|
|
124
|
-
- `minSNR`: Minimum signal-to-noise ratio for speech detection
|
|
125
|
-
- `snrRange`: Range for probability scaling from minSNR
|
|
125
|
+
- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
|
|
126
|
+
- `snrRange`: Range in dB for probability scaling from minSNR
|
|
127
|
+
- `minEnergy`: Minimum absolute RMS energy to consider as speech
|
|
126
128
|
|
|
127
129
|
### Output Control
|
|
128
130
|
|
|
@@ -25,10 +25,11 @@ var VADStateMachine = class {
|
|
|
25
25
|
energyVad: {
|
|
26
26
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
27
27
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
28
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
29
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
30
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
31
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
28
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
29
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 5e-3,
|
|
30
|
+
minSNR: config?.energyVad?.minSNR ?? 6,
|
|
31
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
32
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
32
33
|
}
|
|
33
34
|
};
|
|
34
35
|
this.lastSilenceTime = Date.now();
|
|
@@ -3,10 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
3
3
|
const energyParams = vadConfig?.energyVad || {};
|
|
4
4
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
5
5
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
6
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
7
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
8
|
-
const minSNR = energyParams.minSNR ??
|
|
9
|
-
const snrRange = energyParams.snrRange ??
|
|
6
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
7
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 5e-3;
|
|
8
|
+
const minSNR = energyParams.minSNR ?? 6;
|
|
9
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
10
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
10
11
|
return `
|
|
11
12
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
12
13
|
constructor() {
|
|
@@ -18,6 +19,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
18
19
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
19
20
|
this.minSNR = ${minSNR};
|
|
20
21
|
this.snrRange = ${snrRange};
|
|
22
|
+
this.minEnergy = ${minEnergy};
|
|
21
23
|
this.isSpeaking = false;
|
|
22
24
|
|
|
23
25
|
this.port.onmessage = (event) => {
|
|
@@ -32,36 +34,48 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
32
34
|
if (!input || !input.length) return true;
|
|
33
35
|
const channel = input[0];
|
|
34
36
|
|
|
35
|
-
// Calculate RMS (Root Mean Square) energy
|
|
37
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
36
38
|
let sum = 0;
|
|
37
39
|
for (let i = 0; i < channel.length; i++) {
|
|
38
40
|
sum += channel[i] * channel[i];
|
|
39
41
|
}
|
|
40
|
-
const
|
|
42
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
41
43
|
|
|
42
|
-
//
|
|
43
|
-
//
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
44
|
+
// Smooth the RMS energy to reduce jitter
|
|
45
|
+
// this.energy acts as the smoothed RMS value
|
|
46
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
47
|
+
|
|
48
|
+
// Adaptive noise floor estimation
|
|
49
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
50
|
+
if (instantRms < this.noiseFloor) {
|
|
51
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
52
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
53
|
+
} else {
|
|
54
|
+
// If signal is louder, adapt upwards
|
|
55
|
+
// If we are currently speaking, adapt EXTREMELY slowly to avoid "chasing" speech
|
|
56
|
+
// If we are silent, adapt at the normal loud rate
|
|
57
|
+
const adaptRate = this.isSpeaking ? (this.noiseFloorAdaptRateLoud * 0.02) : this.noiseFloorAdaptRateLoud;
|
|
58
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
52
59
|
}
|
|
53
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
54
60
|
|
|
55
|
-
//
|
|
56
|
-
|
|
61
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
62
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
63
|
+
|
|
64
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
65
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
66
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
57
67
|
|
|
58
|
-
// Map SNR to probability (0-1)
|
|
59
|
-
// Probability is 0 when
|
|
68
|
+
// Map SNR dB to probability (0-1)
|
|
69
|
+
// Probability is 0 when snrDb <= minSNR
|
|
60
70
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
61
|
-
|
|
62
|
-
|
|
71
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
72
|
+
|
|
73
|
+
// Apply absolute energy threshold
|
|
74
|
+
if (this.energy < this.minEnergy) {
|
|
75
|
+
probability = 0;
|
|
76
|
+
}
|
|
63
77
|
|
|
64
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
78
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
65
79
|
|
|
66
80
|
return true;
|
|
67
81
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
VADStateMachine
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-2EX3FXSF.mjs";
|
|
4
4
|
import {
|
|
5
5
|
getAudioContext,
|
|
6
6
|
registerPipeline,
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-BMVZ3KKG.mjs";
|
|
13
13
|
|
|
14
14
|
// src/pipeline/audio-pipeline.ts
|
|
15
15
|
import mitt from "mitt";
|
|
@@ -106,10 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
106
106
|
const energyParams = vadConfig?.energyVad || {};
|
|
107
107
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
108
108
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
109
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
110
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
111
|
-
const minSNR = energyParams.minSNR ??
|
|
112
|
-
const snrRange = energyParams.snrRange ??
|
|
109
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
110
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 5e-3;
|
|
111
|
+
const minSNR = energyParams.minSNR ?? 6;
|
|
112
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
113
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
113
114
|
return `
|
|
114
115
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
115
116
|
constructor() {
|
|
@@ -121,6 +122,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
121
122
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
122
123
|
this.minSNR = ${minSNR};
|
|
123
124
|
this.snrRange = ${snrRange};
|
|
125
|
+
this.minEnergy = ${minEnergy};
|
|
124
126
|
this.isSpeaking = false;
|
|
125
127
|
|
|
126
128
|
this.port.onmessage = (event) => {
|
|
@@ -135,36 +137,48 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
135
137
|
if (!input || !input.length) return true;
|
|
136
138
|
const channel = input[0];
|
|
137
139
|
|
|
138
|
-
// Calculate RMS (Root Mean Square) energy
|
|
140
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
139
141
|
let sum = 0;
|
|
140
142
|
for (let i = 0; i < channel.length; i++) {
|
|
141
143
|
sum += channel[i] * channel[i];
|
|
142
144
|
}
|
|
143
|
-
const
|
|
145
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
144
146
|
|
|
145
|
-
//
|
|
146
|
-
//
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
147
|
+
// Smooth the RMS energy to reduce jitter
|
|
148
|
+
// this.energy acts as the smoothed RMS value
|
|
149
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
150
|
+
|
|
151
|
+
// Adaptive noise floor estimation
|
|
152
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
153
|
+
if (instantRms < this.noiseFloor) {
|
|
154
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
155
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
156
|
+
} else {
|
|
157
|
+
// If signal is louder, adapt upwards
|
|
158
|
+
// If we are currently speaking, adapt EXTREMELY slowly to avoid "chasing" speech
|
|
159
|
+
// If we are silent, adapt at the normal loud rate
|
|
160
|
+
const adaptRate = this.isSpeaking ? (this.noiseFloorAdaptRateLoud * 0.02) : this.noiseFloorAdaptRateLoud;
|
|
161
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
155
162
|
}
|
|
156
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
157
163
|
|
|
158
|
-
//
|
|
159
|
-
|
|
164
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
165
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
166
|
+
|
|
167
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
168
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
169
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
160
170
|
|
|
161
|
-
// Map SNR to probability (0-1)
|
|
162
|
-
// Probability is 0 when
|
|
171
|
+
// Map SNR dB to probability (0-1)
|
|
172
|
+
// Probability is 0 when snrDb <= minSNR
|
|
163
173
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
164
|
-
|
|
165
|
-
|
|
174
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
175
|
+
|
|
176
|
+
// Apply absolute energy threshold
|
|
177
|
+
if (this.energy < this.minEnergy) {
|
|
178
|
+
probability = 0;
|
|
179
|
+
}
|
|
166
180
|
|
|
167
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
181
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
168
182
|
|
|
169
183
|
return true;
|
|
170
184
|
}
|
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
getVADPlugin,
|
|
4
4
|
registerNoiseSuppressionPlugin,
|
|
5
5
|
registerVADPlugin
|
|
6
|
-
} from "../chunk-
|
|
6
|
+
} from "../chunk-BMVZ3KKG.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-2TKYGFMC.mjs";
|
|
9
9
|
export {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin,
|
package/dist/index.js
CHANGED
|
@@ -158,10 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
158
158
|
const energyParams = vadConfig?.energyVad || {};
|
|
159
159
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
160
160
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
161
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
162
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
163
|
-
const minSNR = energyParams.minSNR ??
|
|
164
|
-
const snrRange = energyParams.snrRange ??
|
|
161
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
162
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 5e-3;
|
|
163
|
+
const minSNR = energyParams.minSNR ?? 6;
|
|
164
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
165
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
165
166
|
return `
|
|
166
167
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
167
168
|
constructor() {
|
|
@@ -173,6 +174,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
173
174
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
174
175
|
this.minSNR = ${minSNR};
|
|
175
176
|
this.snrRange = ${snrRange};
|
|
177
|
+
this.minEnergy = ${minEnergy};
|
|
176
178
|
this.isSpeaking = false;
|
|
177
179
|
|
|
178
180
|
this.port.onmessage = (event) => {
|
|
@@ -187,36 +189,48 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
187
189
|
if (!input || !input.length) return true;
|
|
188
190
|
const channel = input[0];
|
|
189
191
|
|
|
190
|
-
// Calculate RMS (Root Mean Square) energy
|
|
192
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
191
193
|
let sum = 0;
|
|
192
194
|
for (let i = 0; i < channel.length; i++) {
|
|
193
195
|
sum += channel[i] * channel[i];
|
|
194
196
|
}
|
|
195
|
-
const
|
|
197
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
196
198
|
|
|
197
|
-
//
|
|
198
|
-
//
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
199
|
+
// Smooth the RMS energy to reduce jitter
|
|
200
|
+
// this.energy acts as the smoothed RMS value
|
|
201
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
202
|
+
|
|
203
|
+
// Adaptive noise floor estimation
|
|
204
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
205
|
+
if (instantRms < this.noiseFloor) {
|
|
206
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
207
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
208
|
+
} else {
|
|
209
|
+
// If signal is louder, adapt upwards
|
|
210
|
+
// If we are currently speaking, adapt EXTREMELY slowly to avoid "chasing" speech
|
|
211
|
+
// If we are silent, adapt at the normal loud rate
|
|
212
|
+
const adaptRate = this.isSpeaking ? (this.noiseFloorAdaptRateLoud * 0.02) : this.noiseFloorAdaptRateLoud;
|
|
213
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
207
214
|
}
|
|
208
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
209
215
|
|
|
210
|
-
//
|
|
211
|
-
|
|
216
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
217
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
218
|
+
|
|
219
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
220
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
221
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
212
222
|
|
|
213
|
-
// Map SNR to probability (0-1)
|
|
214
|
-
// Probability is 0 when
|
|
223
|
+
// Map SNR dB to probability (0-1)
|
|
224
|
+
// Probability is 0 when snrDb <= minSNR
|
|
215
225
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
216
|
-
|
|
217
|
-
|
|
226
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
227
|
+
|
|
228
|
+
// Apply absolute energy threshold
|
|
229
|
+
if (this.energy < this.minEnergy) {
|
|
230
|
+
probability = 0;
|
|
231
|
+
}
|
|
218
232
|
|
|
219
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
233
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
220
234
|
|
|
221
235
|
return true;
|
|
222
236
|
}
|
|
@@ -347,10 +361,11 @@ var VADStateMachine = class {
|
|
|
347
361
|
energyVad: {
|
|
348
362
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
349
363
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
350
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
351
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
352
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
353
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
364
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
365
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 5e-3,
|
|
366
|
+
minSNR: config?.energyVad?.minSNR ?? 6,
|
|
367
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
368
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
354
369
|
}
|
|
355
370
|
};
|
|
356
371
|
this.lastSilenceTime = Date.now();
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import "./chunk-WBQAMGXK.mjs";
|
|
2
2
|
import {
|
|
3
3
|
attachProcessingToTrack
|
|
4
|
-
} from "./chunk-
|
|
4
|
+
} from "./chunk-UQG6Z5W3.mjs";
|
|
5
5
|
import {
|
|
6
6
|
createAudioPipeline
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-JP6DA62Y.mjs";
|
|
8
8
|
import {
|
|
9
9
|
VADStateMachine
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-2EX3FXSF.mjs";
|
|
11
11
|
import {
|
|
12
12
|
closeAudioContext,
|
|
13
13
|
getAudioContext,
|
|
@@ -21,13 +21,13 @@ import {
|
|
|
21
21
|
getVADPlugin,
|
|
22
22
|
registerNoiseSuppressionPlugin,
|
|
23
23
|
registerVADPlugin
|
|
24
|
-
} from "./chunk-
|
|
24
|
+
} from "./chunk-BMVZ3KKG.mjs";
|
|
25
25
|
import {
|
|
26
26
|
RNNoisePlugin
|
|
27
27
|
} from "./chunk-XO6B3D4A.mjs";
|
|
28
28
|
import {
|
|
29
29
|
EnergyVADPlugin
|
|
30
|
-
} from "./chunk-
|
|
30
|
+
} from "./chunk-2TKYGFMC.mjs";
|
|
31
31
|
export {
|
|
32
32
|
EnergyVADPlugin,
|
|
33
33
|
RNNoisePlugin,
|
|
@@ -127,10 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
127
127
|
const energyParams = vadConfig?.energyVad || {};
|
|
128
128
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
129
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
131
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
132
|
-
const minSNR = energyParams.minSNR ??
|
|
133
|
-
const snrRange = energyParams.snrRange ??
|
|
130
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
131
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 5e-3;
|
|
132
|
+
const minSNR = energyParams.minSNR ?? 6;
|
|
133
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
134
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
134
135
|
return `
|
|
135
136
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
136
137
|
constructor() {
|
|
@@ -142,6 +143,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
142
143
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
143
144
|
this.minSNR = ${minSNR};
|
|
144
145
|
this.snrRange = ${snrRange};
|
|
146
|
+
this.minEnergy = ${minEnergy};
|
|
145
147
|
this.isSpeaking = false;
|
|
146
148
|
|
|
147
149
|
this.port.onmessage = (event) => {
|
|
@@ -156,36 +158,48 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
156
158
|
if (!input || !input.length) return true;
|
|
157
159
|
const channel = input[0];
|
|
158
160
|
|
|
159
|
-
// Calculate RMS (Root Mean Square) energy
|
|
161
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
160
162
|
let sum = 0;
|
|
161
163
|
for (let i = 0; i < channel.length; i++) {
|
|
162
164
|
sum += channel[i] * channel[i];
|
|
163
165
|
}
|
|
164
|
-
const
|
|
166
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
165
167
|
|
|
166
|
-
//
|
|
167
|
-
//
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
168
|
+
// Smooth the RMS energy to reduce jitter
|
|
169
|
+
// this.energy acts as the smoothed RMS value
|
|
170
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
171
|
+
|
|
172
|
+
// Adaptive noise floor estimation
|
|
173
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
174
|
+
if (instantRms < this.noiseFloor) {
|
|
175
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
176
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
177
|
+
} else {
|
|
178
|
+
// If signal is louder, adapt upwards
|
|
179
|
+
// If we are currently speaking, adapt EXTREMELY slowly to avoid "chasing" speech
|
|
180
|
+
// If we are silent, adapt at the normal loud rate
|
|
181
|
+
const adaptRate = this.isSpeaking ? (this.noiseFloorAdaptRateLoud * 0.02) : this.noiseFloorAdaptRateLoud;
|
|
182
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
176
183
|
}
|
|
177
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
178
184
|
|
|
179
|
-
//
|
|
180
|
-
|
|
185
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
186
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
187
|
+
|
|
188
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
189
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
190
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
181
191
|
|
|
182
|
-
// Map SNR to probability (0-1)
|
|
183
|
-
// Probability is 0 when
|
|
192
|
+
// Map SNR dB to probability (0-1)
|
|
193
|
+
// Probability is 0 when snrDb <= minSNR
|
|
184
194
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
185
|
-
|
|
186
|
-
|
|
195
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
196
|
+
|
|
197
|
+
// Apply absolute energy threshold
|
|
198
|
+
if (this.energy < this.minEnergy) {
|
|
199
|
+
probability = 0;
|
|
200
|
+
}
|
|
187
201
|
|
|
188
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
202
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
189
203
|
|
|
190
204
|
return true;
|
|
191
205
|
}
|
|
@@ -310,10 +324,11 @@ var VADStateMachine = class {
|
|
|
310
324
|
energyVad: {
|
|
311
325
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
312
326
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
313
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
314
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
315
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
316
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
327
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
328
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 5e-3,
|
|
329
|
+
minSNR: config?.energyVad?.minSNR ?? 6,
|
|
330
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
331
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
317
332
|
}
|
|
318
333
|
};
|
|
319
334
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import {
|
|
2
2
|
attachProcessingToTrack
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
5
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-UQG6Z5W3.mjs";
|
|
4
|
+
import "../chunk-JP6DA62Y.mjs";
|
|
5
|
+
import "../chunk-2EX3FXSF.mjs";
|
|
6
6
|
import "../chunk-OZ7KMC4S.mjs";
|
|
7
|
-
import "../chunk-
|
|
7
|
+
import "../chunk-BMVZ3KKG.mjs";
|
|
8
8
|
import "../chunk-XO6B3D4A.mjs";
|
|
9
|
-
import "../chunk-
|
|
9
|
+
import "../chunk-2TKYGFMC.mjs";
|
|
10
10
|
export {
|
|
11
11
|
attachProcessingToTrack
|
|
12
12
|
};
|
|
@@ -125,10 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
125
125
|
const energyParams = vadConfig?.energyVad || {};
|
|
126
126
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
127
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
129
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
130
|
-
const minSNR = energyParams.minSNR ??
|
|
131
|
-
const snrRange = energyParams.snrRange ??
|
|
128
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
129
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 5e-3;
|
|
130
|
+
const minSNR = energyParams.minSNR ?? 6;
|
|
131
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
132
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
132
133
|
return `
|
|
133
134
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
134
135
|
constructor() {
|
|
@@ -140,6 +141,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
140
141
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
141
142
|
this.minSNR = ${minSNR};
|
|
142
143
|
this.snrRange = ${snrRange};
|
|
144
|
+
this.minEnergy = ${minEnergy};
|
|
143
145
|
this.isSpeaking = false;
|
|
144
146
|
|
|
145
147
|
this.port.onmessage = (event) => {
|
|
@@ -154,36 +156,48 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
154
156
|
if (!input || !input.length) return true;
|
|
155
157
|
const channel = input[0];
|
|
156
158
|
|
|
157
|
-
// Calculate RMS (Root Mean Square) energy
|
|
159
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
158
160
|
let sum = 0;
|
|
159
161
|
for (let i = 0; i < channel.length; i++) {
|
|
160
162
|
sum += channel[i] * channel[i];
|
|
161
163
|
}
|
|
162
|
-
const
|
|
164
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
163
165
|
|
|
164
|
-
//
|
|
165
|
-
//
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
166
|
+
// Smooth the RMS energy to reduce jitter
|
|
167
|
+
// this.energy acts as the smoothed RMS value
|
|
168
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
169
|
+
|
|
170
|
+
// Adaptive noise floor estimation
|
|
171
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
172
|
+
if (instantRms < this.noiseFloor) {
|
|
173
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
174
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
175
|
+
} else {
|
|
176
|
+
// If signal is louder, adapt upwards
|
|
177
|
+
// If we are currently speaking, adapt EXTREMELY slowly to avoid "chasing" speech
|
|
178
|
+
// If we are silent, adapt at the normal loud rate
|
|
179
|
+
const adaptRate = this.isSpeaking ? (this.noiseFloorAdaptRateLoud * 0.02) : this.noiseFloorAdaptRateLoud;
|
|
180
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
174
181
|
}
|
|
175
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
176
182
|
|
|
177
|
-
//
|
|
178
|
-
|
|
183
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
184
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
185
|
+
|
|
186
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
187
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
188
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
179
189
|
|
|
180
|
-
// Map SNR to probability (0-1)
|
|
181
|
-
// Probability is 0 when
|
|
190
|
+
// Map SNR dB to probability (0-1)
|
|
191
|
+
// Probability is 0 when snrDb <= minSNR
|
|
182
192
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
183
|
-
|
|
184
|
-
|
|
193
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
194
|
+
|
|
195
|
+
// Apply absolute energy threshold
|
|
196
|
+
if (this.energy < this.minEnergy) {
|
|
197
|
+
probability = 0;
|
|
198
|
+
}
|
|
185
199
|
|
|
186
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
200
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
187
201
|
|
|
188
202
|
return true;
|
|
189
203
|
}
|
|
@@ -308,10 +322,11 @@ var VADStateMachine = class {
|
|
|
308
322
|
energyVad: {
|
|
309
323
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
310
324
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
311
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
312
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
313
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
314
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
325
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
326
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 5e-3,
|
|
327
|
+
minSNR: config?.energyVad?.minSNR ?? 6,
|
|
328
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
329
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
315
330
|
}
|
|
316
331
|
};
|
|
317
332
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-JP6DA62Y.mjs";
|
|
4
|
+
import "../chunk-2EX3FXSF.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-
|
|
6
|
+
import "../chunk-BMVZ3KKG.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-2TKYGFMC.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -97,24 +97,30 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.05
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Default: 0.
|
|
105
|
+
* Default: 0.005 (slow upward adaptation; the energy VAD worklet scales this by a further 0.02 while speech is active)
|
|
106
106
|
*/
|
|
107
107
|
noiseFloorAdaptRateLoud?: number;
|
|
108
108
|
/**
|
|
109
|
-
* Minimum SNR (Signal-to-Noise Ratio) for speech detection.
|
|
110
|
-
* Default:
|
|
109
|
+
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
110
|
+
* Default: 6.0 (smoothed RMS is ~2x the noise floor in amplitude; dB = 20 * log10(ratio))
|
|
111
111
|
*/
|
|
112
112
|
minSNR?: number;
|
|
113
113
|
/**
|
|
114
|
-
* SNR range for probability scaling.
|
|
115
|
-
* Default:
|
|
114
|
+
* SNR range in dB for probability scaling.
|
|
115
|
+
* Default: 12.0 (probability scales from minSNR to minSNR+snrRange)
|
|
116
116
|
*/
|
|
117
117
|
snrRange?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Minimum absolute RMS energy to consider as speech.
|
|
120
|
+
* Prevents triggering on very quiet background noise in silent rooms.
|
|
121
|
+
* Default: 0.0005
|
|
122
|
+
*/
|
|
123
|
+
minEnergy?: number;
|
|
118
124
|
};
|
|
119
125
|
};
|
|
120
126
|
/**
|
package/dist/types.d.ts
CHANGED
|
@@ -97,24 +97,30 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.05
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Default: 0.
|
|
105
|
+
* Default: 0.005 (slow upward adaptation; the energy VAD worklet scales this by a further 0.02 while speech is active)
|
|
106
106
|
*/
|
|
107
107
|
noiseFloorAdaptRateLoud?: number;
|
|
108
108
|
/**
|
|
109
|
-
* Minimum SNR (Signal-to-Noise Ratio) for speech detection.
|
|
110
|
-
* Default:
|
|
109
|
+
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
110
|
+
* Default: 6.0 (smoothed RMS is ~2x the noise floor in amplitude; dB = 20 * log10(ratio))
|
|
111
111
|
*/
|
|
112
112
|
minSNR?: number;
|
|
113
113
|
/**
|
|
114
|
-
* SNR range for probability scaling.
|
|
115
|
-
* Default:
|
|
114
|
+
* SNR range in dB for probability scaling.
|
|
115
|
+
* Default: 12.0 (probability scales from minSNR to minSNR+snrRange)
|
|
116
116
|
*/
|
|
117
117
|
snrRange?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Minimum absolute RMS energy to consider as speech.
|
|
120
|
+
* Prevents triggering on very quiet background noise in silent rooms.
|
|
121
|
+
* Default: 0.0005
|
|
122
|
+
*/
|
|
123
|
+
minEnergy?: number;
|
|
118
124
|
};
|
|
119
125
|
};
|
|
120
126
|
/**
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -27,10 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
27
27
|
const energyParams = vadConfig?.energyVad || {};
|
|
28
28
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
29
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
31
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
32
|
-
const minSNR = energyParams.minSNR ??
|
|
33
|
-
const snrRange = energyParams.snrRange ??
|
|
30
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
31
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 5e-3;
|
|
32
|
+
const minSNR = energyParams.minSNR ?? 6;
|
|
33
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
34
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
34
35
|
return `
|
|
35
36
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
36
37
|
constructor() {
|
|
@@ -42,6 +43,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
42
43
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
43
44
|
this.minSNR = ${minSNR};
|
|
44
45
|
this.snrRange = ${snrRange};
|
|
46
|
+
this.minEnergy = ${minEnergy};
|
|
45
47
|
this.isSpeaking = false;
|
|
46
48
|
|
|
47
49
|
this.port.onmessage = (event) => {
|
|
@@ -56,36 +58,48 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
56
58
|
if (!input || !input.length) return true;
|
|
57
59
|
const channel = input[0];
|
|
58
60
|
|
|
59
|
-
// Calculate RMS (Root Mean Square) energy
|
|
61
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
60
62
|
let sum = 0;
|
|
61
63
|
for (let i = 0; i < channel.length; i++) {
|
|
62
64
|
sum += channel[i] * channel[i];
|
|
63
65
|
}
|
|
64
|
-
const
|
|
66
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
65
67
|
|
|
66
|
-
//
|
|
67
|
-
//
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
68
|
+
// Smooth the RMS energy to reduce jitter
|
|
69
|
+
// this.energy acts as the smoothed RMS value
|
|
70
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
71
|
+
|
|
72
|
+
// Adaptive noise floor estimation
|
|
73
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
74
|
+
if (instantRms < this.noiseFloor) {
|
|
75
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
76
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
77
|
+
} else {
|
|
78
|
+
// If signal is louder, adapt upwards
|
|
79
|
+
// If we are currently speaking, adapt EXTREMELY slowly to avoid "chasing" speech
|
|
80
|
+
// If we are silent, adapt at the normal loud rate
|
|
81
|
+
const adaptRate = this.isSpeaking ? (this.noiseFloorAdaptRateLoud * 0.02) : this.noiseFloorAdaptRateLoud;
|
|
82
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
76
83
|
}
|
|
77
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
78
84
|
|
|
79
|
-
//
|
|
80
|
-
|
|
85
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
86
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
87
|
+
|
|
88
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
89
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
90
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
81
91
|
|
|
82
|
-
// Map SNR to probability (0-1)
|
|
83
|
-
// Probability is 0 when
|
|
92
|
+
// Map SNR dB to probability (0-1)
|
|
93
|
+
// Probability is 0 when snrDb <= minSNR
|
|
84
94
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
85
|
-
|
|
86
|
-
|
|
95
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
96
|
+
|
|
97
|
+
// Apply absolute energy threshold
|
|
98
|
+
if (this.energy < this.minEnergy) {
|
|
99
|
+
probability = 0;
|
|
100
|
+
}
|
|
87
101
|
|
|
88
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
102
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
89
103
|
|
|
90
104
|
return true;
|
|
91
105
|
}
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.js
CHANGED
|
@@ -49,10 +49,11 @@ var VADStateMachine = class {
|
|
|
49
49
|
energyVad: {
|
|
50
50
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
51
51
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
52
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
53
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
54
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
55
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
52
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
53
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 5e-3,
|
|
54
|
+
minSNR: config?.energyVad?.minSNR ?? 6,
|
|
55
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
56
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
56
57
|
}
|
|
57
58
|
};
|
|
58
59
|
this.lastSilenceTime = Date.now();
|
package/dist/vad/vad-state.mjs
CHANGED