@tensamin/audio 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -9
- package/dist/{chunk-DF4AYGHJ.mjs → chunk-2UPI6VWY.mjs} +2 -2
- package/dist/{chunk-TLPO52HV.mjs → chunk-3A2CTC4K.mjs} +49 -31
- package/dist/{chunk-ZCC7ID7L.mjs → chunk-FOGC2MFA.mjs} +1 -1
- package/dist/{chunk-N553RHTI.mjs → chunk-XHMNP7NC.mjs} +8 -7
- package/dist/{chunk-TWQJGBBU.mjs → chunk-Y6IG7XGC.mjs} +1 -1
- package/dist/extensibility/plugins.js +49 -31
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +57 -38
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +57 -38
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +57 -38
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +15 -9
- package/dist/types.d.ts +15 -9
- package/dist/vad/vad-node.js +49 -31
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.js +8 -7
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -101,19 +101,20 @@ vad: {
|
|
|
101
101
|
energyVad?: {
|
|
102
102
|
smoothing: number; // Default: 0.95
|
|
103
103
|
initialNoiseFloor: number; // Default: 0.001
|
|
104
|
-
noiseFloorAdaptRateQuiet: number; // Default: 0.
|
|
105
|
-
noiseFloorAdaptRateLoud: number; // Default: 0.
|
|
106
|
-
minSNR: number; // Default:
|
|
107
|
-
snrRange: number; // Default:
|
|
104
|
+
noiseFloorAdaptRateQuiet: number; // Default: 0.05
|
|
105
|
+
noiseFloorAdaptRateLoud: number; // Default: 0.01
|
|
106
|
+
minSNR: number; // Default: 10.0 (dB)
|
|
107
|
+
snrRange: number; // Default: 10.0 (dB)
|
|
108
|
+
minEnergy: number; // Default: 0.0005
|
|
108
109
|
};
|
|
109
110
|
}
|
|
110
111
|
```
|
|
111
112
|
|
|
112
113
|
**Threshold Parameters:**
|
|
113
114
|
|
|
114
|
-
- `startThreshold`: Probability threshold to unmute audio
|
|
115
|
-
- `stopThreshold`: Probability threshold to mute audio (
|
|
116
|
-
- `hangoverMs`: Delay before muting after speech stops
|
|
115
|
+
- `startThreshold`: Probability threshold to unmute audio (Default: 0.8, ~18dB SNR)
|
|
116
|
+
- `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~13dB SNR)
|
|
117
|
+
- `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
|
|
117
118
|
- `preRollMs`: Audio buffer duration before speech onset
|
|
118
119
|
- `minSpeechDurationMs`: Minimum duration to consider as valid speech
|
|
119
120
|
- `minSilenceDurationMs`: Minimum silence duration between speech segments
|
|
@@ -121,8 +122,9 @@ vad: {
|
|
|
121
122
|
**Energy VAD Parameters:**
|
|
122
123
|
|
|
123
124
|
- `smoothing`: Energy calculation smoothing factor (0-1)
|
|
124
|
-
- `minSNR`: Minimum signal-to-noise ratio for speech detection
|
|
125
|
-
- `snrRange`: Range for probability scaling from minSNR
|
|
125
|
+
- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
|
|
126
|
+
- `snrRange`: Range in dB for probability scaling from minSNR
|
|
127
|
+
- `minEnergy`: Minimum absolute RMS energy to consider as speech
|
|
126
128
|
|
|
127
129
|
### Output Control
|
|
128
130
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
VADStateMachine
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-XHMNP7NC.mjs";
|
|
4
4
|
import {
|
|
5
5
|
getAudioContext,
|
|
6
6
|
registerPipeline,
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-FOGC2MFA.mjs";
|
|
13
13
|
|
|
14
14
|
// src/pipeline/audio-pipeline.ts
|
|
15
15
|
import mitt from "mitt";
|
|
@@ -3,10 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
3
3
|
const energyParams = vadConfig?.energyVad || {};
|
|
4
4
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
5
5
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
6
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
7
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
8
|
-
const minSNR = energyParams.minSNR ??
|
|
9
|
-
const snrRange = energyParams.snrRange ??
|
|
6
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
7
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
|
|
8
|
+
const minSNR = energyParams.minSNR ?? 10;
|
|
9
|
+
const snrRange = energyParams.snrRange ?? 10;
|
|
10
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
10
11
|
return `
|
|
11
12
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
12
13
|
constructor() {
|
|
@@ -18,6 +19,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
18
19
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
19
20
|
this.minSNR = ${minSNR};
|
|
20
21
|
this.snrRange = ${snrRange};
|
|
22
|
+
this.minEnergy = ${minEnergy};
|
|
21
23
|
this.isSpeaking = false;
|
|
22
24
|
|
|
23
25
|
this.port.onmessage = (event) => {
|
|
@@ -32,44 +34,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
32
34
|
if (!input || !input.length) return true;
|
|
33
35
|
const channel = input[0];
|
|
34
36
|
|
|
35
|
-
// Calculate RMS (Root Mean Square) energy
|
|
37
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
36
38
|
let sum = 0;
|
|
37
39
|
for (let i = 0; i < channel.length; i++) {
|
|
38
40
|
sum += channel[i] * channel[i];
|
|
39
41
|
}
|
|
40
|
-
const
|
|
42
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
41
43
|
|
|
42
|
-
//
|
|
43
|
-
//
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
44
|
+
// Smooth the RMS energy to reduce jitter
|
|
45
|
+
// this.energy acts as the smoothed RMS value
|
|
46
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
47
|
+
|
|
48
|
+
// Adaptive noise floor estimation
|
|
49
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
50
|
+
if (instantRms < this.noiseFloor) {
|
|
51
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
52
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
53
|
+
} else {
|
|
54
|
+
// If signal is louder, adapt upwards
|
|
55
|
+
// We use a multi-stage adaptation rate:
|
|
56
|
+
// 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
|
|
57
|
+
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
58
|
+
// 3. Otherwise, adapt at the normal loud rate
|
|
59
|
+
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
60
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
61
|
+
|
|
62
|
+
let multiplier = 1.0;
|
|
63
|
+
if (this.isSpeaking) {
|
|
64
|
+
multiplier = 0.01;
|
|
65
|
+
} else if (snrDb > 20) {
|
|
66
|
+
multiplier = 0.1;
|
|
59
67
|
}
|
|
68
|
+
|
|
69
|
+
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
70
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
60
71
|
}
|
|
61
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
62
72
|
|
|
63
|
-
//
|
|
64
|
-
|
|
73
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
74
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
75
|
+
|
|
76
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
77
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
78
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
65
79
|
|
|
66
|
-
// Map SNR to probability (0-1)
|
|
67
|
-
// Probability is 0 when
|
|
80
|
+
// Map SNR dB to probability (0-1)
|
|
81
|
+
// Probability is 0 when snrDb <= minSNR
|
|
68
82
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
69
|
-
|
|
70
|
-
|
|
83
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
84
|
+
|
|
85
|
+
// Apply absolute energy threshold
|
|
86
|
+
if (this.energy < this.minEnergy) {
|
|
87
|
+
probability = 0;
|
|
88
|
+
}
|
|
71
89
|
|
|
72
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
90
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
73
91
|
|
|
74
92
|
return true;
|
|
75
93
|
}
|
|
@@ -12,11 +12,11 @@ var VADStateMachine = class {
|
|
|
12
12
|
enabled: config?.enabled ?? true,
|
|
13
13
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
14
14
|
// Voice-optimized defaults
|
|
15
|
-
startThreshold: config?.startThreshold ?? 0.
|
|
15
|
+
startThreshold: config?.startThreshold ?? 0.8,
|
|
16
16
|
// Higher threshold to avoid noise
|
|
17
|
-
stopThreshold: config?.stopThreshold ?? 0.
|
|
17
|
+
stopThreshold: config?.stopThreshold ?? 0.3,
|
|
18
18
|
// Balanced for voice
|
|
19
|
-
hangoverMs: config?.hangoverMs ??
|
|
19
|
+
hangoverMs: config?.hangoverMs ?? 300,
|
|
20
20
|
// Smooth for natural speech
|
|
21
21
|
preRollMs: config?.preRollMs ?? 250,
|
|
22
22
|
// Generous pre-roll
|
|
@@ -25,10 +25,11 @@ var VADStateMachine = class {
|
|
|
25
25
|
energyVad: {
|
|
26
26
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
27
27
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
28
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
29
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
30
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
31
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
28
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
29
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
|
|
30
|
+
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
31
|
+
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
32
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
32
33
|
}
|
|
33
34
|
};
|
|
34
35
|
this.lastSilenceTime = Date.now();
|
|
@@ -106,10 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
106
106
|
const energyParams = vadConfig?.energyVad || {};
|
|
107
107
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
108
108
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
109
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
110
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
111
|
-
const minSNR = energyParams.minSNR ??
|
|
112
|
-
const snrRange = energyParams.snrRange ??
|
|
109
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
110
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
|
|
111
|
+
const minSNR = energyParams.minSNR ?? 10;
|
|
112
|
+
const snrRange = energyParams.snrRange ?? 10;
|
|
113
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
113
114
|
return `
|
|
114
115
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
115
116
|
constructor() {
|
|
@@ -121,6 +122,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
121
122
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
122
123
|
this.minSNR = ${minSNR};
|
|
123
124
|
this.snrRange = ${snrRange};
|
|
125
|
+
this.minEnergy = ${minEnergy};
|
|
124
126
|
this.isSpeaking = false;
|
|
125
127
|
|
|
126
128
|
this.port.onmessage = (event) => {
|
|
@@ -135,44 +137,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
135
137
|
if (!input || !input.length) return true;
|
|
136
138
|
const channel = input[0];
|
|
137
139
|
|
|
138
|
-
// Calculate RMS (Root Mean Square) energy
|
|
140
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
139
141
|
let sum = 0;
|
|
140
142
|
for (let i = 0; i < channel.length; i++) {
|
|
141
143
|
sum += channel[i] * channel[i];
|
|
142
144
|
}
|
|
143
|
-
const
|
|
145
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
144
146
|
|
|
145
|
-
//
|
|
146
|
-
//
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
147
|
+
// Smooth the RMS energy to reduce jitter
|
|
148
|
+
// this.energy acts as the smoothed RMS value
|
|
149
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
150
|
+
|
|
151
|
+
// Adaptive noise floor estimation
|
|
152
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
153
|
+
if (instantRms < this.noiseFloor) {
|
|
154
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
155
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
156
|
+
} else {
|
|
157
|
+
// If signal is louder, adapt upwards
|
|
158
|
+
// We use a multi-stage adaptation rate:
|
|
159
|
+
// 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
|
|
160
|
+
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
161
|
+
// 3. Otherwise, adapt at the normal loud rate
|
|
162
|
+
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
163
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
164
|
+
|
|
165
|
+
let multiplier = 1.0;
|
|
166
|
+
if (this.isSpeaking) {
|
|
167
|
+
multiplier = 0.01;
|
|
168
|
+
} else if (snrDb > 20) {
|
|
169
|
+
multiplier = 0.1;
|
|
162
170
|
}
|
|
171
|
+
|
|
172
|
+
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
173
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
163
174
|
}
|
|
164
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
165
175
|
|
|
166
|
-
//
|
|
167
|
-
|
|
176
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
177
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
178
|
+
|
|
179
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
180
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
181
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
168
182
|
|
|
169
|
-
// Map SNR to probability (0-1)
|
|
170
|
-
// Probability is 0 when
|
|
183
|
+
// Map SNR dB to probability (0-1)
|
|
184
|
+
// Probability is 0 when snrDb <= minSNR
|
|
171
185
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
172
|
-
|
|
173
|
-
|
|
186
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
187
|
+
|
|
188
|
+
// Apply absolute energy threshold
|
|
189
|
+
if (this.energy < this.minEnergy) {
|
|
190
|
+
probability = 0;
|
|
191
|
+
}
|
|
174
192
|
|
|
175
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
193
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
176
194
|
|
|
177
195
|
return true;
|
|
178
196
|
}
|
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
getVADPlugin,
|
|
4
4
|
registerNoiseSuppressionPlugin,
|
|
5
5
|
registerVADPlugin
|
|
6
|
-
} from "../chunk-
|
|
6
|
+
} from "../chunk-FOGC2MFA.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-3A2CTC4K.mjs";
|
|
9
9
|
export {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin,
|
package/dist/index.js
CHANGED
|
@@ -158,10 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
158
158
|
const energyParams = vadConfig?.energyVad || {};
|
|
159
159
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
160
160
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
161
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
162
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
163
|
-
const minSNR = energyParams.minSNR ??
|
|
164
|
-
const snrRange = energyParams.snrRange ??
|
|
161
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
162
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
|
|
163
|
+
const minSNR = energyParams.minSNR ?? 10;
|
|
164
|
+
const snrRange = energyParams.snrRange ?? 10;
|
|
165
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
165
166
|
return `
|
|
166
167
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
167
168
|
constructor() {
|
|
@@ -173,6 +174,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
173
174
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
174
175
|
this.minSNR = ${minSNR};
|
|
175
176
|
this.snrRange = ${snrRange};
|
|
177
|
+
this.minEnergy = ${minEnergy};
|
|
176
178
|
this.isSpeaking = false;
|
|
177
179
|
|
|
178
180
|
this.port.onmessage = (event) => {
|
|
@@ -187,44 +189,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
187
189
|
if (!input || !input.length) return true;
|
|
188
190
|
const channel = input[0];
|
|
189
191
|
|
|
190
|
-
// Calculate RMS (Root Mean Square) energy
|
|
192
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
191
193
|
let sum = 0;
|
|
192
194
|
for (let i = 0; i < channel.length; i++) {
|
|
193
195
|
sum += channel[i] * channel[i];
|
|
194
196
|
}
|
|
195
|
-
const
|
|
197
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
196
198
|
|
|
197
|
-
//
|
|
198
|
-
//
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
199
|
+
// Smooth the RMS energy to reduce jitter
|
|
200
|
+
// this.energy acts as the smoothed RMS value
|
|
201
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
202
|
+
|
|
203
|
+
// Adaptive noise floor estimation
|
|
204
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
205
|
+
if (instantRms < this.noiseFloor) {
|
|
206
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
207
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
208
|
+
} else {
|
|
209
|
+
// If signal is louder, adapt upwards
|
|
210
|
+
// We use a multi-stage adaptation rate:
|
|
211
|
+
// 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
|
|
212
|
+
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
213
|
+
// 3. Otherwise, adapt at the normal loud rate
|
|
214
|
+
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
215
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
216
|
+
|
|
217
|
+
let multiplier = 1.0;
|
|
218
|
+
if (this.isSpeaking) {
|
|
219
|
+
multiplier = 0.01;
|
|
220
|
+
} else if (snrDb > 20) {
|
|
221
|
+
multiplier = 0.1;
|
|
214
222
|
}
|
|
223
|
+
|
|
224
|
+
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
225
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
215
226
|
}
|
|
216
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
217
227
|
|
|
218
|
-
//
|
|
219
|
-
|
|
228
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
229
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
230
|
+
|
|
231
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
232
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
233
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
220
234
|
|
|
221
|
-
// Map SNR to probability (0-1)
|
|
222
|
-
// Probability is 0 when
|
|
235
|
+
// Map SNR dB to probability (0-1)
|
|
236
|
+
// Probability is 0 when snrDb <= minSNR
|
|
223
237
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
224
|
-
|
|
225
|
-
|
|
238
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
239
|
+
|
|
240
|
+
// Apply absolute energy threshold
|
|
241
|
+
if (this.energy < this.minEnergy) {
|
|
242
|
+
probability = 0;
|
|
243
|
+
}
|
|
226
244
|
|
|
227
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
245
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
228
246
|
|
|
229
247
|
return true;
|
|
230
248
|
}
|
|
@@ -342,11 +360,11 @@ var VADStateMachine = class {
|
|
|
342
360
|
enabled: config?.enabled ?? true,
|
|
343
361
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
344
362
|
// Voice-optimized defaults
|
|
345
|
-
startThreshold: config?.startThreshold ?? 0.
|
|
363
|
+
startThreshold: config?.startThreshold ?? 0.8,
|
|
346
364
|
// Higher threshold to avoid noise
|
|
347
|
-
stopThreshold: config?.stopThreshold ?? 0.
|
|
365
|
+
stopThreshold: config?.stopThreshold ?? 0.3,
|
|
348
366
|
// Balanced for voice
|
|
349
|
-
hangoverMs: config?.hangoverMs ??
|
|
367
|
+
hangoverMs: config?.hangoverMs ?? 300,
|
|
350
368
|
// Smooth for natural speech
|
|
351
369
|
preRollMs: config?.preRollMs ?? 250,
|
|
352
370
|
// Generous pre-roll
|
|
@@ -355,10 +373,11 @@ var VADStateMachine = class {
|
|
|
355
373
|
energyVad: {
|
|
356
374
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
357
375
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
358
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
359
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
360
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
361
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
376
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
377
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
|
|
378
|
+
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
379
|
+
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
380
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
362
381
|
}
|
|
363
382
|
};
|
|
364
383
|
this.lastSilenceTime = Date.now();
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import "./chunk-WBQAMGXK.mjs";
|
|
2
2
|
import {
|
|
3
3
|
attachProcessingToTrack
|
|
4
|
-
} from "./chunk-
|
|
4
|
+
} from "./chunk-Y6IG7XGC.mjs";
|
|
5
5
|
import {
|
|
6
6
|
createAudioPipeline
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-2UPI6VWY.mjs";
|
|
8
8
|
import {
|
|
9
9
|
VADStateMachine
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-XHMNP7NC.mjs";
|
|
11
11
|
import {
|
|
12
12
|
closeAudioContext,
|
|
13
13
|
getAudioContext,
|
|
@@ -21,13 +21,13 @@ import {
|
|
|
21
21
|
getVADPlugin,
|
|
22
22
|
registerNoiseSuppressionPlugin,
|
|
23
23
|
registerVADPlugin
|
|
24
|
-
} from "./chunk-
|
|
24
|
+
} from "./chunk-FOGC2MFA.mjs";
|
|
25
25
|
import {
|
|
26
26
|
RNNoisePlugin
|
|
27
27
|
} from "./chunk-XO6B3D4A.mjs";
|
|
28
28
|
import {
|
|
29
29
|
EnergyVADPlugin
|
|
30
|
-
} from "./chunk-
|
|
30
|
+
} from "./chunk-3A2CTC4K.mjs";
|
|
31
31
|
export {
|
|
32
32
|
EnergyVADPlugin,
|
|
33
33
|
RNNoisePlugin,
|
|
@@ -127,10 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
127
127
|
const energyParams = vadConfig?.energyVad || {};
|
|
128
128
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
129
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
131
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
132
|
-
const minSNR = energyParams.minSNR ??
|
|
133
|
-
const snrRange = energyParams.snrRange ??
|
|
130
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
131
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
|
|
132
|
+
const minSNR = energyParams.minSNR ?? 10;
|
|
133
|
+
const snrRange = energyParams.snrRange ?? 10;
|
|
134
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
134
135
|
return `
|
|
135
136
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
136
137
|
constructor() {
|
|
@@ -142,6 +143,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
142
143
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
143
144
|
this.minSNR = ${minSNR};
|
|
144
145
|
this.snrRange = ${snrRange};
|
|
146
|
+
this.minEnergy = ${minEnergy};
|
|
145
147
|
this.isSpeaking = false;
|
|
146
148
|
|
|
147
149
|
this.port.onmessage = (event) => {
|
|
@@ -156,44 +158,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
156
158
|
if (!input || !input.length) return true;
|
|
157
159
|
const channel = input[0];
|
|
158
160
|
|
|
159
|
-
// Calculate RMS (Root Mean Square) energy
|
|
161
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
160
162
|
let sum = 0;
|
|
161
163
|
for (let i = 0; i < channel.length; i++) {
|
|
162
164
|
sum += channel[i] * channel[i];
|
|
163
165
|
}
|
|
164
|
-
const
|
|
166
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
165
167
|
|
|
166
|
-
//
|
|
167
|
-
//
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
168
|
+
// Smooth the RMS energy to reduce jitter
|
|
169
|
+
// this.energy acts as the smoothed RMS value
|
|
170
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
171
|
+
|
|
172
|
+
// Adaptive noise floor estimation
|
|
173
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
174
|
+
if (instantRms < this.noiseFloor) {
|
|
175
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
176
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
177
|
+
} else {
|
|
178
|
+
// If signal is louder, adapt upwards
|
|
179
|
+
// We use a multi-stage adaptation rate:
|
|
180
|
+
// 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
|
|
181
|
+
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
182
|
+
// 3. Otherwise, adapt at the normal loud rate
|
|
183
|
+
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
184
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
185
|
+
|
|
186
|
+
let multiplier = 1.0;
|
|
187
|
+
if (this.isSpeaking) {
|
|
188
|
+
multiplier = 0.01;
|
|
189
|
+
} else if (snrDb > 20) {
|
|
190
|
+
multiplier = 0.1;
|
|
183
191
|
}
|
|
192
|
+
|
|
193
|
+
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
194
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
184
195
|
}
|
|
185
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
186
196
|
|
|
187
|
-
//
|
|
188
|
-
|
|
197
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
198
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
199
|
+
|
|
200
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
201
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
202
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
189
203
|
|
|
190
|
-
// Map SNR to probability (0-1)
|
|
191
|
-
// Probability is 0 when
|
|
204
|
+
// Map SNR dB to probability (0-1)
|
|
205
|
+
// Probability is 0 when snrDb <= minSNR
|
|
192
206
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
193
|
-
|
|
194
|
-
|
|
207
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
208
|
+
|
|
209
|
+
// Apply absolute energy threshold
|
|
210
|
+
if (this.energy < this.minEnergy) {
|
|
211
|
+
probability = 0;
|
|
212
|
+
}
|
|
195
213
|
|
|
196
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
214
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
197
215
|
|
|
198
216
|
return true;
|
|
199
217
|
}
|
|
@@ -305,11 +323,11 @@ var VADStateMachine = class {
|
|
|
305
323
|
enabled: config?.enabled ?? true,
|
|
306
324
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
307
325
|
// Voice-optimized defaults
|
|
308
|
-
startThreshold: config?.startThreshold ?? 0.
|
|
326
|
+
startThreshold: config?.startThreshold ?? 0.8,
|
|
309
327
|
// Higher threshold to avoid noise
|
|
310
|
-
stopThreshold: config?.stopThreshold ?? 0.
|
|
328
|
+
stopThreshold: config?.stopThreshold ?? 0.3,
|
|
311
329
|
// Balanced for voice
|
|
312
|
-
hangoverMs: config?.hangoverMs ??
|
|
330
|
+
hangoverMs: config?.hangoverMs ?? 300,
|
|
313
331
|
// Smooth for natural speech
|
|
314
332
|
preRollMs: config?.preRollMs ?? 250,
|
|
315
333
|
// Generous pre-roll
|
|
@@ -318,10 +336,11 @@ var VADStateMachine = class {
|
|
|
318
336
|
energyVad: {
|
|
319
337
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
320
338
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
321
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
322
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
323
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
324
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
339
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
340
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
|
|
341
|
+
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
342
|
+
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
343
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
325
344
|
}
|
|
326
345
|
};
|
|
327
346
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import {
|
|
2
2
|
attachProcessingToTrack
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
5
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-Y6IG7XGC.mjs";
|
|
4
|
+
import "../chunk-2UPI6VWY.mjs";
|
|
5
|
+
import "../chunk-XHMNP7NC.mjs";
|
|
6
6
|
import "../chunk-OZ7KMC4S.mjs";
|
|
7
|
-
import "../chunk-
|
|
7
|
+
import "../chunk-FOGC2MFA.mjs";
|
|
8
8
|
import "../chunk-XO6B3D4A.mjs";
|
|
9
|
-
import "../chunk-
|
|
9
|
+
import "../chunk-3A2CTC4K.mjs";
|
|
10
10
|
export {
|
|
11
11
|
attachProcessingToTrack
|
|
12
12
|
};
|
|
@@ -125,10 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
125
125
|
const energyParams = vadConfig?.energyVad || {};
|
|
126
126
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
127
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
129
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
130
|
-
const minSNR = energyParams.minSNR ??
|
|
131
|
-
const snrRange = energyParams.snrRange ??
|
|
128
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
129
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
|
|
130
|
+
const minSNR = energyParams.minSNR ?? 10;
|
|
131
|
+
const snrRange = energyParams.snrRange ?? 10;
|
|
132
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
132
133
|
return `
|
|
133
134
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
134
135
|
constructor() {
|
|
@@ -140,6 +141,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
140
141
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
141
142
|
this.minSNR = ${minSNR};
|
|
142
143
|
this.snrRange = ${snrRange};
|
|
144
|
+
this.minEnergy = ${minEnergy};
|
|
143
145
|
this.isSpeaking = false;
|
|
144
146
|
|
|
145
147
|
this.port.onmessage = (event) => {
|
|
@@ -154,44 +156,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
154
156
|
if (!input || !input.length) return true;
|
|
155
157
|
const channel = input[0];
|
|
156
158
|
|
|
157
|
-
// Calculate RMS (Root Mean Square) energy
|
|
159
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
158
160
|
let sum = 0;
|
|
159
161
|
for (let i = 0; i < channel.length; i++) {
|
|
160
162
|
sum += channel[i] * channel[i];
|
|
161
163
|
}
|
|
162
|
-
const
|
|
164
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
163
165
|
|
|
164
|
-
//
|
|
165
|
-
//
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
166
|
+
// Smooth the RMS energy to reduce jitter
|
|
167
|
+
// this.energy acts as the smoothed RMS value
|
|
168
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
169
|
+
|
|
170
|
+
// Adaptive noise floor estimation
|
|
171
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
172
|
+
if (instantRms < this.noiseFloor) {
|
|
173
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
174
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
175
|
+
} else {
|
|
176
|
+
// If signal is louder, adapt upwards
|
|
177
|
+
// We use a multi-stage adaptation rate:
|
|
178
|
+
// 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
|
|
179
|
+
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
180
|
+
// 3. Otherwise, adapt at the normal loud rate
|
|
181
|
+
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
182
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
183
|
+
|
|
184
|
+
let multiplier = 1.0;
|
|
185
|
+
if (this.isSpeaking) {
|
|
186
|
+
multiplier = 0.01;
|
|
187
|
+
} else if (snrDb > 20) {
|
|
188
|
+
multiplier = 0.1;
|
|
181
189
|
}
|
|
190
|
+
|
|
191
|
+
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
192
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
182
193
|
}
|
|
183
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
184
194
|
|
|
185
|
-
//
|
|
186
|
-
|
|
195
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
196
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
197
|
+
|
|
198
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
199
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
200
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
187
201
|
|
|
188
|
-
// Map SNR to probability (0-1)
|
|
189
|
-
// Probability is 0 when
|
|
202
|
+
// Map SNR dB to probability (0-1)
|
|
203
|
+
// Probability is 0 when snrDb <= minSNR
|
|
190
204
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
191
|
-
|
|
192
|
-
|
|
205
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
206
|
+
|
|
207
|
+
// Apply absolute energy threshold
|
|
208
|
+
if (this.energy < this.minEnergy) {
|
|
209
|
+
probability = 0;
|
|
210
|
+
}
|
|
193
211
|
|
|
194
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
212
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
195
213
|
|
|
196
214
|
return true;
|
|
197
215
|
}
|
|
@@ -303,11 +321,11 @@ var VADStateMachine = class {
|
|
|
303
321
|
enabled: config?.enabled ?? true,
|
|
304
322
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
305
323
|
// Voice-optimized defaults
|
|
306
|
-
startThreshold: config?.startThreshold ?? 0.
|
|
324
|
+
startThreshold: config?.startThreshold ?? 0.8,
|
|
307
325
|
// Higher threshold to avoid noise
|
|
308
|
-
stopThreshold: config?.stopThreshold ?? 0.
|
|
326
|
+
stopThreshold: config?.stopThreshold ?? 0.3,
|
|
309
327
|
// Balanced for voice
|
|
310
|
-
hangoverMs: config?.hangoverMs ??
|
|
328
|
+
hangoverMs: config?.hangoverMs ?? 300,
|
|
311
329
|
// Smooth for natural speech
|
|
312
330
|
preRollMs: config?.preRollMs ?? 250,
|
|
313
331
|
// Generous pre-roll
|
|
@@ -316,10 +334,11 @@ var VADStateMachine = class {
|
|
|
316
334
|
energyVad: {
|
|
317
335
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
318
336
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
319
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
320
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
321
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
322
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
337
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
338
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
|
|
339
|
+
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
340
|
+
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
341
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
323
342
|
}
|
|
324
343
|
};
|
|
325
344
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-2UPI6VWY.mjs";
|
|
4
|
+
import "../chunk-XHMNP7NC.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-
|
|
6
|
+
import "../chunk-FOGC2MFA.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-3A2CTC4K.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -43,7 +43,7 @@ interface AudioProcessingConfig {
|
|
|
43
43
|
* When VAD probability rises above this, audio is unmuted.
|
|
44
44
|
* Lower = more sensitive (catches quiet speech, may include noise)
|
|
45
45
|
* Higher = less sensitive (only confident speech, may clip quiet parts)
|
|
46
|
-
* Default: 0.
|
|
46
|
+
* Default: 0.8 (aggressive noise rejection)
|
|
47
47
|
*/
|
|
48
48
|
startThreshold?: number;
|
|
49
49
|
/**
|
|
@@ -51,7 +51,7 @@ interface AudioProcessingConfig {
|
|
|
51
51
|
* When VAD probability drops below this (after hangover), audio is muted.
|
|
52
52
|
* Lower = keeps audio on longer (less aggressive gating)
|
|
53
53
|
* Higher = mutes faster (more aggressive noise suppression)
|
|
54
|
-
* Default: 0.
|
|
54
|
+
* Default: 0.3 (wide hysteresis for stability)
|
|
55
55
|
*/
|
|
56
56
|
stopThreshold?: number;
|
|
57
57
|
/**
|
|
@@ -59,7 +59,7 @@ interface AudioProcessingConfig {
|
|
|
59
59
|
* Prevents rapid on/off toggling during pauses.
|
|
60
60
|
* Lower = more aggressive gating, may clip between words
|
|
61
61
|
* Higher = smoother but may let trailing noise through
|
|
62
|
-
* Default:
|
|
62
|
+
* Default: 300ms
|
|
63
63
|
*/
|
|
64
64
|
hangoverMs?: number;
|
|
65
65
|
/**
|
|
@@ -97,24 +97,30 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.05
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Default: 0.
|
|
105
|
+
* Default: 0.01 (faster tracking of rising noise)
|
|
106
106
|
*/
|
|
107
107
|
noiseFloorAdaptRateLoud?: number;
|
|
108
108
|
/**
|
|
109
|
-
* Minimum SNR (Signal-to-Noise Ratio) for speech detection.
|
|
110
|
-
* Default:
|
|
109
|
+
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
110
|
+
* Default: 10.0 (more aggressive noise rejection)
|
|
111
111
|
*/
|
|
112
112
|
minSNR?: number;
|
|
113
113
|
/**
|
|
114
|
-
* SNR range for probability scaling.
|
|
115
|
-
* Default:
|
|
114
|
+
* SNR range in dB for probability scaling.
|
|
115
|
+
* Default: 10.0 (probability scales from minSNR to minSNR+snrRange)
|
|
116
116
|
*/
|
|
117
117
|
snrRange?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Minimum absolute RMS energy to consider as speech.
|
|
120
|
+
* Prevents triggering on very quiet background noise in silent rooms.
|
|
121
|
+
* Default: 0.0005
|
|
122
|
+
*/
|
|
123
|
+
minEnergy?: number;
|
|
118
124
|
};
|
|
119
125
|
};
|
|
120
126
|
/**
|
package/dist/types.d.ts
CHANGED
|
@@ -43,7 +43,7 @@ interface AudioProcessingConfig {
|
|
|
43
43
|
* When VAD probability rises above this, audio is unmuted.
|
|
44
44
|
* Lower = more sensitive (catches quiet speech, may include noise)
|
|
45
45
|
* Higher = less sensitive (only confident speech, may clip quiet parts)
|
|
46
|
-
* Default: 0.
|
|
46
|
+
* Default: 0.8 (aggressive noise rejection)
|
|
47
47
|
*/
|
|
48
48
|
startThreshold?: number;
|
|
49
49
|
/**
|
|
@@ -51,7 +51,7 @@ interface AudioProcessingConfig {
|
|
|
51
51
|
* When VAD probability drops below this (after hangover), audio is muted.
|
|
52
52
|
* Lower = keeps audio on longer (less aggressive gating)
|
|
53
53
|
* Higher = mutes faster (more aggressive noise suppression)
|
|
54
|
-
* Default: 0.
|
|
54
|
+
* Default: 0.3 (wide hysteresis for stability)
|
|
55
55
|
*/
|
|
56
56
|
stopThreshold?: number;
|
|
57
57
|
/**
|
|
@@ -59,7 +59,7 @@ interface AudioProcessingConfig {
|
|
|
59
59
|
* Prevents rapid on/off toggling during pauses.
|
|
60
60
|
* Lower = more aggressive gating, may clip between words
|
|
61
61
|
* Higher = smoother but may let trailing noise through
|
|
62
|
-
* Default:
|
|
62
|
+
* Default: 300ms
|
|
63
63
|
*/
|
|
64
64
|
hangoverMs?: number;
|
|
65
65
|
/**
|
|
@@ -97,24 +97,30 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.05
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Default: 0.
|
|
105
|
+
* Default: 0.01 (faster tracking of rising noise)
|
|
106
106
|
*/
|
|
107
107
|
noiseFloorAdaptRateLoud?: number;
|
|
108
108
|
/**
|
|
109
|
-
* Minimum SNR (Signal-to-Noise Ratio) for speech detection.
|
|
110
|
-
* Default:
|
|
109
|
+
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
110
|
+
* Default: 10.0 (more aggressive noise rejection)
|
|
111
111
|
*/
|
|
112
112
|
minSNR?: number;
|
|
113
113
|
/**
|
|
114
|
-
* SNR range for probability scaling.
|
|
115
|
-
* Default:
|
|
114
|
+
* SNR range in dB for probability scaling.
|
|
115
|
+
* Default: 10.0 (probability scales from minSNR to minSNR+snrRange)
|
|
116
116
|
*/
|
|
117
117
|
snrRange?: number;
|
|
118
|
+
/**
|
|
119
|
+
* Minimum absolute RMS energy to consider as speech.
|
|
120
|
+
* Prevents triggering on very quiet background noise in silent rooms.
|
|
121
|
+
* Default: 0.0005
|
|
122
|
+
*/
|
|
123
|
+
minEnergy?: number;
|
|
118
124
|
};
|
|
119
125
|
};
|
|
120
126
|
/**
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -27,10 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
27
27
|
const energyParams = vadConfig?.energyVad || {};
|
|
28
28
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
29
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.
|
|
31
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ??
|
|
32
|
-
const minSNR = energyParams.minSNR ??
|
|
33
|
-
const snrRange = energyParams.snrRange ??
|
|
30
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.05;
|
|
31
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.01;
|
|
32
|
+
const minSNR = energyParams.minSNR ?? 10;
|
|
33
|
+
const snrRange = energyParams.snrRange ?? 10;
|
|
34
|
+
const minEnergy = energyParams.minEnergy ?? 5e-4;
|
|
34
35
|
return `
|
|
35
36
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
36
37
|
constructor() {
|
|
@@ -42,6 +43,7 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
42
43
|
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
43
44
|
this.minSNR = ${minSNR};
|
|
44
45
|
this.snrRange = ${snrRange};
|
|
46
|
+
this.minEnergy = ${minEnergy};
|
|
45
47
|
this.isSpeaking = false;
|
|
46
48
|
|
|
47
49
|
this.port.onmessage = (event) => {
|
|
@@ -56,44 +58,60 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
56
58
|
if (!input || !input.length) return true;
|
|
57
59
|
const channel = input[0];
|
|
58
60
|
|
|
59
|
-
// Calculate RMS (Root Mean Square) energy
|
|
61
|
+
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
60
62
|
let sum = 0;
|
|
61
63
|
for (let i = 0; i < channel.length; i++) {
|
|
62
64
|
sum += channel[i] * channel[i];
|
|
63
65
|
}
|
|
64
|
-
const
|
|
66
|
+
const instantRms = Math.sqrt(sum / channel.length);
|
|
65
67
|
|
|
66
|
-
//
|
|
67
|
-
//
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
68
|
+
// Smooth the RMS energy to reduce jitter
|
|
69
|
+
// this.energy acts as the smoothed RMS value
|
|
70
|
+
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
71
|
+
|
|
72
|
+
// Adaptive noise floor estimation
|
|
73
|
+
// We use the instantaneous RMS for noise floor tracking to react quickly to silence
|
|
74
|
+
if (instantRms < this.noiseFloor) {
|
|
75
|
+
// If signal is quieter than noise floor, adapt downwards quickly
|
|
76
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
77
|
+
} else {
|
|
78
|
+
// If signal is louder, adapt upwards
|
|
79
|
+
// We use a multi-stage adaptation rate:
|
|
80
|
+
// 1. If we are officially speaking, adapt EXTREMELY slowly (0.01x)
|
|
81
|
+
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
82
|
+
// 3. Otherwise, adapt at the normal loud rate
|
|
83
|
+
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
84
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
85
|
+
|
|
86
|
+
let multiplier = 1.0;
|
|
87
|
+
if (this.isSpeaking) {
|
|
88
|
+
multiplier = 0.01;
|
|
89
|
+
} else if (snrDb > 20) {
|
|
90
|
+
multiplier = 0.1;
|
|
83
91
|
}
|
|
92
|
+
|
|
93
|
+
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
94
|
+
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
84
95
|
}
|
|
85
|
-
// During speech, freeze the noise floor to maintain consistent detection
|
|
86
96
|
|
|
87
|
-
//
|
|
88
|
-
|
|
97
|
+
// Ensure noise floor doesn't drop to absolute zero
|
|
98
|
+
this.noiseFloor = Math.max(this.noiseFloor, 1e-5);
|
|
99
|
+
|
|
100
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
101
|
+
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
102
|
+
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
89
103
|
|
|
90
|
-
// Map SNR to probability (0-1)
|
|
91
|
-
// Probability is 0 when
|
|
104
|
+
// Map SNR dB to probability (0-1)
|
|
105
|
+
// Probability is 0 when snrDb <= minSNR
|
|
92
106
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
93
|
-
|
|
94
|
-
|
|
107
|
+
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
108
|
+
|
|
109
|
+
// Apply absolute energy threshold
|
|
110
|
+
if (this.energy < this.minEnergy) {
|
|
111
|
+
probability = 0;
|
|
112
|
+
}
|
|
95
113
|
|
|
96
|
-
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
114
|
+
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
97
115
|
|
|
98
116
|
return true;
|
|
99
117
|
}
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.js
CHANGED
|
@@ -36,11 +36,11 @@ var VADStateMachine = class {
|
|
|
36
36
|
enabled: config?.enabled ?? true,
|
|
37
37
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
38
38
|
// Voice-optimized defaults
|
|
39
|
-
startThreshold: config?.startThreshold ?? 0.
|
|
39
|
+
startThreshold: config?.startThreshold ?? 0.8,
|
|
40
40
|
// Higher threshold to avoid noise
|
|
41
|
-
stopThreshold: config?.stopThreshold ?? 0.
|
|
41
|
+
stopThreshold: config?.stopThreshold ?? 0.3,
|
|
42
42
|
// Balanced for voice
|
|
43
|
-
hangoverMs: config?.hangoverMs ??
|
|
43
|
+
hangoverMs: config?.hangoverMs ?? 300,
|
|
44
44
|
// Smooth for natural speech
|
|
45
45
|
preRollMs: config?.preRollMs ?? 250,
|
|
46
46
|
// Generous pre-roll
|
|
@@ -49,10 +49,11 @@ var VADStateMachine = class {
|
|
|
49
49
|
energyVad: {
|
|
50
50
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
51
51
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
52
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.
|
|
53
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ??
|
|
54
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
55
|
-
snrRange: config?.energyVad?.snrRange ??
|
|
52
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.05,
|
|
53
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.01,
|
|
54
|
+
minSNR: config?.energyVad?.minSNR ?? 10,
|
|
55
|
+
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
56
|
+
minEnergy: config?.energyVad?.minEnergy ?? 5e-4
|
|
56
57
|
}
|
|
57
58
|
};
|
|
58
59
|
this.lastSilenceTime = Date.now();
|
package/dist/vad/vad-state.mjs
CHANGED