@tensamin/audio 0.1.12 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/dist/{chunk-KEWK2OKV.mjs → chunk-2G2JFHJY.mjs} +45 -30
- package/dist/{chunk-Q2I22TJG.mjs → chunk-6F2HZUYO.mjs} +1 -1
- package/dist/{chunk-DYY2MXMU.mjs → chunk-K4YLH73B.mjs} +6 -6
- package/dist/{chunk-SMZJFNRU.mjs → chunk-R5M2DGAQ.mjs} +2 -2
- package/dist/{chunk-XZSFQJW4.mjs → chunk-UFKIAMG3.mjs} +1 -1
- package/dist/extensibility/plugins.js +45 -30
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +51 -36
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +51 -36
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +51 -36
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +8 -7
- package/dist/types.d.ts +8 -7
- package/dist/vad/vad-node.js +45 -30
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.js +6 -6
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -101,11 +101,11 @@ vad: {
|
|
|
101
101
|
energyVad?: {
|
|
102
102
|
smoothing: number; // Default: 0.95
|
|
103
103
|
initialNoiseFloor: number; // Default: 0.001
|
|
104
|
-
noiseFloorAdaptRateQuiet: number; // Default: 0.
|
|
105
|
-
noiseFloorAdaptRateLoud: number; // Default: 0.
|
|
106
|
-
minSNR: number; // Default:
|
|
104
|
+
noiseFloorAdaptRateQuiet: number; // Default: 0.002
|
|
105
|
+
noiseFloorAdaptRateLoud: number; // Default: 0.02
|
|
106
|
+
minSNR: number; // Default: 12.0 (dB)
|
|
107
107
|
snrRange: number; // Default: 10.0 (dB)
|
|
108
|
-
minEnergy: number; // Default: 0.
|
|
108
|
+
minEnergy: number; // Default: 0.003
|
|
109
109
|
};
|
|
110
110
|
}
|
|
111
111
|
```
|
|
@@ -116,7 +116,7 @@ vad: {
|
|
|
116
116
|
- `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~15dB SNR with the default `minSNR` of 12 and `snrRange` of 10)
|
|
117
117
|
- `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
|
|
118
118
|
- `preRollMs`: Audio buffer duration before speech onset (Default: 250ms)
|
|
119
|
-
- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default:
|
|
119
|
+
- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
|
|
120
120
|
- `minSilenceDurationMs`: Minimum silence duration between speech segments (Default: 150ms)
|
|
121
121
|
|
|
122
122
|
**Energy VAD Parameters:**
|
|
@@ -124,7 +124,7 @@ vad: {
|
|
|
124
124
|
- `smoothing`: Energy calculation smoothing factor (0-1)
|
|
125
125
|
- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
|
|
126
126
|
- `snrRange`: Range in dB for probability scaling from minSNR
|
|
127
|
-
- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.
|
|
127
|
+
- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.003, ~-50dB)
|
|
128
128
|
|
|
129
129
|
### Output Control
|
|
130
130
|
|
|
@@ -3,11 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
3
3
|
const energyParams = vadConfig?.energyVad || {};
|
|
4
4
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
5
5
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
6
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
7
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
8
|
-
const minSNR = energyParams.minSNR ??
|
|
6
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
7
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
8
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
9
9
|
const snrRange = energyParams.snrRange ?? 10;
|
|
10
|
-
const minEnergy = energyParams.minEnergy ??
|
|
10
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
11
11
|
return `
|
|
12
12
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
13
13
|
constructor() {
|
|
@@ -36,8 +36,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
36
36
|
|
|
37
37
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
38
38
|
let sum = 0;
|
|
39
|
+
let peak = 0;
|
|
39
40
|
for (let i = 0; i < channel.length; i++) {
|
|
41
|
+
const sample = Math.abs(channel[i]);
|
|
40
42
|
sum += channel[i] * channel[i];
|
|
43
|
+
peak = Math.max(peak, sample);
|
|
41
44
|
}
|
|
42
45
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
43
46
|
|
|
@@ -45,36 +48,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
45
48
|
// this.energy acts as the smoothed RMS value
|
|
46
49
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
47
50
|
|
|
48
|
-
//
|
|
49
|
-
//
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
51
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
52
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
53
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
54
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
55
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
56
|
+
|
|
57
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
58
|
+
// This prevents sharp transients from affecting the noise floor
|
|
59
|
+
if (this.energy < this.noiseFloor) {
|
|
60
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
61
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
53
62
|
} else {
|
|
54
|
-
//
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
58
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
59
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
60
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
63
|
+
// Calculate SNR based on smoothed energy
|
|
64
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
65
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
61
66
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
+
// Only adapt upwards if:
|
|
68
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
69
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
70
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
71
|
+
// This is persistent background noise, adapt upwards
|
|
72
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
73
|
+
} else {
|
|
74
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
75
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
76
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
67
77
|
}
|
|
68
|
-
|
|
69
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
70
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
71
78
|
}
|
|
72
79
|
|
|
73
80
|
// Ensure noise floor doesn't drop to absolute zero
|
|
74
|
-
|
|
75
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
81
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
76
82
|
|
|
77
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
83
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
78
84
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
79
85
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
80
86
|
|
|
@@ -83,11 +89,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
83
89
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
84
90
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
85
91
|
|
|
86
|
-
// Apply absolute energy threshold
|
|
87
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
92
|
+
// Apply absolute energy threshold with soft knee
|
|
88
93
|
if (this.energy < this.minEnergy) {
|
|
89
94
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
90
|
-
probability *= Math.pow(energyRatio, 2);
|
|
95
|
+
probability *= Math.pow(energyRatio, 2);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Apply crest factor penalty
|
|
99
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
100
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
101
|
+
// We penalize anything above 14dB
|
|
102
|
+
if (crestFactorDb > 14) {
|
|
103
|
+
const excess = crestFactorDb - 14;
|
|
104
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
105
|
+
probability *= penalty;
|
|
91
106
|
}
|
|
92
107
|
|
|
93
108
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -20,17 +20,17 @@ var VADStateMachine = class {
|
|
|
20
20
|
// Smooth for natural speech
|
|
21
21
|
preRollMs: config?.preRollMs ?? 250,
|
|
22
22
|
// Generous pre-roll
|
|
23
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
24
|
-
//
|
|
23
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
24
|
+
// Aggressive transient rejection
|
|
25
25
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
26
26
|
energyVad: {
|
|
27
27
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
28
28
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
29
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
30
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
31
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
29
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
30
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
31
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
32
32
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
33
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
33
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
34
34
|
}
|
|
35
35
|
};
|
|
36
36
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
VADStateMachine
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-K4YLH73B.mjs";
|
|
4
4
|
import {
|
|
5
5
|
getAudioContext,
|
|
6
6
|
registerPipeline,
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-UFKIAMG3.mjs";
|
|
13
13
|
|
|
14
14
|
// src/pipeline/audio-pipeline.ts
|
|
15
15
|
import mitt from "mitt";
|
|
@@ -106,11 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
106
106
|
const energyParams = vadConfig?.energyVad || {};
|
|
107
107
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
108
108
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
109
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
110
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
111
|
-
const minSNR = energyParams.minSNR ??
|
|
109
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
110
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
111
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
112
112
|
const snrRange = energyParams.snrRange ?? 10;
|
|
113
|
-
const minEnergy = energyParams.minEnergy ??
|
|
113
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
114
114
|
return `
|
|
115
115
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
116
116
|
constructor() {
|
|
@@ -139,8 +139,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
139
139
|
|
|
140
140
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
141
141
|
let sum = 0;
|
|
142
|
+
let peak = 0;
|
|
142
143
|
for (let i = 0; i < channel.length; i++) {
|
|
144
|
+
const sample = Math.abs(channel[i]);
|
|
143
145
|
sum += channel[i] * channel[i];
|
|
146
|
+
peak = Math.max(peak, sample);
|
|
144
147
|
}
|
|
145
148
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
146
149
|
|
|
@@ -148,36 +151,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
148
151
|
// this.energy acts as the smoothed RMS value
|
|
149
152
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
150
153
|
|
|
151
|
-
//
|
|
152
|
-
//
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
154
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
155
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
156
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
157
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
158
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
159
|
+
|
|
160
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
161
|
+
// This prevents sharp transients from affecting the noise floor
|
|
162
|
+
if (this.energy < this.noiseFloor) {
|
|
163
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
164
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
156
165
|
} else {
|
|
157
|
-
//
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
161
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
162
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
163
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
166
|
+
// Calculate SNR based on smoothed energy
|
|
167
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
168
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
164
169
|
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
+
// Only adapt upwards if:
|
|
171
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
172
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
173
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
174
|
+
// This is persistent background noise, adapt upwards
|
|
175
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
176
|
+
} else {
|
|
177
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
178
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
179
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
170
180
|
}
|
|
171
|
-
|
|
172
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
173
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
174
181
|
}
|
|
175
182
|
|
|
176
183
|
// Ensure noise floor doesn't drop to absolute zero
|
|
177
|
-
|
|
178
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
184
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
179
185
|
|
|
180
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
186
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
181
187
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
182
188
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
183
189
|
|
|
@@ -186,11 +192,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
186
192
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
187
193
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
188
194
|
|
|
189
|
-
// Apply absolute energy threshold
|
|
190
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
195
|
+
// Apply absolute energy threshold with soft knee
|
|
191
196
|
if (this.energy < this.minEnergy) {
|
|
192
197
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
193
|
-
probability *= Math.pow(energyRatio, 2);
|
|
198
|
+
probability *= Math.pow(energyRatio, 2);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Apply crest factor penalty
|
|
202
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
203
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
204
|
+
// We penalize anything above 14dB
|
|
205
|
+
if (crestFactorDb > 14) {
|
|
206
|
+
const excess = crestFactorDb - 14;
|
|
207
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
208
|
+
probability *= penalty;
|
|
194
209
|
}
|
|
195
210
|
|
|
196
211
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
getVADPlugin,
|
|
4
4
|
registerNoiseSuppressionPlugin,
|
|
5
5
|
registerVADPlugin
|
|
6
|
-
} from "../chunk-
|
|
6
|
+
} from "../chunk-UFKIAMG3.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-2G2JFHJY.mjs";
|
|
9
9
|
export {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin,
|
package/dist/index.js
CHANGED
|
@@ -158,11 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
158
158
|
const energyParams = vadConfig?.energyVad || {};
|
|
159
159
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
160
160
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
161
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
162
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
163
|
-
const minSNR = energyParams.minSNR ??
|
|
161
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
162
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
163
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
164
164
|
const snrRange = energyParams.snrRange ?? 10;
|
|
165
|
-
const minEnergy = energyParams.minEnergy ??
|
|
165
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
166
166
|
return `
|
|
167
167
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
168
168
|
constructor() {
|
|
@@ -191,8 +191,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
191
191
|
|
|
192
192
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
193
193
|
let sum = 0;
|
|
194
|
+
let peak = 0;
|
|
194
195
|
for (let i = 0; i < channel.length; i++) {
|
|
196
|
+
const sample = Math.abs(channel[i]);
|
|
195
197
|
sum += channel[i] * channel[i];
|
|
198
|
+
peak = Math.max(peak, sample);
|
|
196
199
|
}
|
|
197
200
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
198
201
|
|
|
@@ -200,36 +203,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
200
203
|
// this.energy acts as the smoothed RMS value
|
|
201
204
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
202
205
|
|
|
203
|
-
//
|
|
204
|
-
//
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
206
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
207
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
208
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
209
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
210
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
211
|
+
|
|
212
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
213
|
+
// This prevents sharp transients from affecting the noise floor
|
|
214
|
+
if (this.energy < this.noiseFloor) {
|
|
215
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
216
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
208
217
|
} else {
|
|
209
|
-
//
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
213
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
214
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
215
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
218
|
+
// Calculate SNR based on smoothed energy
|
|
219
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
220
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
216
221
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
+
// Only adapt upwards if:
|
|
223
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
224
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
225
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
226
|
+
// This is persistent background noise, adapt upwards
|
|
227
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
228
|
+
} else {
|
|
229
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
230
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
231
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
222
232
|
}
|
|
223
|
-
|
|
224
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
225
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
226
233
|
}
|
|
227
234
|
|
|
228
235
|
// Ensure noise floor doesn't drop to absolute zero
|
|
229
|
-
|
|
230
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
236
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
231
237
|
|
|
232
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
238
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
233
239
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
234
240
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
235
241
|
|
|
@@ -238,11 +244,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
238
244
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
239
245
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
240
246
|
|
|
241
|
-
// Apply absolute energy threshold
|
|
242
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
247
|
+
// Apply absolute energy threshold with soft knee
|
|
243
248
|
if (this.energy < this.minEnergy) {
|
|
244
249
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
245
|
-
probability *= Math.pow(energyRatio, 2);
|
|
250
|
+
probability *= Math.pow(energyRatio, 2);
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Apply crest factor penalty
|
|
254
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
255
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
256
|
+
// We penalize anything above 14dB
|
|
257
|
+
if (crestFactorDb > 14) {
|
|
258
|
+
const excess = crestFactorDb - 14;
|
|
259
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
260
|
+
probability *= penalty;
|
|
246
261
|
}
|
|
247
262
|
|
|
248
263
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -371,17 +386,17 @@ var VADStateMachine = class {
|
|
|
371
386
|
// Smooth for natural speech
|
|
372
387
|
preRollMs: config?.preRollMs ?? 250,
|
|
373
388
|
// Generous pre-roll
|
|
374
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
375
|
-
//
|
|
389
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
390
|
+
// Aggressive transient rejection
|
|
376
391
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
377
392
|
energyVad: {
|
|
378
393
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
379
394
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
380
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
381
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
382
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
395
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
396
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
397
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
383
398
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
384
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
399
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
385
400
|
}
|
|
386
401
|
};
|
|
387
402
|
this.lastSilenceTime = Date.now();
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import "./chunk-WBQAMGXK.mjs";
|
|
2
2
|
import {
|
|
3
3
|
attachProcessingToTrack
|
|
4
|
-
} from "./chunk-
|
|
4
|
+
} from "./chunk-6F2HZUYO.mjs";
|
|
5
5
|
import {
|
|
6
6
|
createAudioPipeline
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-R5M2DGAQ.mjs";
|
|
8
8
|
import {
|
|
9
9
|
VADStateMachine
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-K4YLH73B.mjs";
|
|
11
11
|
import {
|
|
12
12
|
closeAudioContext,
|
|
13
13
|
getAudioContext,
|
|
@@ -21,13 +21,13 @@ import {
|
|
|
21
21
|
getVADPlugin,
|
|
22
22
|
registerNoiseSuppressionPlugin,
|
|
23
23
|
registerVADPlugin
|
|
24
|
-
} from "./chunk-
|
|
24
|
+
} from "./chunk-UFKIAMG3.mjs";
|
|
25
25
|
import {
|
|
26
26
|
RNNoisePlugin
|
|
27
27
|
} from "./chunk-XO6B3D4A.mjs";
|
|
28
28
|
import {
|
|
29
29
|
EnergyVADPlugin
|
|
30
|
-
} from "./chunk-
|
|
30
|
+
} from "./chunk-2G2JFHJY.mjs";
|
|
31
31
|
export {
|
|
32
32
|
EnergyVADPlugin,
|
|
33
33
|
RNNoisePlugin,
|
|
@@ -127,11 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
127
127
|
const energyParams = vadConfig?.energyVad || {};
|
|
128
128
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
129
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
131
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
132
|
-
const minSNR = energyParams.minSNR ??
|
|
130
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
131
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
132
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
133
133
|
const snrRange = energyParams.snrRange ?? 10;
|
|
134
|
-
const minEnergy = energyParams.minEnergy ??
|
|
134
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
135
135
|
return `
|
|
136
136
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
137
137
|
constructor() {
|
|
@@ -160,8 +160,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
160
160
|
|
|
161
161
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
162
162
|
let sum = 0;
|
|
163
|
+
let peak = 0;
|
|
163
164
|
for (let i = 0; i < channel.length; i++) {
|
|
165
|
+
const sample = Math.abs(channel[i]);
|
|
164
166
|
sum += channel[i] * channel[i];
|
|
167
|
+
peak = Math.max(peak, sample);
|
|
165
168
|
}
|
|
166
169
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
167
170
|
|
|
@@ -169,36 +172,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
169
172
|
// this.energy acts as the smoothed RMS value
|
|
170
173
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
171
174
|
|
|
172
|
-
//
|
|
173
|
-
//
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
175
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
176
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
177
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
178
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
179
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
180
|
+
|
|
181
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
182
|
+
// This prevents sharp transients from affecting the noise floor
|
|
183
|
+
if (this.energy < this.noiseFloor) {
|
|
184
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
185
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
177
186
|
} else {
|
|
178
|
-
//
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
182
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
183
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
184
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
187
|
+
// Calculate SNR based on smoothed energy
|
|
188
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
189
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
185
190
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
+
// Only adapt upwards if:
|
|
192
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
193
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
194
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
195
|
+
// This is persistent background noise, adapt upwards
|
|
196
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
197
|
+
} else {
|
|
198
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
199
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
200
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
191
201
|
}
|
|
192
|
-
|
|
193
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
194
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
195
202
|
}
|
|
196
203
|
|
|
197
204
|
// Ensure noise floor doesn't drop to absolute zero
|
|
198
|
-
|
|
199
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
205
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
200
206
|
|
|
201
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
207
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
202
208
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
203
209
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
204
210
|
|
|
@@ -207,11 +213,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
207
213
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
208
214
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
209
215
|
|
|
210
|
-
// Apply absolute energy threshold
|
|
211
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
216
|
+
// Apply absolute energy threshold with soft knee
|
|
212
217
|
if (this.energy < this.minEnergy) {
|
|
213
218
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
214
|
-
probability *= Math.pow(energyRatio, 2);
|
|
219
|
+
probability *= Math.pow(energyRatio, 2);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// Apply crest factor penalty
|
|
223
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
224
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
225
|
+
// We penalize anything above 14dB
|
|
226
|
+
if (crestFactorDb > 14) {
|
|
227
|
+
const excess = crestFactorDb - 14;
|
|
228
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
229
|
+
probability *= penalty;
|
|
215
230
|
}
|
|
216
231
|
|
|
217
232
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -334,17 +349,17 @@ var VADStateMachine = class {
|
|
|
334
349
|
// Smooth for natural speech
|
|
335
350
|
preRollMs: config?.preRollMs ?? 250,
|
|
336
351
|
// Generous pre-roll
|
|
337
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
338
|
-
//
|
|
352
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
353
|
+
// Aggressive transient rejection
|
|
339
354
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
340
355
|
energyVad: {
|
|
341
356
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
342
357
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
343
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
344
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
345
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
358
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
359
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
360
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
346
361
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
347
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
362
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
348
363
|
}
|
|
349
364
|
};
|
|
350
365
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import {
|
|
2
2
|
attachProcessingToTrack
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
5
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-6F2HZUYO.mjs";
|
|
4
|
+
import "../chunk-R5M2DGAQ.mjs";
|
|
5
|
+
import "../chunk-K4YLH73B.mjs";
|
|
6
6
|
import "../chunk-OZ7KMC4S.mjs";
|
|
7
|
-
import "../chunk-
|
|
7
|
+
import "../chunk-UFKIAMG3.mjs";
|
|
8
8
|
import "../chunk-XO6B3D4A.mjs";
|
|
9
|
-
import "../chunk-
|
|
9
|
+
import "../chunk-2G2JFHJY.mjs";
|
|
10
10
|
export {
|
|
11
11
|
attachProcessingToTrack
|
|
12
12
|
};
|
|
@@ -125,11 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
125
125
|
const energyParams = vadConfig?.energyVad || {};
|
|
126
126
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
127
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
129
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
130
|
-
const minSNR = energyParams.minSNR ??
|
|
128
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
129
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
130
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
131
131
|
const snrRange = energyParams.snrRange ?? 10;
|
|
132
|
-
const minEnergy = energyParams.minEnergy ??
|
|
132
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
133
133
|
return `
|
|
134
134
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
135
135
|
constructor() {
|
|
@@ -158,8 +158,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
158
158
|
|
|
159
159
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
160
160
|
let sum = 0;
|
|
161
|
+
let peak = 0;
|
|
161
162
|
for (let i = 0; i < channel.length; i++) {
|
|
163
|
+
const sample = Math.abs(channel[i]);
|
|
162
164
|
sum += channel[i] * channel[i];
|
|
165
|
+
peak = Math.max(peak, sample);
|
|
163
166
|
}
|
|
164
167
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
165
168
|
|
|
@@ -167,36 +170,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
167
170
|
// this.energy acts as the smoothed RMS value
|
|
168
171
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
169
172
|
|
|
170
|
-
//
|
|
171
|
-
//
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
173
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
174
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
175
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
176
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
177
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
178
|
+
|
|
179
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
180
|
+
// This prevents sharp transients from affecting the noise floor
|
|
181
|
+
if (this.energy < this.noiseFloor) {
|
|
182
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
183
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
175
184
|
} else {
|
|
176
|
-
//
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
180
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
181
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
182
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
185
|
+
// Calculate SNR based on smoothed energy
|
|
186
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
187
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
183
188
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
+
// Only adapt upwards if:
|
|
190
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
191
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
192
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
193
|
+
// This is persistent background noise, adapt upwards
|
|
194
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
195
|
+
} else {
|
|
196
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
197
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
198
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
189
199
|
}
|
|
190
|
-
|
|
191
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
192
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
193
200
|
}
|
|
194
201
|
|
|
195
202
|
// Ensure noise floor doesn't drop to absolute zero
|
|
196
|
-
|
|
197
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
203
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
198
204
|
|
|
199
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
205
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
200
206
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
201
207
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
202
208
|
|
|
@@ -205,11 +211,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
205
211
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
206
212
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
207
213
|
|
|
208
|
-
// Apply absolute energy threshold
|
|
209
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
214
|
+
// Apply absolute energy threshold with soft knee
|
|
210
215
|
if (this.energy < this.minEnergy) {
|
|
211
216
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
212
|
-
probability *= Math.pow(energyRatio, 2);
|
|
217
|
+
probability *= Math.pow(energyRatio, 2);
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Apply crest factor penalty
|
|
221
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
222
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
223
|
+
// We penalize anything above 14dB
|
|
224
|
+
if (crestFactorDb > 14) {
|
|
225
|
+
const excess = crestFactorDb - 14;
|
|
226
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
227
|
+
probability *= penalty;
|
|
213
228
|
}
|
|
214
229
|
|
|
215
230
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -332,17 +347,17 @@ var VADStateMachine = class {
|
|
|
332
347
|
// Smooth for natural speech
|
|
333
348
|
preRollMs: config?.preRollMs ?? 250,
|
|
334
349
|
// Generous pre-roll
|
|
335
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
336
|
-
//
|
|
350
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
351
|
+
// Aggressive transient rejection
|
|
337
352
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
338
353
|
energyVad: {
|
|
339
354
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
340
355
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
341
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
342
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
343
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
356
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
357
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
358
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
344
359
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
345
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
360
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
346
361
|
}
|
|
347
362
|
};
|
|
348
363
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-R5M2DGAQ.mjs";
|
|
4
|
+
import "../chunk-K4YLH73B.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-
|
|
6
|
+
import "../chunk-UFKIAMG3.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-2G2JFHJY.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
|
|
|
70
70
|
preRollMs?: number;
|
|
71
71
|
/**
|
|
72
72
|
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
-
* Filters out
|
|
74
|
-
* Default:
|
|
73
|
+
* Filters out brief transients like keyboard clicks.
|
|
74
|
+
* Default: 250ms (aggressive transient rejection)
|
|
75
75
|
*/
|
|
76
76
|
minSpeechDurationMs?: number;
|
|
77
77
|
/**
|
|
@@ -97,17 +97,18 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.002 (very slow downward drift)
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
*
|
|
105
|
+
* Applied to low-energy, low-crest-factor signals (background noise).
|
|
106
|
+
* Default: 0.02
|
|
106
107
|
*/
|
|
107
108
|
noiseFloorAdaptRateLoud?: number;
|
|
108
109
|
/**
|
|
109
110
|
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
110
|
-
* Default:
|
|
111
|
+
* Default: 12.0 (aggressive noise rejection)
|
|
111
112
|
*/
|
|
112
113
|
minSNR?: number;
|
|
113
114
|
/**
|
|
@@ -117,8 +118,8 @@ interface AudioProcessingConfig {
|
|
|
117
118
|
snrRange?: number;
|
|
118
119
|
/**
|
|
119
120
|
* Minimum absolute RMS energy to consider as speech.
|
|
120
|
-
* Prevents triggering on very quiet background noise
|
|
121
|
-
* Default: 0.
|
|
121
|
+
* Prevents triggering on very quiet background noise.
|
|
122
|
+
* Default: 0.003 (approx -50dB, voice-appropriate level)
|
|
122
123
|
*/
|
|
123
124
|
minEnergy?: number;
|
|
124
125
|
};
|
package/dist/types.d.ts
CHANGED
|
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
|
|
|
70
70
|
preRollMs?: number;
|
|
71
71
|
/**
|
|
72
72
|
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
-
* Filters out
|
|
74
|
-
* Default:
|
|
73
|
+
* Filters out brief transients like keyboard clicks.
|
|
74
|
+
* Default: 250ms (aggressive transient rejection)
|
|
75
75
|
*/
|
|
76
76
|
minSpeechDurationMs?: number;
|
|
77
77
|
/**
|
|
@@ -97,17 +97,18 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.002 (very slow downward drift)
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
*
|
|
105
|
+
* Applied to low-energy, low-crest-factor signals (background noise).
|
|
106
|
+
* Default: 0.02
|
|
106
107
|
*/
|
|
107
108
|
noiseFloorAdaptRateLoud?: number;
|
|
108
109
|
/**
|
|
109
110
|
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
110
|
-
* Default:
|
|
111
|
+
* Default: 12.0 (aggressive noise rejection)
|
|
111
112
|
*/
|
|
112
113
|
minSNR?: number;
|
|
113
114
|
/**
|
|
@@ -117,8 +118,8 @@ interface AudioProcessingConfig {
|
|
|
117
118
|
snrRange?: number;
|
|
118
119
|
/**
|
|
119
120
|
* Minimum absolute RMS energy to consider as speech.
|
|
120
|
-
* Prevents triggering on very quiet background noise
|
|
121
|
-
* Default: 0.
|
|
121
|
+
* Prevents triggering on very quiet background noise.
|
|
122
|
+
* Default: 0.003 (approx -50dB, voice-appropriate level)
|
|
122
123
|
*/
|
|
123
124
|
minEnergy?: number;
|
|
124
125
|
};
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -27,11 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
27
27
|
const energyParams = vadConfig?.energyVad || {};
|
|
28
28
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
29
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
31
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
32
|
-
const minSNR = energyParams.minSNR ??
|
|
30
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
31
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
32
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
33
33
|
const snrRange = energyParams.snrRange ?? 10;
|
|
34
|
-
const minEnergy = energyParams.minEnergy ??
|
|
34
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
35
35
|
return `
|
|
36
36
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
37
37
|
constructor() {
|
|
@@ -60,8 +60,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
60
60
|
|
|
61
61
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
62
62
|
let sum = 0;
|
|
63
|
+
let peak = 0;
|
|
63
64
|
for (let i = 0; i < channel.length; i++) {
|
|
65
|
+
const sample = Math.abs(channel[i]);
|
|
64
66
|
sum += channel[i] * channel[i];
|
|
67
|
+
peak = Math.max(peak, sample);
|
|
65
68
|
}
|
|
66
69
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
67
70
|
|
|
@@ -69,36 +72,39 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
69
72
|
// this.energy acts as the smoothed RMS value
|
|
70
73
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
71
74
|
|
|
72
|
-
//
|
|
73
|
-
//
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
75
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
76
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
77
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
78
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
79
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
80
|
+
|
|
81
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
82
|
+
// This prevents sharp transients from affecting the noise floor
|
|
83
|
+
if (this.energy < this.noiseFloor) {
|
|
84
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
85
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
77
86
|
} else {
|
|
78
|
-
//
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
// 2. If SNR is very high (> 20dB), assume it's speech and adapt very slowly (0.1x)
|
|
82
|
-
// 3. Otherwise, adapt at the normal loud rate
|
|
83
|
-
const snr = instantRms / (this.noiseFloor + 1e-6);
|
|
84
|
-
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
87
|
+
// Calculate SNR based on smoothed energy
|
|
88
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
89
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
85
90
|
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
+
// Only adapt upwards if:
|
|
92
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
93
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
94
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
95
|
+
// This is persistent background noise, adapt upwards
|
|
96
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
97
|
+
} else {
|
|
98
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
99
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
100
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
91
101
|
}
|
|
92
|
-
|
|
93
|
-
const adaptRate = this.noiseFloorAdaptRateLoud * multiplier;
|
|
94
|
-
this.noiseFloor = this.noiseFloor * (1 - adaptRate) + instantRms * adaptRate;
|
|
95
102
|
}
|
|
96
103
|
|
|
97
104
|
// Ensure noise floor doesn't drop to absolute zero
|
|
98
|
-
|
|
99
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
105
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
100
106
|
|
|
101
|
-
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
107
|
+
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
102
108
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
103
109
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
104
110
|
|
|
@@ -107,11 +113,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
107
113
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
108
114
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
109
115
|
|
|
110
|
-
// Apply absolute energy threshold
|
|
111
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
116
|
+
// Apply absolute energy threshold with soft knee
|
|
112
117
|
if (this.energy < this.minEnergy) {
|
|
113
118
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
114
|
-
probability *= Math.pow(energyRatio, 2);
|
|
119
|
+
probability *= Math.pow(energyRatio, 2);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Apply crest factor penalty
|
|
123
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
124
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
125
|
+
// We penalize anything above 14dB
|
|
126
|
+
if (crestFactorDb > 14) {
|
|
127
|
+
const excess = crestFactorDb - 14;
|
|
128
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
129
|
+
probability *= penalty;
|
|
115
130
|
}
|
|
116
131
|
|
|
117
132
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.js
CHANGED
|
@@ -44,17 +44,17 @@ var VADStateMachine = class {
|
|
|
44
44
|
// Smooth for natural speech
|
|
45
45
|
preRollMs: config?.preRollMs ?? 250,
|
|
46
46
|
// Generous pre-roll
|
|
47
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
48
|
-
//
|
|
47
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
48
|
+
// Aggressive transient rejection
|
|
49
49
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
50
50
|
energyVad: {
|
|
51
51
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
52
52
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
53
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
54
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
55
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
53
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
54
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
55
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
56
56
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
57
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
57
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
58
58
|
}
|
|
59
59
|
};
|
|
60
60
|
this.lastSilenceTime = Date.now();
|
package/dist/vad/vad-state.mjs
CHANGED