@tensamin/audio 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/dist/{chunk-FKR6NWZF.mjs → chunk-2G2JFHJY.mjs} +46 -27
- package/dist/{chunk-K6X52R7N.mjs → chunk-6F2HZUYO.mjs} +1 -1
- package/dist/{chunk-DLLK6K76.mjs → chunk-K4YLH73B.mjs} +6 -6
- package/dist/{chunk-RD4GDIPO.mjs → chunk-R5M2DGAQ.mjs} +2 -2
- package/dist/{chunk-OXV7BHX5.mjs → chunk-UFKIAMG3.mjs} +1 -1
- package/dist/extensibility/plugins.js +46 -27
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +52 -33
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +52 -33
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +52 -33
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +8 -8
- package/dist/types.d.ts +8 -8
- package/dist/vad/vad-node.js +46 -27
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.js +6 -6
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -101,11 +101,11 @@ vad: {
|
|
|
101
101
|
energyVad?: {
|
|
102
102
|
smoothing: number; // Default: 0.95
|
|
103
103
|
initialNoiseFloor: number; // Default: 0.001
|
|
104
|
-
noiseFloorAdaptRateQuiet: number; // Default: 0.
|
|
105
|
-
noiseFloorAdaptRateLoud: number; // Default: 0.
|
|
106
|
-
minSNR: number; // Default:
|
|
104
|
+
noiseFloorAdaptRateQuiet: number; // Default: 0.002
|
|
105
|
+
noiseFloorAdaptRateLoud: number; // Default: 0.02
|
|
106
|
+
minSNR: number; // Default: 12.0 (dB)
|
|
107
107
|
snrRange: number; // Default: 10.0 (dB)
|
|
108
|
-
minEnergy: number; // Default: 0.
|
|
108
|
+
minEnergy: number; // Default: 0.003
|
|
109
109
|
};
|
|
110
110
|
}
|
|
111
111
|
```
|
|
@@ -116,7 +116,7 @@ vad: {
|
|
|
116
116
|
- `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~13dB SNR)
|
|
117
117
|
- `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
|
|
118
118
|
- `preRollMs`: Audio buffer duration before speech onset
|
|
119
|
-
- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default:
|
|
119
|
+
- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
|
|
120
120
|
- `minSilenceDurationMs`: Minimum silence duration between speech segments
|
|
121
121
|
|
|
122
122
|
**Energy VAD Parameters:**
|
|
@@ -124,7 +124,7 @@ vad: {
|
|
|
124
124
|
- `smoothing`: Energy calculation smoothing factor (0-1)
|
|
125
125
|
- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
|
|
126
126
|
- `snrRange`: Range in dB for probability scaling from minSNR
|
|
127
|
-
- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.
|
|
127
|
+
- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.003, ~-50dB)
|
|
128
128
|
|
|
129
129
|
### Output Control
|
|
130
130
|
|
|
@@ -3,11 +3,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
3
3
|
const energyParams = vadConfig?.energyVad || {};
|
|
4
4
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
5
5
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
6
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
7
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
8
|
-
const minSNR = energyParams.minSNR ??
|
|
6
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
7
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
8
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
9
9
|
const snrRange = energyParams.snrRange ?? 10;
|
|
10
|
-
const minEnergy = energyParams.minEnergy ??
|
|
10
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
11
11
|
return `
|
|
12
12
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
13
13
|
constructor() {
|
|
@@ -36,8 +36,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
36
36
|
|
|
37
37
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
38
38
|
let sum = 0;
|
|
39
|
+
let peak = 0;
|
|
39
40
|
for (let i = 0; i < channel.length; i++) {
|
|
41
|
+
const sample = Math.abs(channel[i]);
|
|
40
42
|
sum += channel[i] * channel[i];
|
|
43
|
+
peak = Math.max(peak, sample);
|
|
41
44
|
}
|
|
42
45
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
43
46
|
|
|
@@ -45,30 +48,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
45
48
|
// this.energy acts as the smoothed RMS value
|
|
46
49
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
47
50
|
|
|
48
|
-
//
|
|
49
|
-
//
|
|
50
|
-
//
|
|
51
|
-
const
|
|
52
|
-
const
|
|
53
|
-
|
|
54
|
-
//
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
60
|
-
// Adapt upwards at normal rate to track rising noise
|
|
61
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
51
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
52
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
53
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
54
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
55
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
56
|
+
|
|
57
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
58
|
+
// This prevents sharp transients from affecting the noise floor
|
|
59
|
+
if (this.energy < this.noiseFloor) {
|
|
60
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
61
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
62
62
|
} else {
|
|
63
|
-
//
|
|
64
|
-
|
|
65
|
-
const
|
|
66
|
-
|
|
63
|
+
// Calculate SNR based on smoothed energy
|
|
64
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
65
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
66
|
+
|
|
67
|
+
// Only adapt upwards if:
|
|
68
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
69
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
70
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
71
|
+
// This is persistent background noise, adapt upwards
|
|
72
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
73
|
+
} else {
|
|
74
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
75
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
76
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
77
|
+
}
|
|
67
78
|
}
|
|
68
79
|
|
|
69
80
|
// Ensure noise floor doesn't drop to absolute zero
|
|
70
|
-
|
|
71
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
81
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
72
82
|
|
|
73
83
|
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
74
84
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
@@ -79,11 +89,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
79
89
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
80
90
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
81
91
|
|
|
82
|
-
// Apply absolute energy threshold
|
|
83
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
92
|
+
// Apply absolute energy threshold with soft knee
|
|
84
93
|
if (this.energy < this.minEnergy) {
|
|
85
94
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
86
|
-
probability *= Math.pow(energyRatio, 2);
|
|
95
|
+
probability *= Math.pow(energyRatio, 2);
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// Apply crest factor penalty
|
|
99
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
100
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
101
|
+
// We penalize anything above 14dB
|
|
102
|
+
if (crestFactorDb > 14) {
|
|
103
|
+
const excess = crestFactorDb - 14;
|
|
104
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
105
|
+
probability *= penalty;
|
|
87
106
|
}
|
|
88
107
|
|
|
89
108
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -20,17 +20,17 @@ var VADStateMachine = class {
|
|
|
20
20
|
// Smooth for natural speech
|
|
21
21
|
preRollMs: config?.preRollMs ?? 250,
|
|
22
22
|
// Generous pre-roll
|
|
23
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
24
|
-
//
|
|
23
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
24
|
+
// Aggressive transient rejection
|
|
25
25
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
26
26
|
energyVad: {
|
|
27
27
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
28
28
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
29
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
30
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
31
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
29
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
30
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
31
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
32
32
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
33
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
33
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
34
34
|
}
|
|
35
35
|
};
|
|
36
36
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
VADStateMachine
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-K4YLH73B.mjs";
|
|
4
4
|
import {
|
|
5
5
|
getAudioContext,
|
|
6
6
|
registerPipeline,
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-UFKIAMG3.mjs";
|
|
13
13
|
|
|
14
14
|
// src/pipeline/audio-pipeline.ts
|
|
15
15
|
import mitt from "mitt";
|
|
@@ -106,11 +106,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
106
106
|
const energyParams = vadConfig?.energyVad || {};
|
|
107
107
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
108
108
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
109
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
110
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
111
|
-
const minSNR = energyParams.minSNR ??
|
|
109
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
110
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
111
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
112
112
|
const snrRange = energyParams.snrRange ?? 10;
|
|
113
|
-
const minEnergy = energyParams.minEnergy ??
|
|
113
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
114
114
|
return `
|
|
115
115
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
116
116
|
constructor() {
|
|
@@ -139,8 +139,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
139
139
|
|
|
140
140
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
141
141
|
let sum = 0;
|
|
142
|
+
let peak = 0;
|
|
142
143
|
for (let i = 0; i < channel.length; i++) {
|
|
144
|
+
const sample = Math.abs(channel[i]);
|
|
143
145
|
sum += channel[i] * channel[i];
|
|
146
|
+
peak = Math.max(peak, sample);
|
|
144
147
|
}
|
|
145
148
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
146
149
|
|
|
@@ -148,30 +151,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
148
151
|
// this.energy acts as the smoothed RMS value
|
|
149
152
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
150
153
|
|
|
151
|
-
//
|
|
152
|
-
//
|
|
153
|
-
//
|
|
154
|
-
const
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
//
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
163
|
-
// Adapt upwards at normal rate to track rising noise
|
|
164
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
154
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
155
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
156
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
157
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
158
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
159
|
+
|
|
160
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
161
|
+
// This prevents sharp transients from affecting the noise floor
|
|
162
|
+
if (this.energy < this.noiseFloor) {
|
|
163
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
164
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
165
165
|
} else {
|
|
166
|
-
//
|
|
167
|
-
|
|
168
|
-
const
|
|
169
|
-
|
|
166
|
+
// Calculate SNR based on smoothed energy
|
|
167
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
168
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
169
|
+
|
|
170
|
+
// Only adapt upwards if:
|
|
171
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
172
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
173
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
174
|
+
// This is persistent background noise, adapt upwards
|
|
175
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
176
|
+
} else {
|
|
177
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
178
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
179
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
180
|
+
}
|
|
170
181
|
}
|
|
171
182
|
|
|
172
183
|
// Ensure noise floor doesn't drop to absolute zero
|
|
173
|
-
|
|
174
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
184
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
175
185
|
|
|
176
186
|
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
177
187
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
@@ -182,11 +192,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
182
192
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
183
193
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
184
194
|
|
|
185
|
-
// Apply absolute energy threshold
|
|
186
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
195
|
+
// Apply absolute energy threshold with soft knee
|
|
187
196
|
if (this.energy < this.minEnergy) {
|
|
188
197
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
189
|
-
probability *= Math.pow(energyRatio, 2);
|
|
198
|
+
probability *= Math.pow(energyRatio, 2);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
// Apply crest factor penalty
|
|
202
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
203
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
204
|
+
// We penalize anything above 14dB
|
|
205
|
+
if (crestFactorDb > 14) {
|
|
206
|
+
const excess = crestFactorDb - 14;
|
|
207
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
208
|
+
probability *= penalty;
|
|
190
209
|
}
|
|
191
210
|
|
|
192
211
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
getVADPlugin,
|
|
4
4
|
registerNoiseSuppressionPlugin,
|
|
5
5
|
registerVADPlugin
|
|
6
|
-
} from "../chunk-
|
|
6
|
+
} from "../chunk-UFKIAMG3.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-2G2JFHJY.mjs";
|
|
9
9
|
export {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin,
|
package/dist/index.js
CHANGED
|
@@ -158,11 +158,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
158
158
|
const energyParams = vadConfig?.energyVad || {};
|
|
159
159
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
160
160
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
161
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
162
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
163
|
-
const minSNR = energyParams.minSNR ??
|
|
161
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
162
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
163
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
164
164
|
const snrRange = energyParams.snrRange ?? 10;
|
|
165
|
-
const minEnergy = energyParams.minEnergy ??
|
|
165
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
166
166
|
return `
|
|
167
167
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
168
168
|
constructor() {
|
|
@@ -191,8 +191,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
191
191
|
|
|
192
192
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
193
193
|
let sum = 0;
|
|
194
|
+
let peak = 0;
|
|
194
195
|
for (let i = 0; i < channel.length; i++) {
|
|
196
|
+
const sample = Math.abs(channel[i]);
|
|
195
197
|
sum += channel[i] * channel[i];
|
|
198
|
+
peak = Math.max(peak, sample);
|
|
196
199
|
}
|
|
197
200
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
198
201
|
|
|
@@ -200,30 +203,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
200
203
|
// this.energy acts as the smoothed RMS value
|
|
201
204
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
202
205
|
|
|
203
|
-
//
|
|
204
|
-
//
|
|
205
|
-
//
|
|
206
|
-
const
|
|
207
|
-
const
|
|
208
|
-
|
|
209
|
-
//
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
215
|
-
// Adapt upwards at normal rate to track rising noise
|
|
216
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
206
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
207
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
208
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
209
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
210
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
211
|
+
|
|
212
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
213
|
+
// This prevents sharp transients from affecting the noise floor
|
|
214
|
+
if (this.energy < this.noiseFloor) {
|
|
215
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
216
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
217
217
|
} else {
|
|
218
|
-
//
|
|
219
|
-
|
|
220
|
-
const
|
|
221
|
-
|
|
218
|
+
// Calculate SNR based on smoothed energy
|
|
219
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
220
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
221
|
+
|
|
222
|
+
// Only adapt upwards if:
|
|
223
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
224
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
225
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
226
|
+
// This is persistent background noise, adapt upwards
|
|
227
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
228
|
+
} else {
|
|
229
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
230
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
231
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
232
|
+
}
|
|
222
233
|
}
|
|
223
234
|
|
|
224
235
|
// Ensure noise floor doesn't drop to absolute zero
|
|
225
|
-
|
|
226
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
236
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
227
237
|
|
|
228
238
|
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
229
239
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
@@ -234,11 +244,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
234
244
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
235
245
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
236
246
|
|
|
237
|
-
// Apply absolute energy threshold
|
|
238
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
247
|
+
// Apply absolute energy threshold with soft knee
|
|
239
248
|
if (this.energy < this.minEnergy) {
|
|
240
249
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
241
|
-
probability *= Math.pow(energyRatio, 2);
|
|
250
|
+
probability *= Math.pow(energyRatio, 2);
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
// Apply crest factor penalty
|
|
254
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
255
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
256
|
+
// We penalize anything above 14dB
|
|
257
|
+
if (crestFactorDb > 14) {
|
|
258
|
+
const excess = crestFactorDb - 14;
|
|
259
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
260
|
+
probability *= penalty;
|
|
242
261
|
}
|
|
243
262
|
|
|
244
263
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -367,17 +386,17 @@ var VADStateMachine = class {
|
|
|
367
386
|
// Smooth for natural speech
|
|
368
387
|
preRollMs: config?.preRollMs ?? 250,
|
|
369
388
|
// Generous pre-roll
|
|
370
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
371
|
-
//
|
|
389
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
390
|
+
// Aggressive transient rejection
|
|
372
391
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
373
392
|
energyVad: {
|
|
374
393
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
375
394
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
376
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
377
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
378
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
395
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
396
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
397
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
379
398
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
380
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
399
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
381
400
|
}
|
|
382
401
|
};
|
|
383
402
|
this.lastSilenceTime = Date.now();
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import "./chunk-WBQAMGXK.mjs";
|
|
2
2
|
import {
|
|
3
3
|
attachProcessingToTrack
|
|
4
|
-
} from "./chunk-
|
|
4
|
+
} from "./chunk-6F2HZUYO.mjs";
|
|
5
5
|
import {
|
|
6
6
|
createAudioPipeline
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-R5M2DGAQ.mjs";
|
|
8
8
|
import {
|
|
9
9
|
VADStateMachine
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-K4YLH73B.mjs";
|
|
11
11
|
import {
|
|
12
12
|
closeAudioContext,
|
|
13
13
|
getAudioContext,
|
|
@@ -21,13 +21,13 @@ import {
|
|
|
21
21
|
getVADPlugin,
|
|
22
22
|
registerNoiseSuppressionPlugin,
|
|
23
23
|
registerVADPlugin
|
|
24
|
-
} from "./chunk-
|
|
24
|
+
} from "./chunk-UFKIAMG3.mjs";
|
|
25
25
|
import {
|
|
26
26
|
RNNoisePlugin
|
|
27
27
|
} from "./chunk-XO6B3D4A.mjs";
|
|
28
28
|
import {
|
|
29
29
|
EnergyVADPlugin
|
|
30
|
-
} from "./chunk-
|
|
30
|
+
} from "./chunk-2G2JFHJY.mjs";
|
|
31
31
|
export {
|
|
32
32
|
EnergyVADPlugin,
|
|
33
33
|
RNNoisePlugin,
|
|
@@ -127,11 +127,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
127
127
|
const energyParams = vadConfig?.energyVad || {};
|
|
128
128
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
129
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
131
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
132
|
-
const minSNR = energyParams.minSNR ??
|
|
130
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
131
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
132
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
133
133
|
const snrRange = energyParams.snrRange ?? 10;
|
|
134
|
-
const minEnergy = energyParams.minEnergy ??
|
|
134
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
135
135
|
return `
|
|
136
136
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
137
137
|
constructor() {
|
|
@@ -160,8 +160,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
160
160
|
|
|
161
161
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
162
162
|
let sum = 0;
|
|
163
|
+
let peak = 0;
|
|
163
164
|
for (let i = 0; i < channel.length; i++) {
|
|
165
|
+
const sample = Math.abs(channel[i]);
|
|
164
166
|
sum += channel[i] * channel[i];
|
|
167
|
+
peak = Math.max(peak, sample);
|
|
165
168
|
}
|
|
166
169
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
167
170
|
|
|
@@ -169,30 +172,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
169
172
|
// this.energy acts as the smoothed RMS value
|
|
170
173
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
171
174
|
|
|
172
|
-
//
|
|
173
|
-
//
|
|
174
|
-
//
|
|
175
|
-
const
|
|
176
|
-
const
|
|
177
|
-
|
|
178
|
-
//
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
184
|
-
// Adapt upwards at normal rate to track rising noise
|
|
185
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
175
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
176
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
177
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
178
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
179
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
180
|
+
|
|
181
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
182
|
+
// This prevents sharp transients from affecting the noise floor
|
|
183
|
+
if (this.energy < this.noiseFloor) {
|
|
184
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
185
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
186
186
|
} else {
|
|
187
|
-
//
|
|
188
|
-
|
|
189
|
-
const
|
|
190
|
-
|
|
187
|
+
// Calculate SNR based on smoothed energy
|
|
188
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
189
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
190
|
+
|
|
191
|
+
// Only adapt upwards if:
|
|
192
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
193
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
194
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
195
|
+
// This is persistent background noise, adapt upwards
|
|
196
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
197
|
+
} else {
|
|
198
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
199
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
200
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
201
|
+
}
|
|
191
202
|
}
|
|
192
203
|
|
|
193
204
|
// Ensure noise floor doesn't drop to absolute zero
|
|
194
|
-
|
|
195
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
205
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
196
206
|
|
|
197
207
|
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
198
208
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
@@ -203,11 +213,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
203
213
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
204
214
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
205
215
|
|
|
206
|
-
// Apply absolute energy threshold
|
|
207
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
216
|
+
// Apply absolute energy threshold with soft knee
|
|
208
217
|
if (this.energy < this.minEnergy) {
|
|
209
218
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
210
|
-
probability *= Math.pow(energyRatio, 2);
|
|
219
|
+
probability *= Math.pow(energyRatio, 2);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
// Apply crest factor penalty
|
|
223
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
224
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
225
|
+
// We penalize anything above 14dB
|
|
226
|
+
if (crestFactorDb > 14) {
|
|
227
|
+
const excess = crestFactorDb - 14;
|
|
228
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
229
|
+
probability *= penalty;
|
|
211
230
|
}
|
|
212
231
|
|
|
213
232
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -330,17 +349,17 @@ var VADStateMachine = class {
|
|
|
330
349
|
// Smooth for natural speech
|
|
331
350
|
preRollMs: config?.preRollMs ?? 250,
|
|
332
351
|
// Generous pre-roll
|
|
333
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
334
|
-
//
|
|
352
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
353
|
+
// Aggressive transient rejection
|
|
335
354
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
336
355
|
energyVad: {
|
|
337
356
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
338
357
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
339
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
340
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
341
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
358
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
359
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
360
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
342
361
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
343
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
362
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
344
363
|
}
|
|
345
364
|
};
|
|
346
365
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import {
|
|
2
2
|
attachProcessingToTrack
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
5
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-6F2HZUYO.mjs";
|
|
4
|
+
import "../chunk-R5M2DGAQ.mjs";
|
|
5
|
+
import "../chunk-K4YLH73B.mjs";
|
|
6
6
|
import "../chunk-OZ7KMC4S.mjs";
|
|
7
|
-
import "../chunk-
|
|
7
|
+
import "../chunk-UFKIAMG3.mjs";
|
|
8
8
|
import "../chunk-XO6B3D4A.mjs";
|
|
9
|
-
import "../chunk-
|
|
9
|
+
import "../chunk-2G2JFHJY.mjs";
|
|
10
10
|
export {
|
|
11
11
|
attachProcessingToTrack
|
|
12
12
|
};
|
|
@@ -125,11 +125,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
125
125
|
const energyParams = vadConfig?.energyVad || {};
|
|
126
126
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
127
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
129
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
130
|
-
const minSNR = energyParams.minSNR ??
|
|
128
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
129
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
130
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
131
131
|
const snrRange = energyParams.snrRange ?? 10;
|
|
132
|
-
const minEnergy = energyParams.minEnergy ??
|
|
132
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
133
133
|
return `
|
|
134
134
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
135
135
|
constructor() {
|
|
@@ -158,8 +158,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
158
158
|
|
|
159
159
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
160
160
|
let sum = 0;
|
|
161
|
+
let peak = 0;
|
|
161
162
|
for (let i = 0; i < channel.length; i++) {
|
|
163
|
+
const sample = Math.abs(channel[i]);
|
|
162
164
|
sum += channel[i] * channel[i];
|
|
165
|
+
peak = Math.max(peak, sample);
|
|
163
166
|
}
|
|
164
167
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
165
168
|
|
|
@@ -167,30 +170,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
167
170
|
// this.energy acts as the smoothed RMS value
|
|
168
171
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
169
172
|
|
|
170
|
-
//
|
|
171
|
-
//
|
|
172
|
-
//
|
|
173
|
-
const
|
|
174
|
-
const
|
|
175
|
-
|
|
176
|
-
//
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
182
|
-
// Adapt upwards at normal rate to track rising noise
|
|
183
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
173
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
174
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
175
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
176
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
177
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
178
|
+
|
|
179
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
180
|
+
// This prevents sharp transients from affecting the noise floor
|
|
181
|
+
if (this.energy < this.noiseFloor) {
|
|
182
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
183
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
184
184
|
} else {
|
|
185
|
-
//
|
|
186
|
-
|
|
187
|
-
const
|
|
188
|
-
|
|
185
|
+
// Calculate SNR based on smoothed energy
|
|
186
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
187
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
188
|
+
|
|
189
|
+
// Only adapt upwards if:
|
|
190
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
191
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
192
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
193
|
+
// This is persistent background noise, adapt upwards
|
|
194
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
195
|
+
} else {
|
|
196
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
197
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
198
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
199
|
+
}
|
|
189
200
|
}
|
|
190
201
|
|
|
191
202
|
// Ensure noise floor doesn't drop to absolute zero
|
|
192
|
-
|
|
193
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
203
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
194
204
|
|
|
195
205
|
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
196
206
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
@@ -201,11 +211,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
201
211
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
202
212
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
203
213
|
|
|
204
|
-
// Apply absolute energy threshold
|
|
205
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
214
|
+
// Apply absolute energy threshold with soft knee
|
|
206
215
|
if (this.energy < this.minEnergy) {
|
|
207
216
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
208
|
-
probability *= Math.pow(energyRatio, 2);
|
|
217
|
+
probability *= Math.pow(energyRatio, 2);
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
// Apply crest factor penalty
|
|
221
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
222
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
223
|
+
// We penalize anything above 14dB
|
|
224
|
+
if (crestFactorDb > 14) {
|
|
225
|
+
const excess = crestFactorDb - 14;
|
|
226
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
227
|
+
probability *= penalty;
|
|
209
228
|
}
|
|
210
229
|
|
|
211
230
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -328,17 +347,17 @@ var VADStateMachine = class {
|
|
|
328
347
|
// Smooth for natural speech
|
|
329
348
|
preRollMs: config?.preRollMs ?? 250,
|
|
330
349
|
// Generous pre-roll
|
|
331
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
332
|
-
//
|
|
350
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
351
|
+
// Aggressive transient rejection
|
|
333
352
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
334
353
|
energyVad: {
|
|
335
354
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
336
355
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
337
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
338
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
339
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
356
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
357
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
358
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
340
359
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
341
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
360
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
342
361
|
}
|
|
343
362
|
};
|
|
344
363
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-R5M2DGAQ.mjs";
|
|
4
|
+
import "../chunk-K4YLH73B.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-
|
|
6
|
+
import "../chunk-UFKIAMG3.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-2G2JFHJY.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
|
|
|
70
70
|
preRollMs?: number;
|
|
71
71
|
/**
|
|
72
72
|
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
-
* Filters out
|
|
74
|
-
* Default:
|
|
73
|
+
* Filters out brief transients like keyboard clicks.
|
|
74
|
+
* Default: 250ms (aggressive transient rejection)
|
|
75
75
|
*/
|
|
76
76
|
minSpeechDurationMs?: number;
|
|
77
77
|
/**
|
|
@@ -97,18 +97,18 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.002 (very slow downward drift)
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Applied
|
|
106
|
-
* Default: 0.
|
|
105
|
+
* Applied to low-energy, low-crest-factor signals (background noise).
|
|
106
|
+
* Default: 0.02
|
|
107
107
|
*/
|
|
108
108
|
noiseFloorAdaptRateLoud?: number;
|
|
109
109
|
/**
|
|
110
110
|
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
111
|
-
* Default:
|
|
111
|
+
* Default: 12.0 (aggressive noise rejection)
|
|
112
112
|
*/
|
|
113
113
|
minSNR?: number;
|
|
114
114
|
/**
|
|
@@ -118,8 +118,8 @@ interface AudioProcessingConfig {
|
|
|
118
118
|
snrRange?: number;
|
|
119
119
|
/**
|
|
120
120
|
* Minimum absolute RMS energy to consider as speech.
|
|
121
|
-
* Prevents triggering on very quiet background noise
|
|
122
|
-
* Default: 0.
|
|
121
|
+
* Prevents triggering on very quiet background noise.
|
|
122
|
+
* Default: 0.003 (approx -50dB, voice-appropriate level)
|
|
123
123
|
*/
|
|
124
124
|
minEnergy?: number;
|
|
125
125
|
};
|
package/dist/types.d.ts
CHANGED
|
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
|
|
|
70
70
|
preRollMs?: number;
|
|
71
71
|
/**
|
|
72
72
|
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
-
* Filters out
|
|
74
|
-
* Default:
|
|
73
|
+
* Filters out brief transients like keyboard clicks.
|
|
74
|
+
* Default: 250ms (aggressive transient rejection)
|
|
75
75
|
*/
|
|
76
76
|
minSpeechDurationMs?: number;
|
|
77
77
|
/**
|
|
@@ -97,18 +97,18 @@ interface AudioProcessingConfig {
|
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
98
|
/**
|
|
99
99
|
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.
|
|
100
|
+
* Default: 0.002 (very slow downward drift)
|
|
101
101
|
*/
|
|
102
102
|
noiseFloorAdaptRateQuiet?: number;
|
|
103
103
|
/**
|
|
104
104
|
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Applied
|
|
106
|
-
* Default: 0.
|
|
105
|
+
* Applied to low-energy, low-crest-factor signals (background noise).
|
|
106
|
+
* Default: 0.02
|
|
107
107
|
*/
|
|
108
108
|
noiseFloorAdaptRateLoud?: number;
|
|
109
109
|
/**
|
|
110
110
|
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
111
|
-
* Default:
|
|
111
|
+
* Default: 12.0 (aggressive noise rejection)
|
|
112
112
|
*/
|
|
113
113
|
minSNR?: number;
|
|
114
114
|
/**
|
|
@@ -118,8 +118,8 @@ interface AudioProcessingConfig {
|
|
|
118
118
|
snrRange?: number;
|
|
119
119
|
/**
|
|
120
120
|
* Minimum absolute RMS energy to consider as speech.
|
|
121
|
-
* Prevents triggering on very quiet background noise
|
|
122
|
-
* Default: 0.
|
|
121
|
+
* Prevents triggering on very quiet background noise.
|
|
122
|
+
* Default: 0.003 (approx -50dB, voice-appropriate level)
|
|
123
123
|
*/
|
|
124
124
|
minEnergy?: number;
|
|
125
125
|
};
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -27,11 +27,11 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
27
27
|
const energyParams = vadConfig?.energyVad || {};
|
|
28
28
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
29
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
-
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ??
|
|
31
|
-
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.
|
|
32
|
-
const minSNR = energyParams.minSNR ??
|
|
30
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 2e-3;
|
|
31
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 0.02;
|
|
32
|
+
const minSNR = energyParams.minSNR ?? 12;
|
|
33
33
|
const snrRange = energyParams.snrRange ?? 10;
|
|
34
|
-
const minEnergy = energyParams.minEnergy ??
|
|
34
|
+
const minEnergy = energyParams.minEnergy ?? 3e-3;
|
|
35
35
|
return `
|
|
36
36
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
37
37
|
constructor() {
|
|
@@ -60,8 +60,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
60
60
|
|
|
61
61
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
62
62
|
let sum = 0;
|
|
63
|
+
let peak = 0;
|
|
63
64
|
for (let i = 0; i < channel.length; i++) {
|
|
65
|
+
const sample = Math.abs(channel[i]);
|
|
64
66
|
sum += channel[i] * channel[i];
|
|
67
|
+
peak = Math.max(peak, sample);
|
|
65
68
|
}
|
|
66
69
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
67
70
|
|
|
@@ -69,30 +72,37 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
69
72
|
// this.energy acts as the smoothed RMS value
|
|
70
73
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
71
74
|
|
|
72
|
-
//
|
|
73
|
-
//
|
|
74
|
-
//
|
|
75
|
-
const
|
|
76
|
-
const
|
|
77
|
-
|
|
78
|
-
//
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
84
|
-
// Adapt upwards at normal rate to track rising noise
|
|
85
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
75
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
76
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
77
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
78
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
79
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
80
|
+
|
|
81
|
+
// Adaptive noise floor estimation using SMOOTHED energy (not instantaneous)
|
|
82
|
+
// This prevents sharp transients from affecting the noise floor
|
|
83
|
+
if (this.energy < this.noiseFloor) {
|
|
84
|
+
// Signal is quieter than noise floor, adapt downwards slowly
|
|
85
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + this.energy * this.noiseFloorAdaptRateQuiet;
|
|
86
86
|
} else {
|
|
87
|
-
//
|
|
88
|
-
|
|
89
|
-
const
|
|
90
|
-
|
|
87
|
+
// Calculate SNR based on smoothed energy
|
|
88
|
+
const smoothedSnr = this.energy / (this.noiseFloor + 1e-6);
|
|
89
|
+
const smoothedSnrDb = 20 * Math.log10(Math.max(1e-6, smoothedSnr));
|
|
90
|
+
|
|
91
|
+
// Only adapt upwards if:
|
|
92
|
+
// 1. SNR is low (< 10dB) - likely just background noise
|
|
93
|
+
// 2. AND crest factor is low (< 15dB) - not a sharp transient
|
|
94
|
+
if (smoothedSnrDb < 10 && crestFactorDb < 15) {
|
|
95
|
+
// This is persistent background noise, adapt upwards
|
|
96
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + this.energy * this.noiseFloorAdaptRateLoud;
|
|
97
|
+
} else {
|
|
98
|
+
// Either high SNR (speech) or high crest factor (click) - adapt very slowly
|
|
99
|
+
const slowRate = this.noiseFloorAdaptRateLoud * 0.01;
|
|
100
|
+
this.noiseFloor = this.noiseFloor * (1 - slowRate) + this.energy * slowRate;
|
|
101
|
+
}
|
|
91
102
|
}
|
|
92
103
|
|
|
93
104
|
// Ensure noise floor doesn't drop to absolute zero
|
|
94
|
-
|
|
95
|
-
this.noiseFloor = Math.max(this.noiseFloor, 0.00005);
|
|
105
|
+
this.noiseFloor = Math.max(this.noiseFloor, 0.0001);
|
|
96
106
|
|
|
97
107
|
// SECOND PASS: Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
98
108
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
@@ -103,11 +113,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
103
113
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
104
114
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
105
115
|
|
|
106
|
-
// Apply absolute energy threshold
|
|
107
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
116
|
+
// Apply absolute energy threshold with soft knee
|
|
108
117
|
if (this.energy < this.minEnergy) {
|
|
109
118
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
110
|
-
probability *= Math.pow(energyRatio, 2);
|
|
119
|
+
probability *= Math.pow(energyRatio, 2);
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
// Apply crest factor penalty
|
|
123
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
124
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
125
|
+
// We penalize anything above 14dB
|
|
126
|
+
if (crestFactorDb > 14) {
|
|
127
|
+
const excess = crestFactorDb - 14;
|
|
128
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
129
|
+
probability *= penalty;
|
|
111
130
|
}
|
|
112
131
|
|
|
113
132
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.js
CHANGED
|
@@ -44,17 +44,17 @@ var VADStateMachine = class {
|
|
|
44
44
|
// Smooth for natural speech
|
|
45
45
|
preRollMs: config?.preRollMs ?? 250,
|
|
46
46
|
// Generous pre-roll
|
|
47
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
48
|
-
//
|
|
47
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
48
|
+
// Aggressive transient rejection
|
|
49
49
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
50
50
|
energyVad: {
|
|
51
51
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
52
52
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
53
|
-
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ??
|
|
54
|
-
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.
|
|
55
|
-
minSNR: config?.energyVad?.minSNR ??
|
|
53
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 2e-3,
|
|
54
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 0.02,
|
|
55
|
+
minSNR: config?.energyVad?.minSNR ?? 12,
|
|
56
56
|
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
57
|
-
minEnergy: config?.energyVad?.minEnergy ??
|
|
57
|
+
minEnergy: config?.energyVad?.minEnergy ?? 3e-3
|
|
58
58
|
}
|
|
59
59
|
};
|
|
60
60
|
this.lastSilenceTime = Date.now();
|
package/dist/vad/vad-state.mjs
CHANGED