@tensamin/audio 0.1.13 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -7
- package/dist/{chunk-FKR6NWZF.mjs → chunk-GLKAWCEW.mjs} +31 -34
- package/dist/{chunk-DLLK6K76.mjs → chunk-KLBA2CPE.mjs} +5 -7
- package/dist/{chunk-K6X52R7N.mjs → chunk-QQFKHTCQ.mjs} +1 -1
- package/dist/{chunk-OXV7BHX5.mjs → chunk-U26F3GJN.mjs} +1 -1
- package/dist/{chunk-RD4GDIPO.mjs → chunk-WQVMSR7V.mjs} +5 -6
- package/dist/extensibility/plugins.js +31 -34
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +39 -45
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +39 -45
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +39 -45
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +6 -17
- package/dist/types.d.ts +6 -17
- package/dist/vad/vad-node.js +31 -34
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.js +5 -7
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -101,11 +101,9 @@ vad: {
|
|
|
101
101
|
energyVad?: {
|
|
102
102
|
smoothing: number; // Default: 0.95
|
|
103
103
|
initialNoiseFloor: number; // Default: 0.001
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
snrRange: number; // Default: 10.0 (dB)
|
|
108
|
-
minEnergy: number; // Default: 0.001
|
|
104
|
+
minSNR: number; // Default: 8.0 (dB)
|
|
105
|
+
snrRange: number; // Default: 12.0 (dB)
|
|
106
|
+
minEnergy: number; // Default: 0.01
|
|
109
107
|
};
|
|
110
108
|
}
|
|
111
109
|
```
|
|
@@ -116,7 +114,7 @@ vad: {
|
|
|
116
114
|
- `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~13dB SNR)
|
|
117
115
|
- `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
|
|
118
116
|
- `preRollMs`: Audio buffer duration before speech onset
|
|
119
|
-
- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default:
|
|
117
|
+
- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
|
|
120
118
|
- `minSilenceDurationMs`: Minimum silence duration between speech segments
|
|
121
119
|
|
|
122
120
|
**Energy VAD Parameters:**
|
|
@@ -124,7 +122,7 @@ vad: {
|
|
|
124
122
|
- `smoothing`: Energy calculation smoothing factor (0-1)
|
|
125
123
|
- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
|
|
126
124
|
- `snrRange`: Range in dB for probability scaling from minSNR
|
|
127
|
-
- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.
|
|
125
|
+
- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.01, ~-40dB)
|
|
128
126
|
|
|
129
127
|
### Output Control
|
|
130
128
|
|
|
@@ -3,11 +3,9 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
3
3
|
const energyParams = vadConfig?.energyVad || {};
|
|
4
4
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
5
5
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
6
|
-
const
|
|
7
|
-
const
|
|
8
|
-
const
|
|
9
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
10
|
-
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
6
|
+
const minSNR = energyParams.minSNR ?? 8;
|
|
7
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
8
|
+
const minEnergy = energyParams.minEnergy ?? 0.01;
|
|
11
9
|
return `
|
|
12
10
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
13
11
|
constructor() {
|
|
@@ -15,8 +13,6 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
15
13
|
this.smoothing = ${smoothing};
|
|
16
14
|
this.energy = 0;
|
|
17
15
|
this.noiseFloor = ${initialNoiseFloor};
|
|
18
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
19
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
20
16
|
this.minSNR = ${minSNR};
|
|
21
17
|
this.snrRange = ${snrRange};
|
|
22
18
|
this.minEnergy = ${minEnergy};
|
|
@@ -36,8 +32,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
36
32
|
|
|
37
33
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
38
34
|
let sum = 0;
|
|
35
|
+
let peak = 0;
|
|
39
36
|
for (let i = 0; i < channel.length; i++) {
|
|
37
|
+
const sample = Math.abs(channel[i]);
|
|
40
38
|
sum += channel[i] * channel[i];
|
|
39
|
+
peak = Math.max(peak, sample);
|
|
41
40
|
}
|
|
42
41
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
43
42
|
|
|
@@ -45,32 +44,21 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
45
44
|
// this.energy acts as the smoothed RMS value
|
|
46
45
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
47
46
|
|
|
48
|
-
//
|
|
49
|
-
//
|
|
50
|
-
//
|
|
51
|
-
const
|
|
52
|
-
const
|
|
53
|
-
|
|
54
|
-
// Adapt the noise floor based on instantaneous SNR
|
|
55
|
-
if (instantRms < this.noiseFloor) {
|
|
56
|
-
// Signal is quieter than noise floor, adapt downwards quickly
|
|
57
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
58
|
-
} else if (instantSnrDb < 12) {
|
|
59
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
60
|
-
// Adapt upwards at normal rate to track rising noise
|
|
61
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
62
|
-
} else {
|
|
63
|
-
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
64
|
-
// Adapt VERY slowly to avoid "chasing" speech
|
|
65
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
66
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
67
|
-
}
|
|
47
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
48
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
49
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
50
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
51
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
68
52
|
|
|
69
|
-
//
|
|
70
|
-
//
|
|
71
|
-
|
|
53
|
+
// FIXED noise floor with minimal adaptation
|
|
54
|
+
// Only adapt within strict bounds to prevent drift
|
|
55
|
+
const targetFloor = Math.max(0.0003, Math.min(0.003, instantRms));
|
|
56
|
+
this.noiseFloor = this.noiseFloor * 0.995 + targetFloor * 0.005;
|
|
57
|
+
|
|
58
|
+
// Hard clamp to prevent any drift outside acceptable range
|
|
59
|
+
this.noiseFloor = Math.max(0.0003, Math.min(0.003, this.noiseFloor));
|
|
72
60
|
|
|
73
|
-
//
|
|
61
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
74
62
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
75
63
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
76
64
|
|
|
@@ -79,11 +67,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
79
67
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
80
68
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
81
69
|
|
|
82
|
-
// Apply absolute energy threshold
|
|
83
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
70
|
+
// Apply absolute energy threshold with soft knee
|
|
84
71
|
if (this.energy < this.minEnergy) {
|
|
85
72
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
86
|
-
probability *= Math.pow(energyRatio, 2);
|
|
73
|
+
probability *= Math.pow(energyRatio, 2);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
// Apply crest factor penalty
|
|
77
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
78
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
79
|
+
// We penalize anything above 14dB
|
|
80
|
+
if (crestFactorDb > 14) {
|
|
81
|
+
const excess = crestFactorDb - 14;
|
|
82
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
83
|
+
probability *= penalty;
|
|
87
84
|
}
|
|
88
85
|
|
|
89
86
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -20,17 +20,15 @@ var VADStateMachine = class {
|
|
|
20
20
|
// Smooth for natural speech
|
|
21
21
|
preRollMs: config?.preRollMs ?? 250,
|
|
22
22
|
// Generous pre-roll
|
|
23
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
24
|
-
//
|
|
23
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
24
|
+
// Aggressive transient rejection
|
|
25
25
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
26
26
|
energyVad: {
|
|
27
27
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
28
28
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
33
|
-
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
29
|
+
minSNR: config?.energyVad?.minSNR ?? 8,
|
|
30
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
31
|
+
minEnergy: config?.energyVad?.minEnergy ?? 0.01
|
|
34
32
|
}
|
|
35
33
|
};
|
|
36
34
|
this.lastSilenceTime = Date.now();
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
VADStateMachine
|
|
3
|
-
} from "./chunk-
|
|
3
|
+
} from "./chunk-KLBA2CPE.mjs";
|
|
4
4
|
import {
|
|
5
5
|
getAudioContext,
|
|
6
6
|
registerPipeline,
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
import {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-U26F3GJN.mjs";
|
|
13
13
|
|
|
14
14
|
// src/pipeline/audio-pipeline.ts
|
|
15
15
|
import mitt from "mitt";
|
|
@@ -37,10 +37,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
37
37
|
energyVad: {
|
|
38
38
|
smoothing: 0.95,
|
|
39
39
|
initialNoiseFloor: 1e-3,
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
snrRange: 8
|
|
40
|
+
minSNR: 8,
|
|
41
|
+
snrRange: 12,
|
|
42
|
+
minEnergy: 0.01
|
|
44
43
|
},
|
|
45
44
|
...config.vad
|
|
46
45
|
},
|
|
@@ -106,11 +106,9 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
106
106
|
const energyParams = vadConfig?.energyVad || {};
|
|
107
107
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
108
108
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
109
|
-
const
|
|
110
|
-
const
|
|
111
|
-
const
|
|
112
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
113
|
-
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
109
|
+
const minSNR = energyParams.minSNR ?? 8;
|
|
110
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
111
|
+
const minEnergy = energyParams.minEnergy ?? 0.01;
|
|
114
112
|
return `
|
|
115
113
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
116
114
|
constructor() {
|
|
@@ -118,8 +116,6 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
118
116
|
this.smoothing = ${smoothing};
|
|
119
117
|
this.energy = 0;
|
|
120
118
|
this.noiseFloor = ${initialNoiseFloor};
|
|
121
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
122
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
123
119
|
this.minSNR = ${minSNR};
|
|
124
120
|
this.snrRange = ${snrRange};
|
|
125
121
|
this.minEnergy = ${minEnergy};
|
|
@@ -139,8 +135,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
139
135
|
|
|
140
136
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
141
137
|
let sum = 0;
|
|
138
|
+
let peak = 0;
|
|
142
139
|
for (let i = 0; i < channel.length; i++) {
|
|
140
|
+
const sample = Math.abs(channel[i]);
|
|
143
141
|
sum += channel[i] * channel[i];
|
|
142
|
+
peak = Math.max(peak, sample);
|
|
144
143
|
}
|
|
145
144
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
146
145
|
|
|
@@ -148,32 +147,21 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
148
147
|
// this.energy acts as the smoothed RMS value
|
|
149
148
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
150
149
|
|
|
151
|
-
//
|
|
152
|
-
//
|
|
153
|
-
//
|
|
154
|
-
const
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
// Adapt the noise floor based on instantaneous SNR
|
|
158
|
-
if (instantRms < this.noiseFloor) {
|
|
159
|
-
// Signal is quieter than noise floor, adapt downwards quickly
|
|
160
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
161
|
-
} else if (instantSnrDb < 12) {
|
|
162
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
163
|
-
// Adapt upwards at normal rate to track rising noise
|
|
164
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
165
|
-
} else {
|
|
166
|
-
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
167
|
-
// Adapt VERY slowly to avoid "chasing" speech
|
|
168
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
169
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
170
|
-
}
|
|
150
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
151
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
152
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
153
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
154
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
171
155
|
|
|
172
|
-
//
|
|
173
|
-
//
|
|
174
|
-
|
|
156
|
+
// FIXED noise floor with minimal adaptation
|
|
157
|
+
// Only adapt within strict bounds to prevent drift
|
|
158
|
+
const targetFloor = Math.max(0.0003, Math.min(0.003, instantRms));
|
|
159
|
+
this.noiseFloor = this.noiseFloor * 0.995 + targetFloor * 0.005;
|
|
160
|
+
|
|
161
|
+
// Hard clamp to prevent any drift outside acceptable range
|
|
162
|
+
this.noiseFloor = Math.max(0.0003, Math.min(0.003, this.noiseFloor));
|
|
175
163
|
|
|
176
|
-
//
|
|
164
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
177
165
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
178
166
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
179
167
|
|
|
@@ -182,11 +170,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
182
170
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
183
171
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
184
172
|
|
|
185
|
-
// Apply absolute energy threshold
|
|
186
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
173
|
+
// Apply absolute energy threshold with soft knee
|
|
187
174
|
if (this.energy < this.minEnergy) {
|
|
188
175
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
189
|
-
probability *= Math.pow(energyRatio, 2);
|
|
176
|
+
probability *= Math.pow(energyRatio, 2);
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// Apply crest factor penalty
|
|
180
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
181
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
182
|
+
// We penalize anything above 14dB
|
|
183
|
+
if (crestFactorDb > 14) {
|
|
184
|
+
const excess = crestFactorDb - 14;
|
|
185
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
186
|
+
probability *= penalty;
|
|
190
187
|
}
|
|
191
188
|
|
|
192
189
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -3,9 +3,9 @@ import {
|
|
|
3
3
|
getVADPlugin,
|
|
4
4
|
registerNoiseSuppressionPlugin,
|
|
5
5
|
registerVADPlugin
|
|
6
|
-
} from "../chunk-
|
|
6
|
+
} from "../chunk-U26F3GJN.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-GLKAWCEW.mjs";
|
|
9
9
|
export {
|
|
10
10
|
getNoiseSuppressionPlugin,
|
|
11
11
|
getVADPlugin,
|
package/dist/index.js
CHANGED
|
@@ -158,11 +158,9 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
158
158
|
const energyParams = vadConfig?.energyVad || {};
|
|
159
159
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
160
160
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
161
|
-
const
|
|
162
|
-
const
|
|
163
|
-
const
|
|
164
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
165
|
-
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
161
|
+
const minSNR = energyParams.minSNR ?? 8;
|
|
162
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
163
|
+
const minEnergy = energyParams.minEnergy ?? 0.01;
|
|
166
164
|
return `
|
|
167
165
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
168
166
|
constructor() {
|
|
@@ -170,8 +168,6 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
170
168
|
this.smoothing = ${smoothing};
|
|
171
169
|
this.energy = 0;
|
|
172
170
|
this.noiseFloor = ${initialNoiseFloor};
|
|
173
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
174
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
175
171
|
this.minSNR = ${minSNR};
|
|
176
172
|
this.snrRange = ${snrRange};
|
|
177
173
|
this.minEnergy = ${minEnergy};
|
|
@@ -191,8 +187,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
191
187
|
|
|
192
188
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
193
189
|
let sum = 0;
|
|
190
|
+
let peak = 0;
|
|
194
191
|
for (let i = 0; i < channel.length; i++) {
|
|
192
|
+
const sample = Math.abs(channel[i]);
|
|
195
193
|
sum += channel[i] * channel[i];
|
|
194
|
+
peak = Math.max(peak, sample);
|
|
196
195
|
}
|
|
197
196
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
198
197
|
|
|
@@ -200,32 +199,21 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
200
199
|
// this.energy acts as the smoothed RMS value
|
|
201
200
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
202
201
|
|
|
203
|
-
//
|
|
204
|
-
//
|
|
205
|
-
//
|
|
206
|
-
const
|
|
207
|
-
const
|
|
208
|
-
|
|
209
|
-
// Adapt the noise floor based on instantaneous SNR
|
|
210
|
-
if (instantRms < this.noiseFloor) {
|
|
211
|
-
// Signal is quieter than noise floor, adapt downwards quickly
|
|
212
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
213
|
-
} else if (instantSnrDb < 12) {
|
|
214
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
215
|
-
// Adapt upwards at normal rate to track rising noise
|
|
216
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
217
|
-
} else {
|
|
218
|
-
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
219
|
-
// Adapt VERY slowly to avoid "chasing" speech
|
|
220
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
221
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
222
|
-
}
|
|
202
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
203
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
204
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
205
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
206
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
223
207
|
|
|
224
|
-
//
|
|
225
|
-
//
|
|
226
|
-
|
|
208
|
+
// FIXED noise floor with minimal adaptation
|
|
209
|
+
// Only adapt within strict bounds to prevent drift
|
|
210
|
+
const targetFloor = Math.max(0.0003, Math.min(0.003, instantRms));
|
|
211
|
+
this.noiseFloor = this.noiseFloor * 0.995 + targetFloor * 0.005;
|
|
212
|
+
|
|
213
|
+
// Hard clamp to prevent any drift outside acceptable range
|
|
214
|
+
this.noiseFloor = Math.max(0.0003, Math.min(0.003, this.noiseFloor));
|
|
227
215
|
|
|
228
|
-
//
|
|
216
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
229
217
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
230
218
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
231
219
|
|
|
@@ -234,11 +222,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
234
222
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
235
223
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
236
224
|
|
|
237
|
-
// Apply absolute energy threshold
|
|
238
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
225
|
+
// Apply absolute energy threshold with soft knee
|
|
239
226
|
if (this.energy < this.minEnergy) {
|
|
240
227
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
241
|
-
probability *= Math.pow(energyRatio, 2);
|
|
228
|
+
probability *= Math.pow(energyRatio, 2);
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
// Apply crest factor penalty
|
|
232
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
233
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
234
|
+
// We penalize anything above 14dB
|
|
235
|
+
if (crestFactorDb > 14) {
|
|
236
|
+
const excess = crestFactorDb - 14;
|
|
237
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
238
|
+
probability *= penalty;
|
|
242
239
|
}
|
|
243
240
|
|
|
244
241
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -367,17 +364,15 @@ var VADStateMachine = class {
|
|
|
367
364
|
// Smooth for natural speech
|
|
368
365
|
preRollMs: config?.preRollMs ?? 250,
|
|
369
366
|
// Generous pre-roll
|
|
370
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
371
|
-
//
|
|
367
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
368
|
+
// Aggressive transient rejection
|
|
372
369
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
373
370
|
energyVad: {
|
|
374
371
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
375
372
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
380
|
-
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
373
|
+
minSNR: config?.energyVad?.minSNR ?? 8,
|
|
374
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
375
|
+
minEnergy: config?.energyVad?.minEnergy ?? 0.01
|
|
381
376
|
}
|
|
382
377
|
};
|
|
383
378
|
this.lastSilenceTime = Date.now();
|
|
@@ -470,10 +465,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
470
465
|
energyVad: {
|
|
471
466
|
smoothing: 0.95,
|
|
472
467
|
initialNoiseFloor: 1e-3,
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
snrRange: 8
|
|
468
|
+
minSNR: 8,
|
|
469
|
+
snrRange: 12,
|
|
470
|
+
minEnergy: 0.01
|
|
477
471
|
},
|
|
478
472
|
...config.vad
|
|
479
473
|
},
|
package/dist/index.mjs
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import "./chunk-WBQAMGXK.mjs";
|
|
2
2
|
import {
|
|
3
3
|
attachProcessingToTrack
|
|
4
|
-
} from "./chunk-
|
|
4
|
+
} from "./chunk-QQFKHTCQ.mjs";
|
|
5
5
|
import {
|
|
6
6
|
createAudioPipeline
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-WQVMSR7V.mjs";
|
|
8
8
|
import {
|
|
9
9
|
VADStateMachine
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-KLBA2CPE.mjs";
|
|
11
11
|
import {
|
|
12
12
|
closeAudioContext,
|
|
13
13
|
getAudioContext,
|
|
@@ -21,13 +21,13 @@ import {
|
|
|
21
21
|
getVADPlugin,
|
|
22
22
|
registerNoiseSuppressionPlugin,
|
|
23
23
|
registerVADPlugin
|
|
24
|
-
} from "./chunk-
|
|
24
|
+
} from "./chunk-U26F3GJN.mjs";
|
|
25
25
|
import {
|
|
26
26
|
RNNoisePlugin
|
|
27
27
|
} from "./chunk-XO6B3D4A.mjs";
|
|
28
28
|
import {
|
|
29
29
|
EnergyVADPlugin
|
|
30
|
-
} from "./chunk-
|
|
30
|
+
} from "./chunk-GLKAWCEW.mjs";
|
|
31
31
|
export {
|
|
32
32
|
EnergyVADPlugin,
|
|
33
33
|
RNNoisePlugin,
|
|
@@ -127,11 +127,9 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
127
127
|
const energyParams = vadConfig?.energyVad || {};
|
|
128
128
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
129
129
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
130
|
-
const
|
|
131
|
-
const
|
|
132
|
-
const
|
|
133
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
134
|
-
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
130
|
+
const minSNR = energyParams.minSNR ?? 8;
|
|
131
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
132
|
+
const minEnergy = energyParams.minEnergy ?? 0.01;
|
|
135
133
|
return `
|
|
136
134
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
137
135
|
constructor() {
|
|
@@ -139,8 +137,6 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
139
137
|
this.smoothing = ${smoothing};
|
|
140
138
|
this.energy = 0;
|
|
141
139
|
this.noiseFloor = ${initialNoiseFloor};
|
|
142
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
143
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
144
140
|
this.minSNR = ${minSNR};
|
|
145
141
|
this.snrRange = ${snrRange};
|
|
146
142
|
this.minEnergy = ${minEnergy};
|
|
@@ -160,8 +156,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
160
156
|
|
|
161
157
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
162
158
|
let sum = 0;
|
|
159
|
+
let peak = 0;
|
|
163
160
|
for (let i = 0; i < channel.length; i++) {
|
|
161
|
+
const sample = Math.abs(channel[i]);
|
|
164
162
|
sum += channel[i] * channel[i];
|
|
163
|
+
peak = Math.max(peak, sample);
|
|
165
164
|
}
|
|
166
165
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
167
166
|
|
|
@@ -169,32 +168,21 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
169
168
|
// this.energy acts as the smoothed RMS value
|
|
170
169
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
171
170
|
|
|
172
|
-
//
|
|
173
|
-
//
|
|
174
|
-
//
|
|
175
|
-
const
|
|
176
|
-
const
|
|
177
|
-
|
|
178
|
-
// Adapt the noise floor based on instantaneous SNR
|
|
179
|
-
if (instantRms < this.noiseFloor) {
|
|
180
|
-
// Signal is quieter than noise floor, adapt downwards quickly
|
|
181
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
182
|
-
} else if (instantSnrDb < 12) {
|
|
183
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
184
|
-
// Adapt upwards at normal rate to track rising noise
|
|
185
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
186
|
-
} else {
|
|
187
|
-
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
188
|
-
// Adapt VERY slowly to avoid "chasing" speech
|
|
189
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
190
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
191
|
-
}
|
|
171
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
172
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
173
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
174
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
175
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
192
176
|
|
|
193
|
-
//
|
|
194
|
-
//
|
|
195
|
-
|
|
177
|
+
// FIXED noise floor with minimal adaptation
|
|
178
|
+
// Only adapt within strict bounds to prevent drift
|
|
179
|
+
const targetFloor = Math.max(0.0003, Math.min(0.003, instantRms));
|
|
180
|
+
this.noiseFloor = this.noiseFloor * 0.995 + targetFloor * 0.005;
|
|
181
|
+
|
|
182
|
+
// Hard clamp to prevent any drift outside acceptable range
|
|
183
|
+
this.noiseFloor = Math.max(0.0003, Math.min(0.003, this.noiseFloor));
|
|
196
184
|
|
|
197
|
-
//
|
|
185
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
198
186
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
199
187
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
200
188
|
|
|
@@ -203,11 +191,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
203
191
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
204
192
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
205
193
|
|
|
206
|
-
// Apply absolute energy threshold
|
|
207
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
194
|
+
// Apply absolute energy threshold with soft knee
|
|
208
195
|
if (this.energy < this.minEnergy) {
|
|
209
196
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
210
|
-
probability *= Math.pow(energyRatio, 2);
|
|
197
|
+
probability *= Math.pow(energyRatio, 2);
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Apply crest factor penalty
|
|
201
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
202
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
203
|
+
// We penalize anything above 14dB
|
|
204
|
+
if (crestFactorDb > 14) {
|
|
205
|
+
const excess = crestFactorDb - 14;
|
|
206
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
207
|
+
probability *= penalty;
|
|
211
208
|
}
|
|
212
209
|
|
|
213
210
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -330,17 +327,15 @@ var VADStateMachine = class {
|
|
|
330
327
|
// Smooth for natural speech
|
|
331
328
|
preRollMs: config?.preRollMs ?? 250,
|
|
332
329
|
// Generous pre-roll
|
|
333
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
334
|
-
//
|
|
330
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
331
|
+
// Aggressive transient rejection
|
|
335
332
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
336
333
|
energyVad: {
|
|
337
334
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
338
335
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
343
|
-
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
336
|
+
minSNR: config?.energyVad?.minSNR ?? 8,
|
|
337
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
338
|
+
minEnergy: config?.energyVad?.minEnergy ?? 0.01
|
|
344
339
|
}
|
|
345
340
|
};
|
|
346
341
|
this.lastSilenceTime = Date.now();
|
|
@@ -433,10 +428,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
433
428
|
energyVad: {
|
|
434
429
|
smoothing: 0.95,
|
|
435
430
|
initialNoiseFloor: 1e-3,
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
snrRange: 8
|
|
431
|
+
minSNR: 8,
|
|
432
|
+
snrRange: 12,
|
|
433
|
+
minEnergy: 0.01
|
|
440
434
|
},
|
|
441
435
|
...config.vad
|
|
442
436
|
},
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
import {
|
|
2
2
|
attachProcessingToTrack
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
5
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-QQFKHTCQ.mjs";
|
|
4
|
+
import "../chunk-WQVMSR7V.mjs";
|
|
5
|
+
import "../chunk-KLBA2CPE.mjs";
|
|
6
6
|
import "../chunk-OZ7KMC4S.mjs";
|
|
7
|
-
import "../chunk-
|
|
7
|
+
import "../chunk-U26F3GJN.mjs";
|
|
8
8
|
import "../chunk-XO6B3D4A.mjs";
|
|
9
|
-
import "../chunk-
|
|
9
|
+
import "../chunk-GLKAWCEW.mjs";
|
|
10
10
|
export {
|
|
11
11
|
attachProcessingToTrack
|
|
12
12
|
};
|
|
@@ -125,11 +125,9 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
125
125
|
const energyParams = vadConfig?.energyVad || {};
|
|
126
126
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
127
127
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
128
|
-
const
|
|
129
|
-
const
|
|
130
|
-
const
|
|
131
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
132
|
-
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
128
|
+
const minSNR = energyParams.minSNR ?? 8;
|
|
129
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
130
|
+
const minEnergy = energyParams.minEnergy ?? 0.01;
|
|
133
131
|
return `
|
|
134
132
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
135
133
|
constructor() {
|
|
@@ -137,8 +135,6 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
137
135
|
this.smoothing = ${smoothing};
|
|
138
136
|
this.energy = 0;
|
|
139
137
|
this.noiseFloor = ${initialNoiseFloor};
|
|
140
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
141
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
142
138
|
this.minSNR = ${minSNR};
|
|
143
139
|
this.snrRange = ${snrRange};
|
|
144
140
|
this.minEnergy = ${minEnergy};
|
|
@@ -158,8 +154,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
158
154
|
|
|
159
155
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
160
156
|
let sum = 0;
|
|
157
|
+
let peak = 0;
|
|
161
158
|
for (let i = 0; i < channel.length; i++) {
|
|
159
|
+
const sample = Math.abs(channel[i]);
|
|
162
160
|
sum += channel[i] * channel[i];
|
|
161
|
+
peak = Math.max(peak, sample);
|
|
163
162
|
}
|
|
164
163
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
165
164
|
|
|
@@ -167,32 +166,21 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
167
166
|
// this.energy acts as the smoothed RMS value
|
|
168
167
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
169
168
|
|
|
170
|
-
//
|
|
171
|
-
//
|
|
172
|
-
//
|
|
173
|
-
const
|
|
174
|
-
const
|
|
175
|
-
|
|
176
|
-
// Adapt the noise floor based on instantaneous SNR
|
|
177
|
-
if (instantRms < this.noiseFloor) {
|
|
178
|
-
// Signal is quieter than noise floor, adapt downwards quickly
|
|
179
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
180
|
-
} else if (instantSnrDb < 12) {
|
|
181
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
182
|
-
// Adapt upwards at normal rate to track rising noise
|
|
183
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
184
|
-
} else {
|
|
185
|
-
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
186
|
-
// Adapt VERY slowly to avoid "chasing" speech
|
|
187
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
188
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
189
|
-
}
|
|
169
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
170
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
171
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
172
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
173
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
190
174
|
|
|
191
|
-
//
|
|
192
|
-
//
|
|
193
|
-
|
|
175
|
+
// FIXED noise floor with minimal adaptation
|
|
176
|
+
// Only adapt within strict bounds to prevent drift
|
|
177
|
+
const targetFloor = Math.max(0.0003, Math.min(0.003, instantRms));
|
|
178
|
+
this.noiseFloor = this.noiseFloor * 0.995 + targetFloor * 0.005;
|
|
179
|
+
|
|
180
|
+
// Hard clamp to prevent any drift outside acceptable range
|
|
181
|
+
this.noiseFloor = Math.max(0.0003, Math.min(0.003, this.noiseFloor));
|
|
194
182
|
|
|
195
|
-
//
|
|
183
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
196
184
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
197
185
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
198
186
|
|
|
@@ -201,11 +189,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
201
189
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
202
190
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
203
191
|
|
|
204
|
-
// Apply absolute energy threshold
|
|
205
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
192
|
+
// Apply absolute energy threshold with soft knee
|
|
206
193
|
if (this.energy < this.minEnergy) {
|
|
207
194
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
208
|
-
probability *= Math.pow(energyRatio, 2);
|
|
195
|
+
probability *= Math.pow(energyRatio, 2);
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
// Apply crest factor penalty
|
|
199
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
200
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
201
|
+
// We penalize anything above 14dB
|
|
202
|
+
if (crestFactorDb > 14) {
|
|
203
|
+
const excess = crestFactorDb - 14;
|
|
204
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
205
|
+
probability *= penalty;
|
|
209
206
|
}
|
|
210
207
|
|
|
211
208
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
|
@@ -328,17 +325,15 @@ var VADStateMachine = class {
|
|
|
328
325
|
// Smooth for natural speech
|
|
329
326
|
preRollMs: config?.preRollMs ?? 250,
|
|
330
327
|
// Generous pre-roll
|
|
331
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
332
|
-
//
|
|
328
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
329
|
+
// Aggressive transient rejection
|
|
333
330
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
334
331
|
energyVad: {
|
|
335
332
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
336
333
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
341
|
-
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
334
|
+
minSNR: config?.energyVad?.minSNR ?? 8,
|
|
335
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
336
|
+
minEnergy: config?.energyVad?.minEnergy ?? 0.01
|
|
342
337
|
}
|
|
343
338
|
};
|
|
344
339
|
this.lastSilenceTime = Date.now();
|
|
@@ -431,10 +426,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
|
|
|
431
426
|
energyVad: {
|
|
432
427
|
smoothing: 0.95,
|
|
433
428
|
initialNoiseFloor: 1e-3,
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
snrRange: 8
|
|
429
|
+
minSNR: 8,
|
|
430
|
+
snrRange: 12,
|
|
431
|
+
minEnergy: 0.01
|
|
438
432
|
},
|
|
439
433
|
...config.vad
|
|
440
434
|
},
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
import {
|
|
2
2
|
createAudioPipeline
|
|
3
|
-
} from "../chunk-
|
|
4
|
-
import "../chunk-
|
|
3
|
+
} from "../chunk-WQVMSR7V.mjs";
|
|
4
|
+
import "../chunk-KLBA2CPE.mjs";
|
|
5
5
|
import "../chunk-OZ7KMC4S.mjs";
|
|
6
|
-
import "../chunk-
|
|
6
|
+
import "../chunk-U26F3GJN.mjs";
|
|
7
7
|
import "../chunk-XO6B3D4A.mjs";
|
|
8
|
-
import "../chunk-
|
|
8
|
+
import "../chunk-GLKAWCEW.mjs";
|
|
9
9
|
export {
|
|
10
10
|
createAudioPipeline
|
|
11
11
|
};
|
package/dist/types.d.mts
CHANGED
|
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
|
|
|
70
70
|
preRollMs?: number;
|
|
71
71
|
/**
|
|
72
72
|
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
-
* Filters out
|
|
74
|
-
* Default:
|
|
73
|
+
* Filters out brief transients like keyboard clicks.
|
|
74
|
+
* Default: 250ms (aggressive transient rejection)
|
|
75
75
|
*/
|
|
76
76
|
minSpeechDurationMs?: number;
|
|
77
77
|
/**
|
|
@@ -95,31 +95,20 @@ interface AudioProcessingConfig {
|
|
|
95
95
|
* Default: 0.001
|
|
96
96
|
*/
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
|
-
/**
|
|
99
|
-
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.01
|
|
101
|
-
*/
|
|
102
|
-
noiseFloorAdaptRateQuiet?: number;
|
|
103
|
-
/**
|
|
104
|
-
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Applied when instantaneous SNR < 12dB (background noise).
|
|
106
|
-
* Default: 0.1 (fast tracking of rising noise)
|
|
107
|
-
*/
|
|
108
|
-
noiseFloorAdaptRateLoud?: number;
|
|
109
98
|
/**
|
|
110
99
|
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
111
|
-
* Default:
|
|
100
|
+
* Default: 8.0
|
|
112
101
|
*/
|
|
113
102
|
minSNR?: number;
|
|
114
103
|
/**
|
|
115
104
|
* SNR range in dB for probability scaling.
|
|
116
|
-
* Default:
|
|
105
|
+
* Default: 12.0 (probability scales from minSNR to minSNR+snrRange)
|
|
117
106
|
*/
|
|
118
107
|
snrRange?: number;
|
|
119
108
|
/**
|
|
120
109
|
* Minimum absolute RMS energy to consider as speech.
|
|
121
|
-
* Prevents triggering on
|
|
122
|
-
* Default: 0.
|
|
110
|
+
* Prevents triggering on quiet background noise.
|
|
111
|
+
* Default: 0.01 (approx -40dB, typical voice level)
|
|
123
112
|
*/
|
|
124
113
|
minEnergy?: number;
|
|
125
114
|
};
|
package/dist/types.d.ts
CHANGED
|
@@ -70,8 +70,8 @@ interface AudioProcessingConfig {
|
|
|
70
70
|
preRollMs?: number;
|
|
71
71
|
/**
|
|
72
72
|
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
-
* Filters out
|
|
74
|
-
* Default:
|
|
73
|
+
* Filters out brief transients like keyboard clicks.
|
|
74
|
+
* Default: 250ms (aggressive transient rejection)
|
|
75
75
|
*/
|
|
76
76
|
minSpeechDurationMs?: number;
|
|
77
77
|
/**
|
|
@@ -95,31 +95,20 @@ interface AudioProcessingConfig {
|
|
|
95
95
|
* Default: 0.001
|
|
96
96
|
*/
|
|
97
97
|
initialNoiseFloor?: number;
|
|
98
|
-
/**
|
|
99
|
-
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
-
* Default: 0.01
|
|
101
|
-
*/
|
|
102
|
-
noiseFloorAdaptRateQuiet?: number;
|
|
103
|
-
/**
|
|
104
|
-
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
-
* Applied when instantaneous SNR < 12dB (background noise).
|
|
106
|
-
* Default: 0.1 (fast tracking of rising noise)
|
|
107
|
-
*/
|
|
108
|
-
noiseFloorAdaptRateLoud?: number;
|
|
109
98
|
/**
|
|
110
99
|
* Minimum SNR (Signal-to-Noise Ratio) in dB for speech detection.
|
|
111
|
-
* Default:
|
|
100
|
+
* Default: 8.0
|
|
112
101
|
*/
|
|
113
102
|
minSNR?: number;
|
|
114
103
|
/**
|
|
115
104
|
* SNR range in dB for probability scaling.
|
|
116
|
-
* Default:
|
|
105
|
+
* Default: 12.0 (probability scales from minSNR to minSNR+snrRange)
|
|
117
106
|
*/
|
|
118
107
|
snrRange?: number;
|
|
119
108
|
/**
|
|
120
109
|
* Minimum absolute RMS energy to consider as speech.
|
|
121
|
-
* Prevents triggering on
|
|
122
|
-
* Default: 0.
|
|
110
|
+
* Prevents triggering on quiet background noise.
|
|
111
|
+
* Default: 0.01 (approx -40dB, typical voice level)
|
|
123
112
|
*/
|
|
124
113
|
minEnergy?: number;
|
|
125
114
|
};
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -27,11 +27,9 @@ var createEnergyVadWorkletCode = (vadConfig) => {
|
|
|
27
27
|
const energyParams = vadConfig?.energyVad || {};
|
|
28
28
|
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
29
|
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
-
const
|
|
31
|
-
const
|
|
32
|
-
const
|
|
33
|
-
const snrRange = energyParams.snrRange ?? 10;
|
|
34
|
-
const minEnergy = energyParams.minEnergy ?? 1e-3;
|
|
30
|
+
const minSNR = energyParams.minSNR ?? 8;
|
|
31
|
+
const snrRange = energyParams.snrRange ?? 12;
|
|
32
|
+
const minEnergy = energyParams.minEnergy ?? 0.01;
|
|
35
33
|
return `
|
|
36
34
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
37
35
|
constructor() {
|
|
@@ -39,8 +37,6 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
39
37
|
this.smoothing = ${smoothing};
|
|
40
38
|
this.energy = 0;
|
|
41
39
|
this.noiseFloor = ${initialNoiseFloor};
|
|
42
|
-
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
43
|
-
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
44
40
|
this.minSNR = ${minSNR};
|
|
45
41
|
this.snrRange = ${snrRange};
|
|
46
42
|
this.minEnergy = ${minEnergy};
|
|
@@ -60,8 +56,11 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
60
56
|
|
|
61
57
|
// Calculate instantaneous RMS (Root Mean Square) energy
|
|
62
58
|
let sum = 0;
|
|
59
|
+
let peak = 0;
|
|
63
60
|
for (let i = 0; i < channel.length; i++) {
|
|
61
|
+
const sample = Math.abs(channel[i]);
|
|
64
62
|
sum += channel[i] * channel[i];
|
|
63
|
+
peak = Math.max(peak, sample);
|
|
65
64
|
}
|
|
66
65
|
const instantRms = Math.sqrt(sum / channel.length);
|
|
67
66
|
|
|
@@ -69,32 +68,21 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
69
68
|
// this.energy acts as the smoothed RMS value
|
|
70
69
|
this.energy = this.energy * this.smoothing + instantRms * (1 - this.smoothing);
|
|
71
70
|
|
|
72
|
-
//
|
|
73
|
-
//
|
|
74
|
-
//
|
|
75
|
-
const
|
|
76
|
-
const
|
|
77
|
-
|
|
78
|
-
// Adapt the noise floor based on instantaneous SNR
|
|
79
|
-
if (instantRms < this.noiseFloor) {
|
|
80
|
-
// Signal is quieter than noise floor, adapt downwards quickly
|
|
81
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + instantRms * this.noiseFloorAdaptRateQuiet;
|
|
82
|
-
} else if (instantSnrDb < 12) {
|
|
83
|
-
// Signal is louder but SNR is low (< 12dB) - likely just louder background noise
|
|
84
|
-
// Adapt upwards at normal rate to track rising noise
|
|
85
|
-
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + instantRms * this.noiseFloorAdaptRateLoud;
|
|
86
|
-
} else {
|
|
87
|
-
// Signal has high SNR (>= 12dB) - likely speech or transient
|
|
88
|
-
// Adapt VERY slowly to avoid "chasing" speech
|
|
89
|
-
const slowRate = this.noiseFloorAdaptRateLoud * 0.02;
|
|
90
|
-
this.noiseFloor = this.noiseFloor * (1 - slowRate) + instantRms * slowRate;
|
|
91
|
-
}
|
|
71
|
+
// Calculate Crest Factor (peak-to-RMS ratio)
|
|
72
|
+
// Voice typically has crest factor of 2-4 (6-12dB)
|
|
73
|
+
// Keyboard clicks have crest factor of 10-30+ (20-30dB)
|
|
74
|
+
const crestFactor = peak / (instantRms + 1e-10);
|
|
75
|
+
const crestFactorDb = 20 * Math.log10(Math.max(1e-6, crestFactor));
|
|
92
76
|
|
|
93
|
-
//
|
|
94
|
-
//
|
|
95
|
-
|
|
77
|
+
// FIXED noise floor with minimal adaptation
|
|
78
|
+
// Only adapt within strict bounds to prevent drift
|
|
79
|
+
const targetFloor = Math.max(0.0003, Math.min(0.003, instantRms));
|
|
80
|
+
this.noiseFloor = this.noiseFloor * 0.995 + targetFloor * 0.005;
|
|
81
|
+
|
|
82
|
+
// Hard clamp to prevent any drift outside acceptable range
|
|
83
|
+
this.noiseFloor = Math.max(0.0003, Math.min(0.003, this.noiseFloor));
|
|
96
84
|
|
|
97
|
-
//
|
|
85
|
+
// Calculate Signal-to-Noise Ratio (SNR) in dB using smoothed energy
|
|
98
86
|
const snr = this.energy / (this.noiseFloor + 1e-6);
|
|
99
87
|
const snrDb = 20 * Math.log10(Math.max(1e-6, snr));
|
|
100
88
|
|
|
@@ -103,11 +91,20 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
103
91
|
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
104
92
|
let probability = Math.min(1, Math.max(0, (snrDb - this.minSNR) / this.snrRange));
|
|
105
93
|
|
|
106
|
-
// Apply absolute energy threshold
|
|
107
|
-
// We use a soft threshold to avoid abrupt cutting
|
|
94
|
+
// Apply absolute energy threshold with soft knee
|
|
108
95
|
if (this.energy < this.minEnergy) {
|
|
109
96
|
const energyRatio = this.energy / (this.minEnergy + 1e-6);
|
|
110
|
-
probability *= Math.pow(energyRatio, 2);
|
|
97
|
+
probability *= Math.pow(energyRatio, 2);
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
// Apply crest factor penalty
|
|
101
|
+
// Reject signals with high crest factor (sharp transients like keyboard clicks)
|
|
102
|
+
// Voice: 6-12dB, Keyboard: 20-30dB
|
|
103
|
+
// We penalize anything above 14dB
|
|
104
|
+
if (crestFactorDb > 14) {
|
|
105
|
+
const excess = crestFactorDb - 14;
|
|
106
|
+
const penalty = Math.max(0, 1 - (excess / 10)); // Linear falloff over 10dB
|
|
107
|
+
probability *= penalty;
|
|
111
108
|
}
|
|
112
109
|
|
|
113
110
|
this.port.postMessage({ probability, snr: snrDb, noiseFloor: this.noiseFloor, rms: this.energy });
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.js
CHANGED
|
@@ -44,17 +44,15 @@ var VADStateMachine = class {
|
|
|
44
44
|
// Smooth for natural speech
|
|
45
45
|
preRollMs: config?.preRollMs ?? 250,
|
|
46
46
|
// Generous pre-roll
|
|
47
|
-
minSpeechDurationMs: config?.minSpeechDurationMs ??
|
|
48
|
-
//
|
|
47
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 250,
|
|
48
|
+
// Aggressive transient rejection
|
|
49
49
|
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
50
50
|
energyVad: {
|
|
51
51
|
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
52
52
|
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
snrRange: config?.energyVad?.snrRange ?? 10,
|
|
57
|
-
minEnergy: config?.energyVad?.minEnergy ?? 1e-3
|
|
53
|
+
minSNR: config?.energyVad?.minSNR ?? 8,
|
|
54
|
+
snrRange: config?.energyVad?.snrRange ?? 12,
|
|
55
|
+
minEnergy: config?.energyVad?.minEnergy ?? 0.01
|
|
58
56
|
}
|
|
59
57
|
};
|
|
60
58
|
this.lastSilenceTime = Date.now();
|
package/dist/vad/vad-state.mjs
CHANGED