@tensamin/audio 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +217 -54
- package/dist/{chunk-XMTQPMQ6.mjs → chunk-GVKCBKW6.mjs} +1 -1
- package/dist/{chunk-6P2RDBW5.mjs → chunk-H5UKZU2Y.mjs} +1 -1
- package/dist/chunk-N553RHTI.mjs +93 -0
- package/dist/chunk-VEJXAEMM.mjs +136 -0
- package/dist/{chunk-EXH2PNUE.mjs → chunk-XXTNAUYX.mjs} +133 -34
- package/dist/extensibility/plugins.js +52 -14
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +225 -54
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +225 -54
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +225 -54
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +118 -10
- package/dist/types.d.ts +118 -10
- package/dist/vad/vad-node.d.mts +2 -0
- package/dist/vad/vad-node.d.ts +2 -0
- package/dist/vad/vad-node.js +52 -14
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.d.mts +1 -0
- package/dist/vad/vad-state.d.ts +1 -0
- package/dist/vad/vad-state.js +42 -8
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
- package/dist/chunk-JJASCVEW.mjs +0 -59
- package/dist/chunk-R5JVHKWA.mjs +0 -98
package/dist/types.d.ts
CHANGED
|
@@ -35,46 +35,154 @@ interface AudioProcessingConfig {
|
|
|
35
35
|
vad?: {
|
|
36
36
|
enabled: boolean;
|
|
37
37
|
/**
|
|
38
|
-
* Plugin name to use. Defaults to '
|
|
38
|
+
* Plugin name to use. Defaults to 'energy-vad'.
|
|
39
39
|
*/
|
|
40
40
|
pluginName?: string;
|
|
41
41
|
/**
|
|
42
42
|
* Probability threshold for speech onset (0-1).
|
|
43
|
-
*
|
|
43
|
+
* When VAD probability rises above this, audio is unmuted.
|
|
44
|
+
* Lower = more sensitive (catches quiet speech, may include noise)
|
|
45
|
+
* Higher = less sensitive (only confident speech, may clip quiet parts)
|
|
46
|
+
* Default: 0.6 (optimized for voice-only)
|
|
44
47
|
*/
|
|
45
48
|
startThreshold?: number;
|
|
46
49
|
/**
|
|
47
50
|
* Probability threshold for speech offset (0-1).
|
|
48
|
-
*
|
|
51
|
+
* When VAD probability drops below this (after hangover), audio is muted.
|
|
52
|
+
* Lower = keeps audio on longer (less aggressive gating)
|
|
53
|
+
* Higher = mutes faster (more aggressive noise suppression)
|
|
54
|
+
* Default: 0.45 (balanced voice detection)
|
|
49
55
|
*/
|
|
50
56
|
stopThreshold?: number;
|
|
51
57
|
/**
|
|
52
|
-
* Time in ms to wait after speech stops before
|
|
53
|
-
*
|
|
58
|
+
* Time in ms to wait after speech stops before muting.
|
|
59
|
+
* Prevents rapid on/off toggling during pauses.
|
|
60
|
+
* Lower = more aggressive gating, may clip between words
|
|
61
|
+
* Higher = smoother but may let trailing noise through
|
|
62
|
+
* Default: 400ms (optimized for natural speech)
|
|
54
63
|
*/
|
|
55
64
|
hangoverMs?: number;
|
|
56
65
|
/**
|
|
57
|
-
* Time in ms of audio to buffer before speech onset
|
|
58
|
-
*
|
|
66
|
+
* Time in ms of audio to buffer before speech onset.
|
|
67
|
+
* Prevents cutting off the beginning of speech.
|
|
68
|
+
* Default: 250ms (generous pre-roll for voice)
|
|
59
69
|
*/
|
|
60
70
|
preRollMs?: number;
|
|
71
|
+
/**
|
|
72
|
+
* Minimum speech duration in ms to consider it valid speech.
|
|
73
|
+
* Filters out very brief noise spikes.
|
|
74
|
+
* Default: 100ms
|
|
75
|
+
*/
|
|
76
|
+
minSpeechDurationMs?: number;
|
|
77
|
+
/**
|
|
78
|
+
* Minimum silence duration in ms before allowing another speech segment.
|
|
79
|
+
* Prevents false positives from quick noise bursts.
|
|
80
|
+
* Default: 150ms
|
|
81
|
+
*/
|
|
82
|
+
minSilenceDurationMs?: number;
|
|
83
|
+
/**
|
|
84
|
+
* Advanced: Energy VAD specific parameters
|
|
85
|
+
*/
|
|
86
|
+
energyVad?: {
|
|
87
|
+
/**
|
|
88
|
+
* Smoothing factor for energy calculation (0-1).
|
|
89
|
+
* Higher = more smoothing, slower to react
|
|
90
|
+
* Default: 0.95
|
|
91
|
+
*/
|
|
92
|
+
smoothing?: number;
|
|
93
|
+
/**
|
|
94
|
+
* Initial noise floor estimate.
|
|
95
|
+
* Default: 0.001
|
|
96
|
+
*/
|
|
97
|
+
initialNoiseFloor?: number;
|
|
98
|
+
/**
|
|
99
|
+
* Rate at which noise floor adapts to quiet signals (0-1).
|
|
100
|
+
* Default: 0.01
|
|
101
|
+
*/
|
|
102
|
+
noiseFloorAdaptRateQuiet?: number;
|
|
103
|
+
/**
|
|
104
|
+
* Rate at which noise floor adapts to loud signals (0-1).
|
|
105
|
+
* Default: 0.001 (slower adaptation for speech)
|
|
106
|
+
*/
|
|
107
|
+
noiseFloorAdaptRateLoud?: number;
|
|
108
|
+
/**
|
|
109
|
+
* Minimum SNR (Signal-to-Noise Ratio) for speech detection.
|
|
110
|
+
* Default: 2.0 (voice is 2x louder than noise floor)
|
|
111
|
+
*/
|
|
112
|
+
minSNR?: number;
|
|
113
|
+
/**
|
|
114
|
+
* SNR range for probability scaling.
|
|
115
|
+
* Default: 8.0 (probability scales from minSNR to minSNR+snrRange)
|
|
116
|
+
*/
|
|
117
|
+
snrRange?: number;
|
|
118
|
+
};
|
|
61
119
|
};
|
|
62
120
|
/**
|
|
63
121
|
* Output gain and muting configuration.
|
|
64
122
|
*/
|
|
65
123
|
output?: {
|
|
66
124
|
/**
|
|
67
|
-
* Gain to apply when speaking (0-
|
|
125
|
+
* Gain to apply when speaking (0-infinity).
|
|
126
|
+
* Values > 1.0 will amplify the voice.
|
|
127
|
+
* Default: 1.0 (unity gain)
|
|
68
128
|
*/
|
|
69
129
|
speechGain?: number;
|
|
70
130
|
/**
|
|
71
|
-
* Gain to apply when silent (0-1).
|
|
131
|
+
* Gain to apply when silent (0-1).
|
|
132
|
+
* 0.0 = complete mute (recommended for voice-only)
|
|
133
|
+
* 0.1-0.3 = allow some background ambience
|
|
134
|
+
* Default: 0.0 (full mute for voice-only)
|
|
72
135
|
*/
|
|
73
136
|
silenceGain?: number;
|
|
74
137
|
/**
|
|
75
|
-
* Time in seconds to ramp gain changes.
|
|
138
|
+
* Time in seconds to ramp gain changes.
|
|
139
|
+
* Lower = faster transitions (may cause clicks)
|
|
140
|
+
* Higher = smoother transitions (may sound sluggish)
|
|
141
|
+
* Default: 0.015 (fast but smooth for voice)
|
|
76
142
|
*/
|
|
77
143
|
gainRampTime?: number;
|
|
144
|
+
/**
|
|
145
|
+
* Apply additional gain reduction during the transition to silence.
|
|
146
|
+
* Helps create cleaner cutoffs without abrupt clicks.
|
|
147
|
+
* Default: true
|
|
148
|
+
*/
|
|
149
|
+
smoothTransitions?: boolean;
|
|
150
|
+
/**
|
|
151
|
+
* Maximum gain in dB to apply (prevents clipping).
|
|
152
|
+
* Default: 6.0 dB (roughly 2x amplitude)
|
|
153
|
+
*/
|
|
154
|
+
maxGainDb?: number;
|
|
155
|
+
/**
|
|
156
|
+
* Apply dynamic range compression when speaking.
|
|
157
|
+
* Makes quiet parts louder and loud parts quieter.
|
|
158
|
+
* Default: false (transparent audio)
|
|
159
|
+
*/
|
|
160
|
+
enableCompression?: boolean;
|
|
161
|
+
/**
|
|
162
|
+
* Compression settings (when enabled)
|
|
163
|
+
*/
|
|
164
|
+
compression?: {
|
|
165
|
+
/**
|
|
166
|
+
* Threshold in dB above which compression starts.
|
|
167
|
+
* Default: -24.0 dB
|
|
168
|
+
*/
|
|
169
|
+
threshold?: number;
|
|
170
|
+
/**
|
|
171
|
+
* Compression ratio (N:1).
|
|
172
|
+
* Default: 3.0 (3:1 ratio)
|
|
173
|
+
*/
|
|
174
|
+
ratio?: number;
|
|
175
|
+
/**
|
|
176
|
+
* Attack time in seconds.
|
|
177
|
+
* Default: 0.003 (3ms)
|
|
178
|
+
*/
|
|
179
|
+
attack?: number;
|
|
180
|
+
/**
|
|
181
|
+
* Release time in seconds.
|
|
182
|
+
* Default: 0.05 (50ms)
|
|
183
|
+
*/
|
|
184
|
+
release?: number;
|
|
185
|
+
};
|
|
78
186
|
};
|
|
79
187
|
/**
|
|
80
188
|
* LiveKit integration configuration.
|
package/dist/vad/vad-node.d.mts
CHANGED
|
@@ -3,7 +3,9 @@ import 'mitt';
|
|
|
3
3
|
|
|
4
4
|
declare class EnergyVADPlugin implements VADPlugin {
|
|
5
5
|
name: string;
|
|
6
|
+
private workletNode;
|
|
6
7
|
createNode(context: AudioContext, config: AudioProcessingConfig["vad"], onDecision: (probability: number) => void): Promise<AudioNode>;
|
|
8
|
+
updateSpeakingState(isSpeaking: boolean): void;
|
|
7
9
|
}
|
|
8
10
|
|
|
9
11
|
export { EnergyVADPlugin };
|
package/dist/vad/vad-node.d.ts
CHANGED
|
@@ -3,7 +3,9 @@ import 'mitt';
|
|
|
3
3
|
|
|
4
4
|
declare class EnergyVADPlugin implements VADPlugin {
|
|
5
5
|
name: string;
|
|
6
|
+
private workletNode;
|
|
6
7
|
createNode(context: AudioContext, config: AudioProcessingConfig["vad"], onDecision: (probability: number) => void): Promise<AudioNode>;
|
|
8
|
+
updateSpeakingState(isSpeaking: boolean): void;
|
|
7
9
|
}
|
|
8
10
|
|
|
9
11
|
export { EnergyVADPlugin };
|
package/dist/vad/vad-node.js
CHANGED
|
@@ -23,13 +23,32 @@ __export(vad_node_exports, {
|
|
|
23
23
|
EnergyVADPlugin: () => EnergyVADPlugin
|
|
24
24
|
});
|
|
25
25
|
module.exports = __toCommonJS(vad_node_exports);
|
|
26
|
-
var energyVadWorkletCode = `
|
|
26
|
+
var createEnergyVadWorkletCode = (vadConfig) => {
|
|
27
|
+
const energyParams = vadConfig?.energyVad || {};
|
|
28
|
+
const smoothing = energyParams.smoothing ?? 0.95;
|
|
29
|
+
const initialNoiseFloor = energyParams.initialNoiseFloor ?? 1e-3;
|
|
30
|
+
const noiseFloorAdaptRateQuiet = energyParams.noiseFloorAdaptRateQuiet ?? 0.01;
|
|
31
|
+
const noiseFloorAdaptRateLoud = energyParams.noiseFloorAdaptRateLoud ?? 1e-3;
|
|
32
|
+
const minSNR = energyParams.minSNR ?? 2;
|
|
33
|
+
const snrRange = energyParams.snrRange ?? 8;
|
|
34
|
+
return `
|
|
27
35
|
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
28
36
|
constructor() {
|
|
29
37
|
super();
|
|
30
|
-
this.smoothing = 0.95;
|
|
38
|
+
this.smoothing = ${smoothing};
|
|
31
39
|
this.energy = 0;
|
|
32
|
-
this.noiseFloor = 0.001;
|
|
40
|
+
this.noiseFloor = ${initialNoiseFloor};
|
|
41
|
+
this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
|
|
42
|
+
this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
|
|
43
|
+
this.minSNR = ${minSNR};
|
|
44
|
+
this.snrRange = ${snrRange};
|
|
45
|
+
this.isSpeaking = false;
|
|
46
|
+
|
|
47
|
+
this.port.onmessage = (event) => {
|
|
48
|
+
if (event.data && event.data.isSpeaking !== undefined) {
|
|
49
|
+
this.isSpeaking = event.data.isSpeaking;
|
|
50
|
+
}
|
|
51
|
+
};
|
|
33
52
|
}
|
|
34
53
|
|
|
35
54
|
process(inputs, outputs, parameters) {
|
|
@@ -37,41 +56,54 @@ class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
|
37
56
|
if (!input || !input.length) return true;
|
|
38
57
|
const channel = input[0];
|
|
39
58
|
|
|
40
|
-
// Calculate RMS
|
|
59
|
+
// Calculate RMS (Root Mean Square) energy
|
|
41
60
|
let sum = 0;
|
|
42
61
|
for (let i = 0; i < channel.length; i++) {
|
|
43
62
|
sum += channel[i] * channel[i];
|
|
44
63
|
}
|
|
45
64
|
const rms = Math.sqrt(sum / channel.length);
|
|
46
65
|
|
|
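47
|
-
// Simple adaptive noise floor (very basic)
|
|
48
|
-
if (rms < this.noiseFloor) {
|
|
49
|
-
this.noiseFloor = this.noiseFloor * 0.99 + rms * 0.01;
|
|
50
|
-
} else {
|
|
51
|
-
this.noiseFloor = this.noiseFloor * 0.999 + rms * 0.001;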
|
|
66
|
+
// Adaptive noise floor estimation - ONLY during silence
|
|
67
|
+
// This prevents the noise floor from rising during speech
|
|
68
|
+
if (!this.isSpeaking) {
|
|
69
|
+
if (rms < this.noiseFloor) {
|
|
70
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
|
|
71
|
+
} else {
|
|
72
|
+
// Even during silence, if we detect a loud signal, adapt very slowly
|
|
73
|
+
// This could be brief noise we haven't classified as speech yet
|
|
74
|
+
this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
|
|
75
|
+
}
|
|
52
76
|
}
|
|
77
|
+
// During speech, freeze the noise floor to maintain consistent detection
|
|
53
78
|
|
|
54
|
-
// Calculate "probability" based on SNR
|
|
55
|
-
// This is a heuristic mapping from energy to 0-1
|
|
79
|
+
// Calculate Signal-to-Noise Ratio (SNR)
|
|
56
80
|
const snr = rms / (this.noiseFloor + 1e-6);
|
|
57
|
-
const probability = Math.min(1, Math.max(0, (snr - 1.5) / 10)); // Arbitrary scaling
|
|
81
|
+
|
|
82
|
+
// Map SNR to probability (0-1)
|
|
83
|
+
// Probability is 0 when SNR <= minSNR
|
|
84
|
+
// Probability scales linearly from 0 to 1 between minSNR and (minSNR + snrRange)
|
|
85
|
+
// Probability is 1 when SNR >= (minSNR + snrRange)
|
|
86
|
+
const probability = Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange));
|
|
58
87
|
|
|
59
|
-
this.port.postMessage({ probability });
|
|
88
|
+
this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });
|
|
60
89
|
|
|
61
90
|
return true;
|
|
62
91
|
}
|
|
63
92
|
}
|
|
64
93
|
registerProcessor('energy-vad-processor', EnergyVadProcessor);
|
|
65
94
|
`;
|
|
95
|
+
};
|
|
66
96
|
var EnergyVADPlugin = class {
|
|
67
97
|
name = "energy-vad";
|
|
98
|
+
workletNode = null;
|
|
68
99
|
async createNode(context, config, onDecision) {
|
|
69
100
|
if (!config?.enabled) {
|
|
70
101
|
console.log("VAD disabled, using passthrough node");
|
|
71
102
|
const pass = context.createGain();
|
|
72
103
|
return pass;
|
|
73
104
|
}
|
|
74
|
-
const blob = new Blob([energyVadWorkletCode], {
|
|
105
|
+
const workletCode = createEnergyVadWorkletCode(config);
|
|
106
|
+
const blob = new Blob([workletCode], {
|
|
75
107
|
type: "application/javascript"
|
|
76
108
|
});
|
|
77
109
|
const url = URL.createObjectURL(blob);
|
|
@@ -90,6 +122,7 @@ var EnergyVADPlugin = class {
|
|
|
90
122
|
let node;
|
|
91
123
|
try {
|
|
92
124
|
node = new AudioWorkletNode(context, "energy-vad-processor");
|
|
125
|
+
this.workletNode = node;
|
|
93
126
|
console.log("Energy VAD node created successfully");
|
|
94
127
|
} catch (e) {
|
|
95
128
|
const error = new Error(
|
|
@@ -115,6 +148,11 @@ var EnergyVADPlugin = class {
|
|
|
115
148
|
};
|
|
116
149
|
return node;
|
|
117
150
|
}
|
|
151
|
+
updateSpeakingState(isSpeaking) {
|
|
152
|
+
if (this.workletNode) {
|
|
153
|
+
this.workletNode.port.postMessage({ isSpeaking });
|
|
154
|
+
}
|
|
155
|
+
}
|
|
118
156
|
};
|
|
119
157
|
// Annotate the CommonJS export names for ESM import in node:
|
|
120
158
|
0 && (module.exports = {
|
package/dist/vad/vad-node.mjs
CHANGED
package/dist/vad/vad-state.d.mts
CHANGED
|
@@ -6,6 +6,7 @@ declare class VADStateMachine {
|
|
|
6
6
|
private currentState;
|
|
7
7
|
private lastSpeechTime;
|
|
8
8
|
private speechStartTime;
|
|
9
|
+
private lastSilenceTime;
|
|
9
10
|
private frameDurationMs;
|
|
10
11
|
constructor(config: AudioProcessingConfig["vad"]);
|
|
11
12
|
updateConfig(config: Partial<AudioProcessingConfig["vad"]>): void;
|
package/dist/vad/vad-state.d.ts
CHANGED
|
@@ -6,6 +6,7 @@ declare class VADStateMachine {
|
|
|
6
6
|
private currentState;
|
|
7
7
|
private lastSpeechTime;
|
|
8
8
|
private speechStartTime;
|
|
9
|
+
private lastSilenceTime;
|
|
9
10
|
private frameDurationMs;
|
|
10
11
|
constructor(config: AudioProcessingConfig["vad"]);
|
|
11
12
|
updateConfig(config: Partial<AudioProcessingConfig["vad"]>): void;
|
package/dist/vad/vad-state.js
CHANGED
|
@@ -28,31 +28,60 @@ var VADStateMachine = class {
|
|
|
28
28
|
currentState = "silent";
|
|
29
29
|
lastSpeechTime = 0;
|
|
30
30
|
speechStartTime = 0;
|
|
31
|
+
lastSilenceTime = 0;
|
|
31
32
|
frameDurationMs = 20;
|
|
32
33
|
// Assumed frame duration, updated by calls
|
|
33
34
|
constructor(config) {
|
|
34
35
|
this.config = {
|
|
35
36
|
enabled: config?.enabled ?? true,
|
|
36
37
|
pluginName: config?.pluginName ?? "energy-vad",
|
|
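37
|
-
startThreshold: config?.startThreshold ?? 0.5,
|
|
38
|
-
stopThreshold: config?.stopThreshold ?? 0.4,
|
|
39
|
-
hangoverMs: config?.hangoverMs ?? 300,
|
|
40
|
-
preRollMs: config?.preRollMs ?? 200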
|
|
38
|
+
// Voice-optimized defaults
|
|
39
|
+
startThreshold: config?.startThreshold ?? 0.6,
|
|
40
|
+
// Higher threshold to avoid noise
|
|
41
|
+
stopThreshold: config?.stopThreshold ?? 0.45,
|
|
42
|
+
// Balanced for voice
|
|
43
|
+
hangoverMs: config?.hangoverMs ?? 400,
|
|
44
|
+
// Smooth for natural speech
|
|
45
|
+
preRollMs: config?.preRollMs ?? 250,
|
|
46
|
+
// Generous pre-roll
|
|
47
|
+
minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
|
|
48
|
+
minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
|
|
49
|
+
energyVad: {
|
|
50
|
+
smoothing: config?.energyVad?.smoothing ?? 0.95,
|
|
51
|
+
initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
|
|
52
|
+
noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
|
|
53
|
+
noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
|
|
54
|
+
minSNR: config?.energyVad?.minSNR ?? 2,
|
|
55
|
+
snrRange: config?.energyVad?.snrRange ?? 8
|
|
56
|
+
}
|
|
41
57
|
};
|
|
58
|
+
this.lastSilenceTime = Date.now();
|
|
42
59
|
}
|
|
43
60
|
updateConfig(config) {
|
|
44
61
|
this.config = { ...this.config, ...config };
|
|
45
62
|
}
|
|
46
63
|
processFrame(probability, timestamp) {
|
|
47
|
-
const { startThreshold, stopThreshold, hangoverMs } = this.config;
|
|
64
|
+
const {
|
|
65
|
+
startThreshold,
|
|
66
|
+
stopThreshold,
|
|
67
|
+
hangoverMs,
|
|
68
|
+
minSpeechDurationMs,
|
|
69
|
+
minSilenceDurationMs
|
|
70
|
+
} = this.config;
|
|
48
71
|
let newState = this.currentState;
|
|
49
72
|
if (this.currentState === "silent" || this.currentState === "speech_ending") {
|
|
50
73
|
if (probability >= startThreshold) {
|
|
51
|
-
newState = "speech_starting";
|
|
52
|
-
this.speechStartTime = timestamp;
|
|
53
|
-
this.lastSpeechTime = timestamp;
|
|
74
|
+
const silenceDuration = timestamp - this.lastSilenceTime;
|
|
75
|
+
if (silenceDuration >= minSilenceDurationMs) {
|
|
76
|
+
newState = "speech_starting";
|
|
77
|
+
this.speechStartTime = timestamp;
|
|
78
|
+
this.lastSpeechTime = timestamp;
|
|
79
|
+
} else {
|
|
80
|
+
newState = "silent";
|
|
81
|
+
}
|
|
54
82
|
} else {
|
|
55
83
|
newState = "silent";
|
|
84
|
+
this.lastSilenceTime = timestamp;
|
|
56
85
|
}
|
|
57
86
|
} else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
|
|
58
87
|
if (probability >= stopThreshold) {
|
|
@@ -60,10 +89,15 @@ var VADStateMachine = class {
|
|
|
60
89
|
this.lastSpeechTime = timestamp;
|
|
61
90
|
} else {
|
|
62
91
|
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
92
|
+
const speechDuration = timestamp - this.speechStartTime;
|
|
63
93
|
if (timeSinceSpeech < hangoverMs) {
|
|
64
94
|
newState = "speaking";
|
|
95
|
+
} else if (speechDuration < minSpeechDurationMs) {
|
|
96
|
+
newState = "silent";
|
|
97
|
+
this.lastSilenceTime = timestamp;
|
|
65
98
|
} else {
|
|
66
99
|
newState = "speech_ending";
|
|
100
|
+
this.lastSilenceTime = timestamp;
|
|
67
101
|
}
|
|
68
102
|
}
|
|
69
103
|
}
|
package/dist/vad/vad-state.mjs
CHANGED
package/package.json
CHANGED
package/dist/chunk-JJASCVEW.mjs
DELETED
|
@@ -1,59 +0,0 @@
|
|
|
1
|
-
// src/vad/vad-state.ts
|
|
2
|
-
var VADStateMachine = class {
|
|
3
|
-
config;
|
|
4
|
-
currentState = "silent";
|
|
5
|
-
lastSpeechTime = 0;
|
|
6
|
-
speechStartTime = 0;
|
|
7
|
-
frameDurationMs = 20;
|
|
8
|
-
// Assumed frame duration, updated by calls
|
|
9
|
-
constructor(config) {
|
|
10
|
-
this.config = {
|
|
11
|
-
enabled: config?.enabled ?? true,
|
|
12
|
-
pluginName: config?.pluginName ?? "energy-vad",
|
|
13
|
-
startThreshold: config?.startThreshold ?? 0.5,
|
|
14
|
-
stopThreshold: config?.stopThreshold ?? 0.4,
|
|
15
|
-
hangoverMs: config?.hangoverMs ?? 300,
|
|
16
|
-
preRollMs: config?.preRollMs ?? 200
|
|
17
|
-
};
|
|
18
|
-
}
|
|
19
|
-
updateConfig(config) {
|
|
20
|
-
this.config = { ...this.config, ...config };
|
|
21
|
-
}
|
|
22
|
-
processFrame(probability, timestamp) {
|
|
23
|
-
const { startThreshold, stopThreshold, hangoverMs } = this.config;
|
|
24
|
-
let newState = this.currentState;
|
|
25
|
-
if (this.currentState === "silent" || this.currentState === "speech_ending") {
|
|
26
|
-
if (probability >= startThreshold) {
|
|
27
|
-
newState = "speech_starting";
|
|
28
|
-
this.speechStartTime = timestamp;
|
|
29
|
-
this.lastSpeechTime = timestamp;
|
|
30
|
-
} else {
|
|
31
|
-
newState = "silent";
|
|
32
|
-
}
|
|
33
|
-
} else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
|
|
34
|
-
if (probability >= stopThreshold) {
|
|
35
|
-
newState = "speaking";
|
|
36
|
-
this.lastSpeechTime = timestamp;
|
|
37
|
-
} else {
|
|
38
|
-
const timeSinceSpeech = timestamp - this.lastSpeechTime;
|
|
39
|
-
if (timeSinceSpeech < hangoverMs) {
|
|
40
|
-
newState = "speaking";
|
|
41
|
-
} else {
|
|
42
|
-
newState = "speech_ending";
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
}
|
|
46
|
-
if (newState === "speech_starting") newState = "speaking";
|
|
47
|
-
if (newState === "speech_ending") newState = "silent";
|
|
48
|
-
this.currentState = newState;
|
|
49
|
-
return {
|
|
50
|
-
isSpeaking: newState === "speaking",
|
|
51
|
-
probability,
|
|
52
|
-
state: newState
|
|
53
|
-
};
|
|
54
|
-
}
|
|
55
|
-
};
|
|
56
|
-
|
|
57
|
-
export {
|
|
58
|
-
VADStateMachine
|
|
59
|
-
};
|
package/dist/chunk-R5JVHKWA.mjs
DELETED
|
@@ -1,98 +0,0 @@
|
|
|
1
|
-
// src/vad/vad-node.ts
|
|
2
|
-
var energyVadWorkletCode = `
|
|
3
|
-
class EnergyVadProcessor extends AudioWorkletProcessor {
|
|
4
|
-
constructor() {
|
|
5
|
-
super();
|
|
6
|
-
this.smoothing = 0.95;
|
|
7
|
-
this.energy = 0;
|
|
8
|
-
this.noiseFloor = 0.001;
|
|
9
|
-
}
|
|
10
|
-
|
|
11
|
-
process(inputs, outputs, parameters) {
|
|
12
|
-
const input = inputs[0];
|
|
13
|
-
if (!input || !input.length) return true;
|
|
14
|
-
const channel = input[0];
|
|
15
|
-
|
|
16
|
-
// Calculate RMS
|
|
17
|
-
let sum = 0;
|
|
18
|
-
for (let i = 0; i < channel.length; i++) {
|
|
19
|
-
sum += channel[i] * channel[i];
|
|
20
|
-
}
|
|
21
|
-
const rms = Math.sqrt(sum / channel.length);
|
|
22
|
-
|
|
23
|
-
// Simple adaptive noise floor (very basic)
|
|
24
|
-
if (rms < this.noiseFloor) {
|
|
25
|
-
this.noiseFloor = this.noiseFloor * 0.99 + rms * 0.01;
|
|
26
|
-
} else {
|
|
27
|
-
this.noiseFloor = this.noiseFloor * 0.999 + rms * 0.001;
|
|
28
|
-
}
|
|
29
|
-
|
|
30
|
-
// Calculate "probability" based on SNR
|
|
31
|
-
// This is a heuristic mapping from energy to 0-1
|
|
32
|
-
const snr = rms / (this.noiseFloor + 1e-6);
|
|
33
|
-
const probability = Math.min(1, Math.max(0, (snr - 1.5) / 10)); // Arbitrary scaling
|
|
34
|
-
|
|
35
|
-
this.port.postMessage({ probability });
|
|
36
|
-
|
|
37
|
-
return true;
|
|
38
|
-
}
|
|
39
|
-
}
|
|
40
|
-
registerProcessor('energy-vad-processor', EnergyVadProcessor);
|
|
41
|
-
`;
|
|
42
|
-
var EnergyVADPlugin = class {
|
|
43
|
-
name = "energy-vad";
|
|
44
|
-
async createNode(context, config, onDecision) {
|
|
45
|
-
if (!config?.enabled) {
|
|
46
|
-
console.log("VAD disabled, using passthrough node");
|
|
47
|
-
const pass = context.createGain();
|
|
48
|
-
return pass;
|
|
49
|
-
}
|
|
50
|
-
const blob = new Blob([energyVadWorkletCode], {
|
|
51
|
-
type: "application/javascript"
|
|
52
|
-
});
|
|
53
|
-
const url = URL.createObjectURL(blob);
|
|
54
|
-
try {
|
|
55
|
-
await context.audioWorklet.addModule(url);
|
|
56
|
-
console.log("Energy VAD worklet loaded successfully");
|
|
57
|
-
} catch (e) {
|
|
58
|
-
const error = new Error(
|
|
59
|
-
`Failed to load Energy VAD worklet: ${e instanceof Error ? e.message : String(e)}`
|
|
60
|
-
);
|
|
61
|
-
console.error(error.message);
|
|
62
|
-
URL.revokeObjectURL(url);
|
|
63
|
-
throw error;
|
|
64
|
-
}
|
|
65
|
-
URL.revokeObjectURL(url);
|
|
66
|
-
let node;
|
|
67
|
-
try {
|
|
68
|
-
node = new AudioWorkletNode(context, "energy-vad-processor");
|
|
69
|
-
console.log("Energy VAD node created successfully");
|
|
70
|
-
} catch (e) {
|
|
71
|
-
const error = new Error(
|
|
72
|
-
`Failed to create Energy VAD node: ${e instanceof Error ? e.message : String(e)}`
|
|
73
|
-
);
|
|
74
|
-
console.error(error.message);
|
|
75
|
-
throw error;
|
|
76
|
-
}
|
|
77
|
-
node.port.onmessage = (event) => {
|
|
78
|
-
try {
|
|
79
|
-
const { probability } = event.data;
|
|
80
|
-
if (typeof probability === "number" && !isNaN(probability)) {
|
|
81
|
-
onDecision(probability);
|
|
82
|
-
} else {
|
|
83
|
-
console.warn("Invalid VAD probability received:", event.data);
|
|
84
|
-
}
|
|
85
|
-
} catch (error) {
|
|
86
|
-
console.error("Error in VAD message handler:", error);
|
|
87
|
-
}
|
|
88
|
-
};
|
|
89
|
-
node.port.onmessageerror = (event) => {
|
|
90
|
-
console.error("VAD port message error:", event);
|
|
91
|
-
};
|
|
92
|
-
return node;
|
|
93
|
-
}
|
|
94
|
-
};
|
|
95
|
-
|
|
96
|
-
export {
|
|
97
|
-
EnergyVADPlugin
|
|
98
|
-
};
|