@tensamin/audio 0.1.3 → 0.1.4
- package/README.md +217 -54
- package/dist/{chunk-EXH2PNUE.mjs → chunk-AHBRT4RD.mjs} +128 -33
- package/dist/{chunk-XMTQPMQ6.mjs → chunk-ERJVV5JR.mjs} +1 -1
- package/dist/chunk-N553RHTI.mjs +93 -0
- package/dist/{chunk-R5JVHKWA.mjs → chunk-NMHKX64G.mjs} +32 -12
- package/dist/{chunk-6P2RDBW5.mjs → chunk-YOSTLLCS.mjs} +1 -1
- package/dist/extensibility/plugins.js +32 -12
- package/dist/extensibility/plugins.mjs +2 -2
- package/dist/index.js +200 -51
- package/dist/index.mjs +5 -5
- package/dist/livekit/integration.js +200 -51
- package/dist/livekit/integration.mjs +5 -5
- package/dist/pipeline/audio-pipeline.js +200 -51
- package/dist/pipeline/audio-pipeline.mjs +4 -4
- package/dist/types.d.mts +118 -10
- package/dist/types.d.ts +118 -10
- package/dist/vad/vad-node.js +32 -12
- package/dist/vad/vad-node.mjs +1 -1
- package/dist/vad/vad-state.d.mts +1 -0
- package/dist/vad/vad-state.d.ts +1 -0
- package/dist/vad/vad-state.js +42 -8
- package/dist/vad/vad-state.mjs +1 -1
- package/package.json +1 -1
- package/dist/chunk-JJASCVEW.mjs +0 -59
package/README.md
CHANGED
@@ -1,90 +1,56 @@
 # @tensamin/audio
 
-
+Audio processing library for the web with RNNoise-based noise suppression and Voice Activity Detection (VAD). Designed for voice communication applications with LiveKit integration support.
 
 ## Features
 
--
--
--
--
--
+- Configurable Voice Activity Detection with energy-based algorithm
+- RNNoise noise suppression via `@sapphi-red/web-noise-suppressor`
+- Automatic audio gating based on voice detection
+- Runtime configuration updates
+- LiveKit `LocalAudioTrack` integration
+- Plugin system for custom audio processors
+- Optional dynamic range compression
 
 ## Installation
 
 ```bash
 npm install @tensamin/audio livekit-client
-bun add @tensamin/audio livekit-client
-pnpm install @tensamin/audio livekit-client
 ```
 
-##
+## Requirements
 
-
-
-### For Noise Suppression (Optional)
-
-If you want to enable noise suppression, download these files from `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`:
+For noise suppression, the following files must be provided:
 
 - `rnnoise.wasm`
 - `rnnoise_simd.wasm`
-- `
+- `worklet.js`
 
-
+Available at: `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`
 
-
+Place these files in a publicly accessible directory (e.g., `public/audio-processor/`).
 
 ## Usage
 
-###
-
-If you want to use the pipeline without noise suppression or VAD (e.g., for testing or when features are not needed), you can disable them:
+### Basic Example
 
 ```ts
 import { createAudioPipeline } from "@tensamin/audio";
 
-// Get a stream
 const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
 const track = stream.getAudioTracks()[0];
 
-// Create pipeline
-const pipeline = await createAudioPipeline(track, {
-  noiseSuppression: { enabled: false },
-  vad: { enabled: false },
-});
-
-// Use the processed track
-const processedStream = new MediaStream([pipeline.processedTrack]);
-```
-
-### Basic Usage (Raw MediaStream)
-
-```ts
-import { createAudioPipeline } from "@tensamin/audio";
-
-// Get a stream
-const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-const track = stream.getAudioTracks()[0];
-
-// Create pipeline
 const pipeline = await createAudioPipeline(track, {
   noiseSuppression: {
     enabled: true,
     wasmUrl: "/audio-processor/rnnoise.wasm",
     simdUrl: "/audio-processor/rnnoise_simd.wasm",
-    workletUrl: "/audio-processor/
+    workletUrl: "/audio-processor/worklet.js",
   },
   vad: { enabled: true },
 });
 
-// Use the processed track
 const processedStream = new MediaStream([pipeline.processedTrack]);
-// audioElement.srcObject = processedStream;
-
-// Listen to VAD events
-pipeline.events.on("vadChange", (state) => {
-  console.log("Is Speaking:", state.isSpeaking);
-});
 ```
 
 ### LiveKit Integration
@@ -93,21 +59,218 @@ pipeline.events.on("vadChange", (state) => {
 import { attachProcessingToTrack } from "@tensamin/audio";
 import { LocalAudioTrack } from "livekit-client";
 
-// Assume you have a LocalAudioTrack
 const localTrack = await LocalAudioTrack.create();
 
-// Attach processing (replaces the underlying track)
 const pipeline = await attachProcessingToTrack(localTrack, {
   noiseSuppression: {
     enabled: true,
     wasmUrl: "/audio-processor/rnnoise.wasm",
     simdUrl: "/audio-processor/rnnoise_simd.wasm",
-    workletUrl: "/audio-processor/
+    workletUrl: "/audio-processor/worklet.js",
   },
   vad: { enabled: true },
-  livekit: { manageTrackMute: true },
+  livekit: { manageTrackMute: true },
 });
 
-// Publish the track
 await room.localParticipant.publishTrack(localTrack);
 ```
+
+### Monitoring VAD State
+
+```ts
+pipeline.events.on("vadChange", (state) => {
+  console.log("Speaking:", state.isSpeaking);
+  console.log("Probability:", state.probability);
+  console.log("State:", state.state);
+});
+```
+
+## Configuration
+
+### Voice Activity Detection
+
+```ts
+vad: {
+  enabled: boolean;
+  startThreshold: number; // Default: 0.6 (range: 0-1)
+  stopThreshold: number; // Default: 0.45 (range: 0-1)
+  hangoverMs: number; // Default: 400
+  preRollMs: number; // Default: 250
+  minSpeechDurationMs: number; // Default: 100
+  minSilenceDurationMs: number; // Default: 150
+  energyVad?: {
+    smoothing: number; // Default: 0.95
+    initialNoiseFloor: number; // Default: 0.001
+    noiseFloorAdaptRateQuiet: number; // Default: 0.01
+    noiseFloorAdaptRateLoud: number; // Default: 0.001
+    minSNR: number; // Default: 2.0
+    snrRange: number; // Default: 8.0
+  };
+}
+```
+
+**Threshold Parameters:**
+
+- `startThreshold`: Probability threshold to unmute audio
+- `stopThreshold`: Probability threshold to mute audio (after hangover)
+- `hangoverMs`: Delay before muting after speech stops
+- `preRollMs`: Audio buffer duration before speech onset
+- `minSpeechDurationMs`: Minimum duration to consider as valid speech
+- `minSilenceDurationMs`: Minimum silence duration between speech segments
+
+**Energy VAD Parameters:**
+
+- `smoothing`: Energy calculation smoothing factor (0-1)
+- `minSNR`: Minimum signal-to-noise ratio for speech detection
+- `snrRange`: Range for probability scaling from minSNR
+
+### Output Control
+
+```ts
+output: {
+  speechGain: number; // Default: 1.0
+  silenceGain: number; // Default: 0.0
+  gainRampTime: number; // Default: 0.015 (seconds)
+  smoothTransitions: boolean; // Default: true
+  maxGainDb: number; // Default: 6.0
+  enableCompression: boolean; // Default: false
+  compression?: {
+    threshold: number; // Default: -24.0 (dB)
+    ratio: number; // Default: 3.0
+    attack: number; // Default: 0.003 (seconds)
+    release: number; // Default: 0.05 (seconds)
+  };
+}
+```
+
+**Gain Parameters:**
+
+- `speechGain`: Gain multiplier when speaking (1.0 = unity)
+- `silenceGain`: Gain multiplier when silent (0.0 = mute)
+- `gainRampTime`: Transition duration for gain changes
+- `maxGainDb`: Maximum gain limit to prevent clipping
+
+**Compression Parameters:**
+
+- `threshold`: Level above which compression is applied
+- `ratio`: Compression ratio (e.g., 3.0 = 3:1)
+- `attack`: Time to reach full compression
+- `release`: Time to release compression
+
+### Runtime Configuration Updates
+
+```ts
+pipeline.setConfig({
+  vad: {
+    startThreshold: 0.7,
+    stopThreshold: 0.55,
+  },
+  output: {
+    speechGain: 1.3,
+  },
+});
+```
+
+## Configuration Examples
+
+### Noisy Environment
+
+```ts
+{
+  vad: {
+    startThreshold: 0.7,
+    stopThreshold: 0.55,
+    minSpeechDurationMs: 150,
+    energyVad: { minSNR: 3.0 }
+  }
+}
+```
+
+### Quiet Speaker
+
+```ts
+{
+  vad: {
+    startThreshold: 0.4,
+    stopThreshold: 0.25,
+    energyVad: { minSNR: 1.5 }
+  },
+  output: {
+    speechGain: 1.5
+  }
+}
+```
+
+### Natural Conversation
+
+```ts
+{
+  vad: {
+    startThreshold: 0.5,
+    stopThreshold: 0.3,
+    hangoverMs: 600,
+  },
+  output: {
+    silenceGain: 0.2
+  }
+}
+```
+
+## API Reference
+
+### `createAudioPipeline(track, config)`
+
+Creates an audio processing pipeline from a MediaStreamTrack.
+
+**Parameters:**
+
+- `track`: MediaStreamTrack - Source audio track
+- `config`: AudioProcessingConfig - Configuration object
+
+**Returns:** `Promise<AudioPipelineHandle>`
+
+### AudioPipelineHandle
+
+```ts
+interface AudioPipelineHandle {
+  processedTrack: MediaStreamTrack;
+  events: Emitter<AudioPipelineEvents>;
+  state: VADState;
+  setConfig(config: Partial<AudioProcessingConfig>): void;
+  dispose(): void;
+}
+```
+
+### AudioPipelineEvents
+
+```ts
+type AudioPipelineEvents = {
+  vadChange: VADState;
+  error: Error;
+};
+```
+
+### VADState
+
+```ts
+interface VADState {
+  isSpeaking: boolean;
+  probability: number;
+  state: "silent" | "speech_starting" | "speaking" | "speech_ending";
+}
+```
+
+## Default Values
+
+| Parameter              | Default | Description                      |
+| ---------------------- | ------- | -------------------------------- |
+| `startThreshold`       | 0.6     | Unmute at 60% confidence         |
+| `stopThreshold`        | 0.45    | Mute below 45% confidence        |
+| `hangoverMs`           | 400     | Wait 400ms before muting         |
+| `preRollMs`            | 250     | Buffer 250ms before speech       |
+| `minSpeechDurationMs`  | 100     | Minimum valid speech duration    |
+| `minSilenceDurationMs` | 150     | Minimum silence between speech   |
+| `silenceGain`          | 0.0     | Complete mute when silent        |
+| `speechGain`           | 1.0     | Unity gain when speaking         |
+| `minSNR`               | 2.0     | Voice must be 2x noise floor     |
+| `snrRange`             | 8.0     | Probability scales over SNR 2-10 |
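The 0.1.4 README documents dynamic range compression and runtime reconfiguration side by side. As a single end-to-end illustration, here is a minimal sketch built only from the options documented above; the asset paths are the same placeholders the README uses, and the surrounding app code is assumed:

```ts
import { createAudioPipeline } from "@tensamin/audio";

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const [track] = stream.getAudioTracks();

const pipeline = await createAudioPipeline(track, {
  noiseSuppression: {
    enabled: true,
    wasmUrl: "/audio-processor/rnnoise.wasm",
    simdUrl: "/audio-processor/rnnoise_simd.wasm",
    workletUrl: "/audio-processor/worklet.js",
  },
  vad: { enabled: true, startThreshold: 0.6, stopThreshold: 0.45 },
  output: {
    // The compressor is wired into the graph at construction time
    // (see the pipeline diff below), so enable it here, not via setConfig.
    enableCompression: true,
    compression: { threshold: -24, ratio: 3, attack: 0.003, release: 0.05 },
  },
});

// Tighten the VAD later without rebuilding the graph.
pipeline.setConfig({ vad: { startThreshold: 0.7, stopThreshold: 0.55 } });

// Release the audio graph when finished.
pipeline.dispose();
```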
package/dist/{chunk-EXH2PNUE.mjs → chunk-AHBRT4RD.mjs}
CHANGED

@@ -1,6 +1,6 @@
 import {
   VADStateMachine
-} from "./chunk-
+} from "./chunk-N553RHTI.mjs";
 import {
   getAudioContext,
   registerPipeline,
@@ -9,14 +9,16 @@ import {
 import {
   getNoiseSuppressionPlugin,
   getVADPlugin
-} from "./chunk-
+} from "./chunk-YOSTLLCS.mjs";
 
 // src/pipeline/audio-pipeline.ts
 import mitt from "mitt";
 async function createAudioPipeline(sourceTrack, config = {}) {
   const context = getAudioContext();
   registerPipeline();
-  const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
+  const nsEnabled = config.noiseSuppression?.enabled !== false && Boolean(
+    config.noiseSuppression?.wasmUrl && config.noiseSuppression?.simdUrl && config.noiseSuppression?.workletUrl
+  );
   const vadEnabled = config.vad?.enabled !== false;
   const fullConfig = {
     noiseSuppression: {
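Note what the reworked `nsEnabled` check in the hunk above implies: noise suppression stays off unless all three asset URLs are configured, even when `enabled` is left at its default. A standalone sketch of the equivalent predicate (the helper name and `cfg` shape are hypothetical, mirroring the bundled expression):

```ts
interface NoiseSuppressionConfig {
  enabled?: boolean;
  wasmUrl?: string;
  simdUrl?: string;
  workletUrl?: string;
}

// Active only if not explicitly disabled AND all three assets are present.
function isNoiseSuppressionEnabled(cfg?: NoiseSuppressionConfig): boolean {
  return (
    cfg?.enabled !== false &&
    Boolean(cfg?.wasmUrl && cfg?.simdUrl && cfg?.workletUrl)
  );
}
```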
@@ -25,13 +27,38 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     },
     vad: {
       enabled: vadEnabled,
+      // Voice-optimized defaults (will be overridden by config)
+      startThreshold: 0.6,
+      stopThreshold: 0.45,
+      hangoverMs: 400,
+      preRollMs: 250,
+      minSpeechDurationMs: 100,
+      minSilenceDurationMs: 150,
+      energyVad: {
+        smoothing: 0.95,
+        initialNoiseFloor: 1e-3,
+        noiseFloorAdaptRateQuiet: 0.01,
+        noiseFloorAdaptRateLoud: 1e-3,
+        minSNR: 2,
+        snrRange: 8
+      },
       ...config.vad
     },
     output: {
       speechGain: 1,
-      silenceGain:
-      //
-      gainRampTime: 0.
+      silenceGain: 0,
+      // Full mute for voice-only
+      gainRampTime: 0.015,
+      // Fast but smooth transitions
+      smoothTransitions: true,
+      maxGainDb: 6,
+      enableCompression: false,
+      compression: {
+        threshold: -24,
+        ratio: 3,
+        attack: 3e-3,
+        release: 0.05
+      },
       ...config.output
     },
     livekit: { manageTrackMute: false, ...config.livekit }
@@ -42,7 +69,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     output: fullConfig.output
   });
   if (!sourceTrack || sourceTrack.kind !== "audio") {
-    throw new Error(
+    throw new Error(
+      "createAudioPipeline requires a valid audio MediaStreamTrack"
+    );
   }
   if (sourceTrack.readyState === "ended") {
     throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
@@ -56,10 +85,7 @@ async function createAudioPipeline(sourceTrack, config = {}) {
       const nsPlugin = getNoiseSuppressionPlugin(
        fullConfig.noiseSuppression?.pluginName
      );
-      nsNode = await nsPlugin.createNode(
-        context,
-        fullConfig.noiseSuppression
-      );
+      nsNode = await nsPlugin.createNode(context, fullConfig.noiseSuppression);
     } catch (error) {
       const err = error instanceof Error ? error : new Error(String(error));
       console.error("Failed to create noise suppression node:", err);
@@ -69,25 +95,21 @@ async function createAudioPipeline(sourceTrack, config = {}) {
   const vadStateMachine = new VADStateMachine(fullConfig.vad);
   try {
     const vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
-    vadNode = await vadPlugin.createNode(
-
-
-
-
-
-
-
-        emitter.emit("vadChange", newState);
-        lastVadState = newState;
-        updateGain(newState);
-      }
-    } catch (vadError) {
-      const err = vadError instanceof Error ? vadError : new Error(String(vadError));
-      console.error("Error in VAD callback:", err);
-      emitter.emit("error", err);
+    vadNode = await vadPlugin.createNode(context, fullConfig.vad, (prob) => {
+      try {
+        const timestamp = context.currentTime * 1e3;
+        const newState = vadStateMachine.processFrame(prob, timestamp);
+        if (newState.state !== lastVadState.state || Math.abs(newState.probability - lastVadState.probability) > 0.1) {
+          emitter.emit("vadChange", newState);
+          lastVadState = newState;
+          updateGain(newState);
        }
+      } catch (vadError) {
+        const err = vadError instanceof Error ? vadError : new Error(String(vadError));
+        console.error("Error in VAD callback:", err);
+        emitter.emit("error", err);
      }
-    );
+    });
   } catch (error) {
     const err = error instanceof Error ? error : new Error(String(error));
     console.error("Failed to create VAD node:", err);
@@ -104,15 +126,31 @@ async function createAudioPipeline(sourceTrack, config = {}) {
   nsNode.connect(splitter);
   splitter.connect(vadNode);
   const delayNode = context.createDelay(1);
-  const preRollSeconds = (fullConfig.vad?.preRollMs ??
+  const preRollSeconds = (fullConfig.vad?.preRollMs ?? 250) / 1e3;
   delayNode.delayTime.value = preRollSeconds;
   const gainNode = context.createGain();
   gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
+  let compressor = null;
+  if (fullConfig.output?.enableCompression) {
+    compressor = context.createDynamicsCompressor();
+    const comp = fullConfig.output.compression;
+    compressor.threshold.value = comp.threshold ?? -24;
+    compressor.ratio.value = comp.ratio ?? 3;
+    compressor.attack.value = comp.attack ?? 3e-3;
+    compressor.release.value = comp.release ?? 0.05;
+    compressor.knee.value = 10;
+  }
   const destination = context.createMediaStreamDestination();
   try {
     splitter.connect(delayNode);
     delayNode.connect(gainNode);
-
+    if (compressor) {
+      gainNode.connect(compressor);
+      compressor.connect(destination);
+      console.log("Compression enabled:", fullConfig.output?.compression);
+    } else {
+      gainNode.connect(destination);
+    }
   } catch (error) {
     const err = error instanceof Error ? error : new Error(String(error));
     console.error("Failed to wire audio pipeline:", err);
@@ -121,10 +159,24 @@ async function createAudioPipeline(sourceTrack, config = {}) {
   }
   function updateGain(state) {
     try {
-      const {
-
+      const {
+        speechGain = 1,
+        silenceGain = 0,
+        gainRampTime = 0.015,
+        smoothTransitions = true,
+        maxGainDb = 6
+      } = fullConfig.output;
+      const maxGainLinear = Math.pow(10, maxGainDb / 20);
+      const limitedSpeechGain = Math.min(speechGain, maxGainLinear);
+      const targetGain = state.isSpeaking ? limitedSpeechGain : silenceGain;
       const now = context.currentTime;
-
+      if (smoothTransitions) {
+        gainNode.gain.cancelScheduledValues(now);
+        gainNode.gain.setValueAtTime(gainNode.gain.value, now);
+        gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime / 3);
+      } else {
+        gainNode.gain.setValueAtTime(targetGain, now);
+      }
     } catch (error) {
       const err = error instanceof Error ? error : new Error(String(error));
       console.error("Failed to update gain:", err);
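Two details in the `updateGain` hunk above deserve a worked example. First, `maxGainDb` is converted to a linear multiplier as 10^(dB/20), so the default 6 dB caps `speechGain` at about 2.0x. Second, `AudioParam.setTargetAtTime` approaches its target exponentially and is only about 95% of the way there after three time constants, which is presumably why the code passes `gainRampTime / 3`: the ramp effectively settles within `gainRampTime`. A standalone sketch of the same arithmetic:

```ts
// dB ceiling -> linear multiplier: 10^(dB / 20).
const maxGainDb = 6;
const maxGainLinear = Math.pow(10, maxGainDb / 20); // ~1.995

// setTargetAtTime follows v(t) = target + (v0 - target) * exp(-(t - t0) / tau),
// so after t - t0 = 3 * tau the remaining error is exp(-3) ~ 5%.
const gainRampTime = 0.015; // seconds, the documented default
const tau = gainRampTime / 3; // 0.005 s, i.e. ~95% settled at 15 ms
console.log(maxGainLinear, tau);
```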
@@ -180,6 +232,9 @@ async function createAudioPipeline(sourceTrack, config = {}) {
       vadNode.disconnect();
       delayNode.disconnect();
       gainNode.disconnect();
+      if (compressor) {
+        compressor.disconnect();
+      }
       destination.stream.getTracks().forEach((t) => t.stop());
       unregisterPipeline();
     } catch (error) {
@@ -196,7 +251,47 @@ async function createAudioPipeline(sourceTrack, config = {}) {
     try {
       if (newConfig.vad) {
         vadStateMachine.updateConfig(newConfig.vad);
+        Object.assign(fullConfig.vad, newConfig.vad);
+        if (newConfig.vad.preRollMs !== void 0) {
+          const preRollSeconds2 = newConfig.vad.preRollMs / 1e3;
+          delayNode.delayTime.setValueAtTime(
+            preRollSeconds2,
+            context.currentTime
+          );
+        }
+      }
+      if (newConfig.output) {
+        Object.assign(fullConfig.output, newConfig.output);
+        updateGain(lastVadState);
+        if (compressor && newConfig.output.compression) {
+          const comp = newConfig.output.compression;
+          if (comp.threshold !== void 0) {
+            compressor.threshold.setValueAtTime(
+              comp.threshold,
+              context.currentTime
+            );
+          }
+          if (comp.ratio !== void 0) {
+            compressor.ratio.setValueAtTime(comp.ratio, context.currentTime);
+          }
+          if (comp.attack !== void 0) {
+            compressor.attack.setValueAtTime(
+              comp.attack,
+              context.currentTime
+            );
+          }
+          if (comp.release !== void 0) {
+            compressor.release.setValueAtTime(
+              comp.release,
+              context.currentTime
+            );
+          }
+        }
+      }
+      if (newConfig.livekit) {
+        Object.assign(fullConfig.livekit, newConfig.livekit);
       }
+      console.log("Pipeline config updated:", newConfig);
     } catch (error) {
       const err = error instanceof Error ? error : new Error(String(error));
       console.error("Failed to update config:", err);
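The `setConfig` hunk above shows which updates are truly live: `preRollMs` is written straight onto the existing `DelayNode`, and compressor parameters are adjusted only when a compressor node already exists (it is created solely at construction time). A hedged usage sketch against the README's public API:

```ts
// Assumes `pipeline` came from createAudioPipeline(track, { ... }).
pipeline.setConfig({
  vad: { preRollMs: 300 }, // DelayNode.delayTime jumps to 0.3 s immediately
  output: {
    // Applied via setValueAtTime, but only if enableCompression was true
    // when the pipeline was built; otherwise this is silently ignored.
    compression: { threshold: -30 },
  },
});
```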
package/dist/chunk-N553RHTI.mjs
ADDED

@@ -0,0 +1,93 @@
+// src/vad/vad-state.ts
+var VADStateMachine = class {
+  config;
+  currentState = "silent";
+  lastSpeechTime = 0;
+  speechStartTime = 0;
+  lastSilenceTime = 0;
+  frameDurationMs = 20;
+  // Assumed frame duration, updated by calls
+  constructor(config) {
+    this.config = {
+      enabled: config?.enabled ?? true,
+      pluginName: config?.pluginName ?? "energy-vad",
+      // Voice-optimized defaults
+      startThreshold: config?.startThreshold ?? 0.6,
+      // Higher threshold to avoid noise
+      stopThreshold: config?.stopThreshold ?? 0.45,
+      // Balanced for voice
+      hangoverMs: config?.hangoverMs ?? 400,
+      // Smooth for natural speech
+      preRollMs: config?.preRollMs ?? 250,
+      // Generous pre-roll
+      minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
+      minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
+      energyVad: {
+        smoothing: config?.energyVad?.smoothing ?? 0.95,
+        initialNoiseFloor: config?.energyVad?.initialNoiseFloor ?? 1e-3,
+        noiseFloorAdaptRateQuiet: config?.energyVad?.noiseFloorAdaptRateQuiet ?? 0.01,
+        noiseFloorAdaptRateLoud: config?.energyVad?.noiseFloorAdaptRateLoud ?? 1e-3,
+        minSNR: config?.energyVad?.minSNR ?? 2,
+        snrRange: config?.energyVad?.snrRange ?? 8
+      }
+    };
+    this.lastSilenceTime = Date.now();
+  }
+  updateConfig(config) {
+    this.config = { ...this.config, ...config };
+  }
+  processFrame(probability, timestamp) {
+    const {
+      startThreshold,
+      stopThreshold,
+      hangoverMs,
+      minSpeechDurationMs,
+      minSilenceDurationMs
+    } = this.config;
+    let newState = this.currentState;
+    if (this.currentState === "silent" || this.currentState === "speech_ending") {
+      if (probability >= startThreshold) {
+        const silenceDuration = timestamp - this.lastSilenceTime;
+        if (silenceDuration >= minSilenceDurationMs) {
+          newState = "speech_starting";
+          this.speechStartTime = timestamp;
+          this.lastSpeechTime = timestamp;
+        } else {
+          newState = "silent";
+        }
+      } else {
+        newState = "silent";
+        this.lastSilenceTime = timestamp;
+      }
+    } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
+      if (probability >= stopThreshold) {
+        newState = "speaking";
+        this.lastSpeechTime = timestamp;
+      } else {
+        const timeSinceSpeech = timestamp - this.lastSpeechTime;
+        const speechDuration = timestamp - this.speechStartTime;
+        if (timeSinceSpeech < hangoverMs) {
+          newState = "speaking";
+        } else if (speechDuration < minSpeechDurationMs) {
+          newState = "silent";
+          this.lastSilenceTime = timestamp;
+        } else {
+          newState = "speech_ending";
+          this.lastSilenceTime = timestamp;
+        }
+      }
+    }
+    if (newState === "speech_starting") newState = "speaking";
+    if (newState === "speech_ending") newState = "silent";
+    this.currentState = newState;
+    return {
+      isSpeaking: newState === "speaking",
+      probability,
+      state: newState
+    };
+  }
+};
+
+export {
+  VADStateMachine
+};
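To make the new state machine's contract concrete, here is a small driver sketch. It assumes `VADStateMachine` is in scope (the class is bundled in this chunk; whether the package re-exports it publicly is not shown here) and feeds made-up 20 ms frames; the durations are loosened from the defaults so the transitions are visible in a short run:

```ts
const vad = new VADStateMachine({
  minSilenceDurationMs: 40, // onset gate: need 40 ms since the last silent frame
  hangoverMs: 60,           // keep "speaking" through 60 ms probability dips
  minSpeechDurationMs: 40,
});

// Fake per-frame speech probabilities, one every 20 ms.
const frames = [0.1, 0.2, 0.9, 0.9, 0.8, 0.3, 0.2, 0.1];
let t = 0;
for (const p of frames) {
  t += 20;
  const s = vad.processFrame(p, t);
  console.log(t, s.state, s.isSpeaking);
}
// Frames 1-3 stay "silent" (the first loud frame is onset-gated),
// frames 4-7 report "speaking" (hangover bridges the low-probability dips),
// and frame 8 falls back to "silent".
```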