@tensamin/audio 0.1.15 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -229
- package/dist/chunk-AQ5RVY33.mjs +74 -0
- package/dist/chunk-BSYE2MWZ.mjs +178 -0
- package/dist/chunk-DTIMONGP.mjs +92 -0
- package/dist/chunk-IS37FHDN.mjs +33 -0
- package/dist/chunk-JBGGED5Q.mjs +129 -0
- package/dist/chunk-QNQK6QFB.mjs +71 -0
- package/dist/context/audio-context.d.mts +0 -24
- package/dist/context/audio-context.d.ts +0 -24
- package/dist/index.d.mts +2 -8
- package/dist/index.d.ts +2 -8
- package/dist/index.js +403 -651
- package/dist/index.mjs +11 -43
- package/dist/livekit/integration.d.mts +5 -8
- package/dist/livekit/integration.d.ts +5 -8
- package/dist/livekit/integration.js +401 -598
- package/dist/livekit/integration.mjs +10 -8
- package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
- package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
- package/dist/noise-suppression/deepfilternet-node.js +57 -0
- package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
- package/dist/pipeline/audio-pipeline.d.mts +2 -2
- package/dist/pipeline/audio-pipeline.d.ts +2 -2
- package/dist/pipeline/audio-pipeline.js +219 -529
- package/dist/pipeline/audio-pipeline.mjs +4 -5
- package/dist/pipeline/remote-audio-monitor.d.mts +12 -0
- package/dist/pipeline/remote-audio-monitor.d.ts +12 -0
- package/dist/pipeline/remote-audio-monitor.js +276 -0
- package/dist/pipeline/remote-audio-monitor.mjs +9 -0
- package/dist/types.d.mts +45 -246
- package/dist/types.d.ts +45 -246
- package/dist/vad/vad-node.d.mts +7 -9
- package/dist/vad/vad-node.d.ts +7 -9
- package/dist/vad/vad-node.js +47 -134
- package/dist/vad/vad-node.mjs +3 -3
- package/dist/vad/vad-state.d.mts +9 -11
- package/dist/vad/vad-state.d.ts +9 -11
- package/dist/vad/vad-state.js +50 -77
- package/dist/vad/vad-state.mjs +3 -3
- package/package.json +21 -21
- package/dist/chunk-GLKAWCEW.mjs +0 -158
- package/dist/chunk-KLBA2CPE.mjs +0 -101
- package/dist/chunk-QQFKHTCQ.mjs +0 -91
- package/dist/chunk-U26F3GJN.mjs +0 -47
- package/dist/chunk-WQVMSR7V.mjs +0 -310
- package/dist/chunk-XO6B3D4A.mjs +0 -67
- package/dist/extensibility/plugins.d.mts +0 -9
- package/dist/extensibility/plugins.d.ts +0 -9
- package/dist/extensibility/plugins.js +0 -298
- package/dist/extensibility/plugins.mjs +0 -14
- package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
- package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
- package/dist/noise-suppression/rnnoise-node.js +0 -101
- package/dist/noise-suppression/rnnoise-node.mjs +0 -6
package/README.md
CHANGED
````diff
@@ -1,16 +1,17 @@
 # @tensamin/audio
 
-Audio processing library for the web with RNNoise-based noise suppression and Vo
+DeepFilterNet3-based noise suppression and realtime speaking detection for LiveKit.
 
 ## Features
 
--
--
-- Automatic
--
-
-
-
+- DeepFilterNet3 WASM noise suppression
+- Realtime `speaking` boolean + dB level
+- Automatic mute/unmute for LiveKit tracks
+- Simple min/max dB speaking thresholds
+
+> [Noise suppression is provided via the `deepfilternet3-noise-filter` package.](https://www.npmjs.com/package/deepfilternet3-noise-filter)
+> [That package is based on DeepFilterNet by Rikorose.](https://github.com/Rikorose/DeepFilterNet)
+
 
 ## Installation
 
@@ -18,259 +19,81 @@ Audio processing library for the web with RNNoise-based noise suppression and Vo
 npm install @tensamin/audio livekit-client
 ```
 
-##
-
-For noise suppression, the following files must be provided:
-
-- `rnnoise.wasm`
-- `rnnoise_simd.wasm`
-- `worklet.js`
-
-Available at: `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`
-
-Place these files in a publicly accessible directory (e.g., `public/audio-processor/`).
-
-## Usage
-
-### Basic Example
-
-```ts
-import { createAudioPipeline } from "@tensamin/audio";
-
-const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-const track = stream.getAudioTracks()[0];
-
-const pipeline = await createAudioPipeline(track, {
-  noiseSuppression: {
-    enabled: true,
-    wasmUrl: "/audio-processor/rnnoise.wasm",
-    simdUrl: "/audio-processor/rnnoise_simd.wasm",
-    workletUrl: "/audio-processor/worklet.js",
-  },
-  vad: { enabled: true },
-});
-
-const processedStream = new MediaStream([pipeline.processedTrack]);
-```
-
-### LiveKit Integration
+## Quick Start (LiveKit)
 
 ```ts
-import { attachProcessingToTrack } from "@tensamin/audio";
 import { LocalAudioTrack } from "livekit-client";
+import { attachSpeakingDetectionToTrack } from "@tensamin/audio";
 
 const localTrack = await LocalAudioTrack.create();
 
-const
+const controller = await attachSpeakingDetectionToTrack(localTrack, {
+  speaking: {
+    minDb: -60,
+    maxDb: -20,
+  },
   noiseSuppression: {
     enabled: true,
-    wasmUrl: "/audio-processor/rnnoise.wasm",
-    simdUrl: "/audio-processor/rnnoise_simd.wasm",
-    workletUrl: "/audio-processor/worklet.js",
   },
-
-  livekit: { manageTrackMute: true },
+  muteWhenSilent: true,
 });
 
-
-
-### Monitoring VAD State
-
-```ts
-pipeline.events.on("vadChange", (state) => {
-  console.log("Speaking:", state.isSpeaking);
-  console.log("Probability:", state.probability);
-  console.log("State:", state.state);
+controller.onChange((state) => {
+  console.log("speaking", state.speaking);
+  console.log("levelDb", state.levelDb);
 });
-```
-
-## Configuration
-
-### Voice Activity Detection
 
-
-vad: {
-  enabled: boolean;
-  startThreshold: number; // Default: 0.6 (range: 0-1)
-  stopThreshold: number; // Default: 0.45 (range: 0-1)
-  hangoverMs: number; // Default: 400
-  preRollMs: number; // Default: 250
-  minSpeechDurationMs: number; // Default: 100
-  minSilenceDurationMs: number; // Default: 150
-  energyVad?: {
-    smoothing: number; // Default: 0.95
-    initialNoiseFloor: number; // Default: 0.001
-    minSNR: number; // Default: 8.0 (dB)
-    snrRange: number; // Default: 12.0 (dB)
-    minEnergy: number; // Default: 0.01
-  };
-}
+await room.localParticipant.publishTrack(localTrack);
 ```
 
-
-
-- `startThreshold`: Probability threshold to unmute audio (Default: 0.8, ~18dB SNR)
-- `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~13dB SNR)
-- `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
-- `preRollMs`: Audio buffer duration before speech onset
-- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
-- `minSilenceDurationMs`: Minimum silence duration between speech segments
-
-**Energy VAD Parameters:**
+## Configuration
 
-
-- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
-- `snrRange`: Range in dB for probability scaling from minSNR
-- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.01, ~-40dB)
+All options are passed via `LivekitSpeakingOptions` to `attachSpeakingDetectionToTrack`.
 
-###
+### Noise suppression (DeepFilterNet3)
 
 ```ts
-
-
-
-
-
-maxGainDb: number; // Default: 6.0
-enableCompression: boolean; // Default: false
-compression?: {
-  threshold: number; // Default: -24.0 (dB)
-  ratio: number; // Default: 3.0
-  attack: number; // Default: 0.003 (seconds)
-  release: number; // Default: 0.05 (seconds)
+noiseSuppression: {
+  enabled?: boolean; // default: true
+  noiseReductionLevel?: number; // 0-100, default: 60
+  assetConfig?: {
+    cdnUrl?: string;
   };
 }
 ```
 
-
-
-- `speechGain`: Gain multiplier when speaking (1.0 = unity)
-- `silenceGain`: Gain multiplier when silent (0.0 = mute)
-- `gainRampTime`: Transition duration for gain changes
-- `maxGainDb`: Maximum gain limit to prevent clipping
-
-**Compression Parameters:**
-
-- `threshold`: Level above which compression is applied
-- `ratio`: Compression ratio (e.g., 3.0 = 3:1)
-- `attack`: Time to reach full compression
-- `release`: Time to release compression
-
-### Runtime Configuration Updates
-
-```ts
-pipeline.setConfig({
-  vad: {
-    startThreshold: 0.7,
-    stopThreshold: 0.55,
-  },
-  output: {
-    speechGain: 1.3,
-  },
-});
-```
-
-## Configuration Examples
-
-### Noisy Environment
-
-```ts
-{
-  vad: {
-    startThreshold: 0.7,
-    stopThreshold: 0.55,
-    minSpeechDurationMs: 150,
-    energyVad: { minSNR: 3.0 }
-  }
-}
-```
-
-### Quiet Speaker
-
-```ts
-{
-  vad: {
-    startThreshold: 0.4,
-    stopThreshold: 0.25,
-    energyVad: { minSNR: 1.5 }
-  },
-  output: {
-    speechGain: 1.5
-  }
-}
-```
-
-### Natural Conversation
+### Speaking detection (dB-based)
 
 ```ts
-{
-
-
-
-
-
-
-
-}
+speaking: {
+  minDb: number; // e.g. -60
+  maxDb: number; // e.g. -20
+  speakOnRatio?: number; // default: 0.6
+  speakOffRatio?: number; // default: 0.3
+  hangoverMs?: number; // default: 350
+  attackMs?: number; // default: 50
+  releaseMs?: number; // default: 120
 }
 ```
 
-
-
-### `createAudioPipeline(track, config)`
+`minDb` / `maxDb` define the dynamic range used for level normalization. `speakOnRatio` and `speakOffRatio` (0–1) control when speech starts/stops within that range.
 
-
+### Output gain control
 
-
-
-
-
-
-
-
-### AudioPipelineHandle
-
-```ts
-interface AudioPipelineHandle {
-  processedTrack: MediaStreamTrack;
-  events: Emitter<AudioPipelineEvents>;
-  state: VADState;
-  setConfig(config: Partial<AudioProcessingConfig>): void;
-  dispose(): void;
+```ts
+output: {
+  speechGain?: number; // default: 1.0
+  silenceGain?: number; // default: 0.0
+  gainRampTime?: number; // default: 0.015 (s)
+  maxGainDb?: number; // default: 6.0
+  smoothTransitions?: boolean; // default: true
 }
-
+```
 
-###
+### LiveKit mute handling
 
 ```ts
-
-
-error: Error;
-};
-```
-
-### VADState
-
-```ts
-interface VADState {
-  isSpeaking: boolean;
-  probability: number;
-  state: "silent" | "speech_starting" | "speaking" | "speech_ending";
-}
-```
-
-## Default Values
+muteWhenSilent?: boolean; // default: false
+```
 
-
-| ---------------------- | ------- | -------------------------------- |
-| `startThreshold`       | 0.6     | Unmute at 60% confidence         |
-| `stopThreshold`        | 0.45    | Mute below 45% confidence        |
-| `hangoverMs`           | 400     | Wait 400ms before muting         |
-| `preRollMs`            | 250     | Buffer 250ms before speech       |
-| `minSpeechDurationMs`  | 100     | Minimum valid speech duration    |
-| `minSilenceDurationMs` | 150     | Minimum silence between speech   |
-| `silenceGain`          | 0.0     | Complete mute when silent        |
-| `speechGain`           | 1.0     | Unity gain when speaking         |
-| `minSNR`               | 2.0     | Voice must be 2x noise floor     |
-| `snrRange`             | 8.0     | Probability scales over SNR 2-10 |
+When `muteWhenSilent` is `true`, the library automatically calls `track.mute()` when silence is detected and `track.unmute()` when speech resumes (only if it muted the track itself).
````
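With the defaults above (`minDb: -60`, `maxDb: -20`, `speakOnRatio: 0.6`, `speakOffRatio: 0.3`), the ratio thresholds resolve to concrete dB levels. A quick sketch of that mapping, mirroring the normalization in the shipped `LevelBasedVAD` (see `chunk-AQ5RVY33.mjs` below):

```ts
// Levels are clamped to [minDb, maxDb], then normalized to 0..1.
const minDb = -60;
const maxDb = -20;

const norm = (levelDb: number) =>
  (Math.min(maxDb, Math.max(minDb, levelDb)) - minDb) /
  Math.max(1, maxDb - minDb);

// Speech can start once norm >= speakOnRatio (0.6), i.e. above:
console.log(minDb + 0.6 * (maxDb - minDb)); // -36 dB
// ...and can stop once norm <= speakOffRatio (0.3), i.e. below:
console.log(minDb + 0.3 * (maxDb - minDb)); // -48 dB
console.log(norm(-36)); // 0.6
```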
package/dist/chunk-AQ5RVY33.mjs
ADDED

```js
// src/vad/vad-state.ts
var LevelBasedVAD = class {
  config;
  speaking = false;
  pendingSpeechSince = null;
  pendingSilenceSince = null;
  constructor(config) {
    this.config = {
      minDb: config.minDb,
      maxDb: config.maxDb,
      speakOnRatio: config.speakOnRatio ?? 0.6,
      speakOffRatio: config.speakOffRatio ?? 0.3,
      hangoverMs: config.hangoverMs ?? 350,
      attackMs: config.attackMs ?? 50,
      releaseMs: config.releaseMs ?? 120
    };
  }
  updateConfig(config) {
    this.config = {
      ...this.config,
      ...config,
      speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
      speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
      hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
      attackMs: config.attackMs ?? this.config.attackMs,
      releaseMs: config.releaseMs ?? this.config.releaseMs
    };
  }
  process(levelDb, timestampMs) {
    const {
      minDb,
      maxDb,
      speakOnRatio,
      speakOffRatio,
      hangoverMs,
      attackMs,
      releaseMs
    } = this.config;
    const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
    const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
    if (!this.speaking) {
      if (norm >= speakOnRatio) {
        this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
        if (timestampMs - this.pendingSpeechSince >= attackMs) {
          this.speaking = true;
          this.pendingSpeechSince = null;
          this.pendingSilenceSince = null;
        }
      } else {
        this.pendingSpeechSince = null;
      }
    } else {
      if (norm <= speakOffRatio) {
        this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
        const releaseWindow = Math.max(releaseMs, hangoverMs);
        if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
          this.speaking = false;
          this.pendingSilenceSince = null;
          this.pendingSpeechSince = null;
        }
      } else {
        this.pendingSilenceSince = null;
      }
    }
    return {
      speaking: this.speaking,
      levelDb: clamped
    };
  }
};

export {
  LevelBasedVAD
};
```
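A minimal sketch of driving this class directly; application code would normally go through `createAudioPipeline` or `attachSpeakingDetectionToTrack` instead, and the relative import path here simply assumes the chunk filename above:

```ts
import { LevelBasedVAD } from "./chunk-AQ5RVY33.mjs";

const vad = new LevelBasedVAD({ minDb: -60, maxDb: -20 });

// One level sample per frame, timestamps in milliseconds. A loud level must
// persist for attackMs (50 ms) before `speaking` flips on, and a quiet one
// for max(releaseMs, hangoverMs) (350 ms by default) before it flips off.
const levels = [-55, -30, -30, -30, -55];
levels.forEach((levelDb, i) => {
  const state = vad.process(levelDb, i * 25); // 25 ms frame cadence
  console.log(state.speaking, state.levelDb);
});
```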
package/dist/chunk-BSYE2MWZ.mjs
ADDED

```js
import {
  createDeepFilterNet3Node
} from "./chunk-IS37FHDN.mjs";
import {
  LevelBasedVAD
} from "./chunk-AQ5RVY33.mjs";
import {
  getAudioContext,
  registerPipeline,
  unregisterPipeline
} from "./chunk-OZ7KMC4S.mjs";
import {
  createLevelDetectorNode
} from "./chunk-QNQK6QFB.mjs";

// src/pipeline/audio-pipeline.ts
import mitt from "mitt";
async function createAudioPipeline(sourceTrack, config = {}) {
  const context = getAudioContext();
  registerPipeline();
  const nsConfig = {
    enabled: config.noiseSuppression?.enabled ?? true,
    noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
  };
  if (config.noiseSuppression?.assetConfig) {
    nsConfig.assetConfig = config.noiseSuppression.assetConfig;
  }
  const fullConfig = {
    noiseSuppression: nsConfig,
    speaking: {
      minDb: config.speaking?.minDb ?? -60,
      maxDb: config.speaking?.maxDb ?? -20,
      speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
      speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
      hangoverMs: config.speaking?.hangoverMs ?? 350,
      attackMs: config.speaking?.attackMs ?? 50,
      releaseMs: config.speaking?.releaseMs ?? 120
    },
    output: {
      speechGain: config.output?.speechGain ?? 1,
      silenceGain: config.output?.silenceGain ?? 0,
      gainRampTime: config.output?.gainRampTime ?? 0.015,
      maxGainDb: config.output?.maxGainDb ?? 6,
      smoothTransitions: config.output?.smoothTransitions ?? true
    },
    muteWhenSilent: config.muteWhenSilent ?? false
  };
  if (!sourceTrack || sourceTrack.kind !== "audio") {
    throw new Error(
      "createAudioPipeline requires a valid audio MediaStreamTrack"
    );
  }
  if (sourceTrack.readyState === "ended") {
    throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
  }
  const sourceStream = new MediaStream([sourceTrack]);
  const sourceNode = context.createMediaStreamSource(sourceStream);
  const emitter = mitt();
  const vad = new LevelBasedVAD(fullConfig.speaking);
  let lastState = { speaking: false, levelDb: -Infinity };
  const nsHandle = await createDeepFilterNet3Node(
    context,
    fullConfig.noiseSuppression
  );
  const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
    try {
      const timestamp = context.currentTime * 1e3;
      const nextState = vad.process(levelDb, timestamp);
      const speakingChanged = nextState.speaking !== lastState.speaking;
      const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
      if (speakingChanged || levelChanged) {
        lastState = nextState;
        updateGain(nextState);
        emitter.emit("speakingChange", nextState);
      }
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      emitter.emit("error", err);
    }
  });
  const splitter = context.createGain();
  sourceNode.connect(nsHandle.node);
  nsHandle.node.connect(splitter);
  splitter.connect(levelHandle.node);
  const gainNode = context.createGain();
  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
  splitter.connect(gainNode);
  const destination = context.createMediaStreamDestination();
  gainNode.connect(destination);
  function updateGain(state) {
    const {
      speechGain = 1,
      silenceGain = 0,
      gainRampTime = 0.015,
      smoothTransitions = true,
      maxGainDb = 6
    } = fullConfig.output ?? {};
    const maxGainLinear = Math.pow(10, maxGainDb / 20);
    const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
    const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
    const now = context.currentTime;
    gainNode.gain.cancelScheduledValues(now);
    gainNode.gain.setValueAtTime(gainNode.gain.value, now);
    if (smoothTransitions) {
      gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
    } else {
      gainNode.gain.setValueAtTime(target, now);
    }
  }
  const audioTracks = destination.stream.getAudioTracks();
  if (audioTracks.length === 0) {
    nsHandle.dispose();
    levelHandle.dispose();
    unregisterPipeline();
    throw new Error("Failed to create processed audio track");
  }
  const processedTrack = audioTracks[0];
  function dispose() {
    try {
      sourceNode.disconnect();
      nsHandle.node.disconnect();
      splitter.disconnect();
      levelHandle.node.disconnect();
      gainNode.disconnect();
      destination.stream.getTracks().forEach((t) => t.stop());
      levelHandle.dispose();
      nsHandle.dispose();
    } catch (error) {
      console.error("Error during pipeline disposal", error);
    } finally {
      unregisterPipeline();
    }
  }
  const handle = {
    processedTrack,
    events: emitter,
    get state() {
      return lastState;
    },
    setConfig: (next) => {
      try {
        if (next.speaking) {
          vad.updateConfig(next.speaking);
          fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
        }
        if (next.output) {
          fullConfig.output = { ...fullConfig.output, ...next.output };
          updateGain(lastState);
        }
        if (next.noiseSuppression) {
          const ns = next.noiseSuppression;
          fullConfig.noiseSuppression = {
            ...fullConfig.noiseSuppression,
            ...ns
          };
          if (typeof ns.noiseReductionLevel === "number") {
            nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
          }
          if (typeof ns.enabled === "boolean") {
            nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
          }
        }
        if (typeof next.muteWhenSilent === "boolean") {
          fullConfig.muteWhenSilent = next.muteWhenSilent;
        }
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        emitter.emit("error", err);
      }
    },
    dispose
  };
  return handle;
}

export {
  createAudioPipeline
};
```
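A usage sketch for this pipeline, assuming `createAudioPipeline` is still re-exported from the package root as it was in 0.1.x:

```ts
import { createAudioPipeline } from "@tensamin/audio";

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const pipeline = await createAudioPipeline(stream.getAudioTracks()[0], {
  speaking: { minDb: -60, maxDb: -20 },
  noiseSuppression: { enabled: true, noiseReductionLevel: 60 },
});

// `events` is a mitt emitter: "speakingChange" fires when the speaking flag
// flips or the level moves by more than 0.5 dB, "error" on processing failures.
pipeline.events.on("speakingChange", (state) => {
  console.log(state.speaking, state.levelDb);
});

// The gated, noise-suppressed audio comes out on `processedTrack`.
const processed = new MediaStream([pipeline.processedTrack]);

// Thresholds and gains can be retuned without rebuilding the audio graph.
pipeline.setConfig({ speaking: { speakOffRatio: 0.25 } });

// Tears down the Web Audio graph and unregisters the shared AudioContext user.
pipeline.dispose();
```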
package/dist/chunk-DTIMONGP.mjs
ADDED

```js
import {
  LevelBasedVAD
} from "./chunk-AQ5RVY33.mjs";
import {
  getAudioContext,
  registerPipeline,
  unregisterPipeline
} from "./chunk-OZ7KMC4S.mjs";
import {
  createLevelDetectorNode
} from "./chunk-QNQK6QFB.mjs";

// src/pipeline/remote-audio-monitor.ts
import mitt from "mitt";
async function createRemoteAudioMonitor(sourceTrack, config = {}) {
  const context = getAudioContext();
  registerPipeline();
  const fullConfig = {
    speaking: {
      minDb: config.speaking?.minDb ?? -60,
      maxDb: config.speaking?.maxDb ?? -20,
      speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
      speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
      hangoverMs: config.speaking?.hangoverMs ?? 350,
      attackMs: config.speaking?.attackMs ?? 50,
      releaseMs: config.speaking?.releaseMs ?? 120
    }
  };
  if (!sourceTrack || sourceTrack.kind !== "audio") {
    throw new Error(
      "createRemoteAudioMonitor requires a valid audio MediaStreamTrack"
    );
  }
  if (sourceTrack.readyState === "ended") {
    throw new Error("Cannot create monitor from an ended MediaStreamTrack");
  }
  const sourceStream = new MediaStream([sourceTrack]);
  const sourceNode = context.createMediaStreamSource(sourceStream);
  const emitter = mitt();
  const vad = new LevelBasedVAD(fullConfig.speaking);
  let lastState = { speaking: false, levelDb: -Infinity };
  const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
    try {
      const timestamp = context.currentTime * 1e3;
      const nextState = vad.process(levelDb, timestamp);
      const speakingChanged = nextState.speaking !== lastState.speaking;
      const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
      if (speakingChanged || levelChanged) {
        lastState = nextState;
        emitter.emit("speakingChange", nextState);
      }
    } catch (error) {
      const err = error instanceof Error ? error : new Error(String(error));
      emitter.emit("error", err);
    }
  });
  sourceNode.connect(levelHandle.node);
  function dispose() {
    try {
      sourceNode.disconnect();
      levelHandle.node.disconnect();
      levelHandle.dispose();
    } catch (error) {
      console.error("Error during remote monitor disposal", error);
    } finally {
      unregisterPipeline();
    }
  }
  const handle = {
    events: emitter,
    get state() {
      return lastState;
    },
    setConfig: (next) => {
      try {
        if (next.speaking) {
          vad.updateConfig(next.speaking);
          fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
        }
      } catch (error) {
        const err = error instanceof Error ? error : new Error(String(error));
        emitter.emit("error", err);
      }
    },
    dispose
  };
  return handle;
}

export {
  createRemoteAudioMonitor
};
```
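A matching sketch for the remote monitor, which only taps a track's level (no gain gating, no processed output). The root import again assumes a package re-export; `mediaStreamTrack` is the underlying track on a subscribed livekit-client `RemoteAudioTrack`, and `updateSpeakingIndicator` is a hypothetical UI hook:

```ts
import { createRemoteAudioMonitor } from "@tensamin/audio";

// Watch a remote participant's audio without re-routing it.
const monitor = await createRemoteAudioMonitor(remoteTrack.mediaStreamTrack, {
  speaking: { minDb: -60, maxDb: -20 },
});

monitor.events.on("speakingChange", ({ speaking, levelDb }) => {
  updateSpeakingIndicator(speaking, levelDb); // hypothetical UI hook
});

// Only the speaking thresholds are tunable at runtime here.
monitor.setConfig({ speaking: { speakOnRatio: 0.5 } });

// Disconnects the level tap and unregisters the shared AudioContext user.
monitor.dispose();
```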