@tensamin/audio 0.1.15 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +48 -229
- package/dist/chunk-6BJ4XGSA.mjs +80 -0
- package/dist/chunk-AQ5RVY33.mjs +74 -0
- package/dist/chunk-IS37FHDN.mjs +33 -0
- package/dist/chunk-K4J3UUOR.mjs +178 -0
- package/dist/chunk-QNQK6QFB.mjs +71 -0
- package/dist/context/audio-context.d.mts +0 -24
- package/dist/context/audio-context.d.ts +0 -24
- package/dist/index.d.mts +2 -8
- package/dist/index.d.ts +2 -8
- package/dist/index.js +285 -655
- package/dist/index.mjs +8 -43
- package/dist/livekit/integration.d.mts +3 -7
- package/dist/livekit/integration.d.ts +3 -7
- package/dist/livekit/integration.js +280 -601
- package/dist/livekit/integration.mjs +7 -8
- package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
- package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
- package/dist/noise-suppression/deepfilternet-node.js +57 -0
- package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
- package/dist/pipeline/audio-pipeline.d.mts +2 -2
- package/dist/pipeline/audio-pipeline.d.ts +2 -2
- package/dist/pipeline/audio-pipeline.js +219 -529
- package/dist/pipeline/audio-pipeline.mjs +4 -5
- package/dist/types.d.mts +42 -246
- package/dist/types.d.ts +42 -246
- package/dist/vad/vad-node.d.mts +7 -9
- package/dist/vad/vad-node.d.ts +7 -9
- package/dist/vad/vad-node.js +47 -134
- package/dist/vad/vad-node.mjs +3 -3
- package/dist/vad/vad-state.d.mts +9 -11
- package/dist/vad/vad-state.d.ts +9 -11
- package/dist/vad/vad-state.js +50 -77
- package/dist/vad/vad-state.mjs +3 -3
- package/package.json +21 -21
- package/dist/chunk-GLKAWCEW.mjs +0 -158
- package/dist/chunk-KLBA2CPE.mjs +0 -101
- package/dist/chunk-QQFKHTCQ.mjs +0 -91
- package/dist/chunk-U26F3GJN.mjs +0 -47
- package/dist/chunk-WQVMSR7V.mjs +0 -310
- package/dist/chunk-XO6B3D4A.mjs +0 -67
- package/dist/extensibility/plugins.d.mts +0 -9
- package/dist/extensibility/plugins.d.ts +0 -9
- package/dist/extensibility/plugins.js +0 -298
- package/dist/extensibility/plugins.mjs +0 -14
- package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
- package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
- package/dist/noise-suppression/rnnoise-node.js +0 -101
- package/dist/noise-suppression/rnnoise-node.mjs +0 -6
package/README.md
CHANGED

@@ -1,16 +1,13 @@
 # @tensamin/audio

-
+DeepFilterNet3-based noise suppression and realtime speaking detection for LiveKit.

 ## Features

-
-
-- Automatic
-
-- LiveKit `LocalAudioTrack` integration
-- Plugin system for custom audio processors
-- Optional dynamic range compression
+- DeepFilterNet3 WASM noise suppression
+- Realtime `speaking` boolean + dB level
+- Automatic mute/unmute for LiveKit tracks
+- Simple min/max dB speaking thresholds

 ## Installation

@@ -18,259 +15,81 @@ Audio processing library for the web with RNNoise-based noise suppression and Vo
 npm install @tensamin/audio livekit-client
 ```

-##
-
-For noise suppression, the following files must be provided:
-
-- `rnnoise.wasm`
-- `rnnoise_simd.wasm`
-- `worklet.js`
-
-Available at: `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`
-
-Place these files in a publicly accessible directory (e.g., `public/audio-processor/`).
-
-## Usage
-
-### Basic Example
+## Quick Start (LiveKit)

 ```ts
-import { createAudioPipeline } from "@tensamin/audio";
-
-const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-const track = stream.getAudioTracks()[0];
-
-const pipeline = await createAudioPipeline(track, {
-  noiseSuppression: {
-    enabled: true,
-    wasmUrl: "/audio-processor/rnnoise.wasm",
-    simdUrl: "/audio-processor/rnnoise_simd.wasm",
-    workletUrl: "/audio-processor/worklet.js",
-  },
-  vad: { enabled: true },
-});
-
-const processedStream = new MediaStream([pipeline.processedTrack]);
-```
-
-### LiveKit Integration
-
-```ts
-import { attachProcessingToTrack } from "@tensamin/audio";
 import { LocalAudioTrack } from "livekit-client";
+import { attachSpeakingDetectionToTrack } from "@tensamin/audio";

 const localTrack = await LocalAudioTrack.create();

-const
+const controller = await attachSpeakingDetectionToTrack(localTrack, {
+  speaking: {
+    minDb: -60,
+    maxDb: -20,
+  },
   noiseSuppression: {
     enabled: true,
-    wasmUrl: "/audio-processor/rnnoise.wasm",
-    simdUrl: "/audio-processor/rnnoise_simd.wasm",
-    workletUrl: "/audio-processor/worklet.js",
   },
-
-  livekit: { manageTrackMute: true },
+  muteWhenSilent: true,
 });

-
-
-
-### Monitoring VAD State
-
-```ts
-pipeline.events.on("vadChange", (state) => {
-  console.log("Speaking:", state.isSpeaking);
-  console.log("Probability:", state.probability);
-  console.log("State:", state.state);
+controller.onChange((state) => {
+  console.log("speaking", state.speaking);
+  console.log("levelDb", state.levelDb);
 });
-```

-
-
-### Voice Activity Detection
-
-```ts
-vad: {
-  enabled: boolean;
-  startThreshold: number; // Default: 0.6 (range: 0-1)
-  stopThreshold: number; // Default: 0.45 (range: 0-1)
-  hangoverMs: number; // Default: 400
-  preRollMs: number; // Default: 250
-  minSpeechDurationMs: number; // Default: 100
-  minSilenceDurationMs: number; // Default: 150
-  energyVad?: {
-    smoothing: number; // Default: 0.95
-    initialNoiseFloor: number; // Default: 0.001
-    minSNR: number; // Default: 8.0 (dB)
-    snrRange: number; // Default: 12.0 (dB)
-    minEnergy: number; // Default: 0.01
-  };
-}
+await room.localParticipant.publishTrack(localTrack);
 ```

-
-
-- `startThreshold`: Probability threshold to unmute audio (Default: 0.8, ~18dB SNR)
-- `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~13dB SNR)
-- `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
-- `preRollMs`: Audio buffer duration before speech onset
-- `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
-- `minSilenceDurationMs`: Minimum silence duration between speech segments
-
-**Energy VAD Parameters:**
+## Configuration

-
-- `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
-- `snrRange`: Range in dB for probability scaling from minSNR
-- `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.01, ~-40dB)
+All options are passed via `LivekitSpeakingOptions` to `attachSpeakingDetectionToTrack`.

-###
+### Noise suppression (DeepFilterNet3)

 ```ts
-
-
-
-
-
-  maxGainDb: number; // Default: 6.0
-  enableCompression: boolean; // Default: false
-  compression?: {
-    threshold: number; // Default: -24.0 (dB)
-    ratio: number; // Default: 3.0
-    attack: number; // Default: 0.003 (seconds)
-    release: number; // Default: 0.05 (seconds)
+noiseSuppression: {
+  enabled?: boolean; // default: true
+  noiseReductionLevel?: number; // 0-100, default: 60
+  assetConfig?: {
+    cdnUrl?: string;
   };
 }
 ```

-
-
-- `speechGain`: Gain multiplier when speaking (1.0 = unity)
-- `silenceGain`: Gain multiplier when silent (0.0 = mute)
-- `gainRampTime`: Transition duration for gain changes
-- `maxGainDb`: Maximum gain limit to prevent clipping
-
-**Compression Parameters:**
-
-- `threshold`: Level above which compression is applied
-- `ratio`: Compression ratio (e.g., 3.0 = 3:1)
-- `attack`: Time to reach full compression
-- `release`: Time to release compression
-
-### Runtime Configuration Updates
-
-```ts
-pipeline.setConfig({
-  vad: {
-    startThreshold: 0.7,
-    stopThreshold: 0.55,
-  },
-  output: {
-    speechGain: 1.3,
-  },
-});
-```
-
-## Configuration Examples
-
-### Noisy Environment
-
-```ts
-{
-  vad: {
-    startThreshold: 0.7,
-    stopThreshold: 0.55,
-    minSpeechDurationMs: 150,
-    energyVad: { minSNR: 3.0 }
-  }
-}
-```
-
-### Quiet Speaker
+### Speaking detection (dB-based)

 ```ts
-{
-
-
-
-
-
-
-
-}
+speaking: {
+  minDb: number; // e.g. -60
+  maxDb: number; // e.g. -20
+  speakOnRatio?: number; // default: 0.6
+  speakOffRatio?: number; // default: 0.3
+  hangoverMs?: number; // default: 350
+  attackMs?: number; // default: 50
+  releaseMs?: number; // default: 120
 }
 ```

-
+`minDb` / `maxDb` define the dynamic range used for level normalization. `speakOnRatio` and `speakOffRatio` (0–1) control when speech starts/stops within that range.

-
-{
-  vad: {
-    startThreshold: 0.5,
-    stopThreshold: 0.3,
-    hangoverMs: 600,
-  },
-  output: {
-    silenceGain: 0.2
-  }
-}
-```
-
-## API Reference
-
-### `createAudioPipeline(track, config)`
-
-Creates an audio processing pipeline from a MediaStreamTrack.
-
-**Parameters:**
+### Output gain control

-
-
-
-
-
-
-
-```ts
-interface AudioPipelineHandle {
-  processedTrack: MediaStreamTrack;
-  events: Emitter<AudioPipelineEvents>;
-  state: VADState;
-  setConfig(config: Partial<AudioProcessingConfig>): void;
-  dispose(): void;
+````ts
+output: {
+  speechGain?: number; // default: 1.0
+  silenceGain?: number; // default: 0.0
+  gainRampTime?: number; // default: 0.015 (s)
+  maxGainDb?: number; // default: 6.0
+  smoothTransitions?: boolean; // default: true
 }
-
+````

-###
+### LiveKit mute handling

 ```ts
-
-
-  error: Error;
-};
-```
-
-### VADState
-
-```ts
-interface VADState {
-  isSpeaking: boolean;
-  probability: number;
-  state: "silent" | "speech_starting" | "speaking" | "speech_ending";
-}
-```
-
-## Default Values
+muteWhenSilent?: boolean; // default: false
+````

-
-| ---------------------- | ------- | -------------------------------- |
-| `startThreshold`       | 0.6     | Unmute at 60% confidence         |
-| `stopThreshold`        | 0.45    | Mute below 45% confidence        |
-| `hangoverMs`           | 400     | Wait 400ms before muting         |
-| `preRollMs`            | 250     | Buffer 250ms before speech       |
-| `minSpeechDurationMs`  | 100     | Minimum valid speech duration    |
-| `minSilenceDurationMs` | 150     | Minimum silence between speech   |
-| `silenceGain`          | 0.0     | Complete mute when silent        |
-| `speechGain`           | 1.0     | Unity gain when speaking         |
-| `minSNR`               | 2.0     | Voice must be 2x noise floor     |
-| `snrRange`             | 8.0     | Probability scales over SNR 2-10 |
+When `muteWhenSilent` is `true`, the library automatically calls `track.mute()` when silence is detected and `track.unmute()` when speech resumes (only if it muted the track itself).
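Read together, the new README sections above describe one controller per LiveKit track. The following sketch (not part of the published README) strings them into a single end-to-end example; the `room` object is assumed to be an already-connected LiveKit `Room`, and the option values are the README defaults.

```ts
import { LocalAudioTrack, Room } from "livekit-client";
import { attachSpeakingDetectionToTrack } from "@tensamin/audio";

// Assumption: `room` is connected elsewhere in the application.
declare const room: Room;

const localTrack = await LocalAudioTrack.create();

// Wrap the track: DeepFilterNet3 suppression + dB-based speaking detection.
const controller = await attachSpeakingDetectionToTrack(localTrack, {
  speaking: { minDb: -60, maxDb: -20, speakOnRatio: 0.6, speakOffRatio: 0.3 },
  noiseSuppression: { enabled: true, noiseReductionLevel: 60 },
  muteWhenSilent: true,
});

// onChange fires immediately with the current state and returns an unsubscribe fn.
const unsubscribe = controller.onChange(({ speaking, levelDb }) => {
  console.log(speaking ? "speaking" : "silent", levelDb.toFixed(1), "dB");
});

await room.localParticipant.publishTrack(localTrack);

// Later: adjust the speaking range at runtime, then tear everything down.
controller.setConfig({ speaking: { minDb: -55, maxDb: -20 } });
unsubscribe();
controller.dispose(); // restores the original MediaStreamTrack if it is still live
```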
package/dist/chunk-6BJ4XGSA.mjs
ADDED

@@ -0,0 +1,80 @@
+import {
+  createAudioPipeline
+} from "./chunk-K4J3UUOR.mjs";
+
+// src/livekit/integration.ts
+import "mitt";
+async function attachSpeakingDetectionToTrack(track, options = {}) {
+  if (!track) {
+    throw new Error(
+      "attachSpeakingDetectionToTrack requires a valid LocalAudioTrack"
+    );
+  }
+  const originalTrack = track.mediaStreamTrack;
+  if (!originalTrack || originalTrack.readyState === "ended") {
+    throw new Error("LocalAudioTrack has no live MediaStreamTrack to process");
+  }
+  const pipeline = await createAudioPipeline(originalTrack, options);
+  await track.replaceTrack(pipeline.processedTrack);
+  const listeners = /* @__PURE__ */ new Set();
+  let mutedByController = false;
+  let currentState = pipeline.state;
+  const speakingHandler = (state) => {
+    currentState = state;
+    listeners.forEach((listener) => listener(state));
+    if (options.muteWhenSilent) {
+      if (!state.speaking && !track.isMuted) {
+        track.mute().catch((error) => console.error("mute failed", error));
+        mutedByController = true;
+      }
+      if (state.speaking && mutedByController) {
+        track.unmute().catch((error) => console.error("unmute failed", error));
+        mutedByController = false;
+      }
+    }
+  };
+  pipeline.events.on("speakingChange", speakingHandler);
+  const errorHandler = (error) => {
+    console.error("Audio pipeline error", error);
+  };
+  pipeline.events.on("error", errorHandler);
+  const controller = {
+    get speaking() {
+      return currentState.speaking;
+    },
+    get levelDb() {
+      return currentState.levelDb;
+    },
+    onChange: (listener) => {
+      listeners.add(listener);
+      listener(currentState);
+      return () => listeners.delete(listener);
+    },
+    setConfig: (config) => {
+      pipeline.setConfig(config);
+      if (typeof config.muteWhenSilent === "boolean") {
+        options.muteWhenSilent = config.muteWhenSilent;
+      }
+    },
+    dispose: () => {
+      pipeline.events.off("speakingChange", speakingHandler);
+      pipeline.events.off("error", errorHandler);
+      listeners.clear();
+      if (mutedByController && !track.isMuted) {
+        track.unmute().catch((error) => console.error("unmute failed", error));
+        mutedByController = false;
+      }
+      pipeline.dispose();
+      if (originalTrack.readyState === "live") {
+        track.replaceTrack(originalTrack).catch((error) => {
+          console.error("Failed to restore original track", error);
+        });
+      }
+    }
+  };
+  return controller;
+}
+
+export {
+  attachSpeakingDetectionToTrack
+};
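Two behaviors of this controller are easy to miss when reading the chunk: `onChange` invokes the listener immediately and returns an unsubscribe function, and `setConfig` forwards `noiseSuppression` changes to the running DeepFilterNet3 processor. A small illustrative sketch, assuming a `controller` obtained via `attachSpeakingDetectionToTrack` as in the README quick start:

```ts
import { attachSpeakingDetectionToTrack } from "@tensamin/audio";

// Assumption: `controller` was created as in the quick start above.
declare const controller: Awaited<ReturnType<typeof attachSpeakingDetectionToTrack>>;

// The getters always reflect the most recently emitted state.
const stop = controller.onChange(({ speaking, levelDb }) => {
  console.log(speaking ? "speaking" : "silent", `${levelDb.toFixed(1)} dB`);
});

// Toggle suppression at runtime without rebuilding the audio graph;
// setConfig routes this to the DeepFilterNet3 processor.
function setSuppression(on: boolean, level = 60): void {
  controller.setConfig({
    noiseSuppression: { enabled: on, noiseReductionLevel: level },
  });
}

setSuppression(false);    // bypass suppression, keep the worklet loaded
setSuppression(true, 80); // re-enable with stronger reduction
stop();                   // detach the listener; dispose() later restores the mic track
```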
package/dist/chunk-AQ5RVY33.mjs
ADDED

@@ -0,0 +1,74 @@
+// src/vad/vad-state.ts
+var LevelBasedVAD = class {
+  config;
+  speaking = false;
+  pendingSpeechSince = null;
+  pendingSilenceSince = null;
+  constructor(config) {
+    this.config = {
+      minDb: config.minDb,
+      maxDb: config.maxDb,
+      speakOnRatio: config.speakOnRatio ?? 0.6,
+      speakOffRatio: config.speakOffRatio ?? 0.3,
+      hangoverMs: config.hangoverMs ?? 350,
+      attackMs: config.attackMs ?? 50,
+      releaseMs: config.releaseMs ?? 120
+    };
+  }
+  updateConfig(config) {
+    this.config = {
+      ...this.config,
+      ...config,
+      speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
+      speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
+      hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
+      attackMs: config.attackMs ?? this.config.attackMs,
+      releaseMs: config.releaseMs ?? this.config.releaseMs
+    };
+  }
+  process(levelDb, timestampMs) {
+    const {
+      minDb,
+      maxDb,
+      speakOnRatio,
+      speakOffRatio,
+      hangoverMs,
+      attackMs,
+      releaseMs
+    } = this.config;
+    const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
+    const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
+    if (!this.speaking) {
+      if (norm >= speakOnRatio) {
+        this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
+        if (timestampMs - this.pendingSpeechSince >= attackMs) {
+          this.speaking = true;
+          this.pendingSpeechSince = null;
+          this.pendingSilenceSince = null;
+        }
+      } else {
+        this.pendingSpeechSince = null;
+      }
+    } else {
+      if (norm <= speakOffRatio) {
+        this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
+        const releaseWindow = Math.max(releaseMs, hangoverMs);
+        if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
+          this.speaking = false;
+          this.pendingSilenceSince = null;
+          this.pendingSpeechSince = null;
+        }
+      } else {
+        this.pendingSilenceSince = null;
+      }
+    }
+    return {
+      speaking: this.speaking,
+      levelDb: clamped
+    };
+  }
+};
+
+export {
+  LevelBasedVAD
+};
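The class above implements plain hysteresis on a normalized level: the dB value is clamped to `[minDb, maxDb]`, mapped to 0-1, speech toggles on after the normalized level stays at or above `speakOnRatio` for `attackMs`, and toggles off after it stays at or below `speakOffRatio` for `max(releaseMs, hangoverMs)`. Below is a minimal sketch feeding synthetic levels through it; the import path is an assumption for illustration, since the class ships in an internal dist chunk rather than a documented entry point.

```ts
// Illustrative only: importing from an internal chunk path (assumed, not documented).
import { LevelBasedVAD } from "@tensamin/audio/dist/chunk-AQ5RVY33.mjs";

// Defaults: speakOnRatio 0.6, speakOffRatio 0.3, attackMs 50, hangoverMs 350.
const vad = new LevelBasedVAD({ minDb: -60, maxDb: -20 });

// Synthetic level trace, one sample every 20 ms: silence, a short speech burst, silence.
const levelsDb = [-70, -70, -30, -28, -27, -27, ...Array(19).fill(-60)];

let speaking = false;
levelsDb.forEach((levelDb, i) => {
  const state = vad.process(levelDb, i * 20); // timestamps in ms
  if (state.speaking !== speaking) {
    speaking = state.speaking;
    console.log(`${i * 20} ms: ${speaking ? "speech start" : "speech end"}`);
  }
});
// With this trace: "100 ms: speech start" (level held >= 0.6 of the range for 50 ms)
// and "480 ms: speech end" (level held <= 0.3 of the range for 350 ms).
```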
package/dist/chunk-IS37FHDN.mjs
ADDED

@@ -0,0 +1,33 @@
+// src/noise-suppression/deepfilternet-node.ts
+import { DeepFilterNet3Processor } from "deepfilternet3-noise-filter";
+async function createDeepFilterNet3Node(context, config) {
+  const processorConfig = {
+    sampleRate: context.sampleRate,
+    noiseReductionLevel: config?.noiseReductionLevel ?? 60
+  };
+  if (config?.assetConfig) {
+    processorConfig.assetConfig = config.assetConfig;
+  }
+  const processor = new DeepFilterNet3Processor(processorConfig);
+  await processor.initialize();
+  const node = await processor.createAudioWorkletNode(context);
+  const enabled = config?.enabled ?? true;
+  if (!enabled) {
+    processor.setNoiseSuppressionEnabled(false);
+  }
+  return {
+    node,
+    processor,
+    dispose: () => {
+      try {
+        processor.destroy();
+      } catch (error) {
+        console.error("Failed to dispose DeepFilterNet3 processor", error);
+      }
+    }
+  };
+}
+
+export {
+  createDeepFilterNet3Node
+};
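This factory is a thin wrapper around the third-party `deepfilternet3-noise-filter` processor: it builds the worklet node, optionally disables suppression, and returns `{ node, processor, dispose }`. The sketch below wires that node into a bare Web Audio graph outside the pipeline; the import path is assumed for illustration (the helper ships in an internal chunk), and `setSuppressionLevel` / `setNoiseSuppressionEnabled` are the processor methods the pipeline chunk itself calls.

```ts
// Illustrative only: internal helper, import path assumed rather than documented.
import { createDeepFilterNet3Node } from "@tensamin/audio/dist/chunk-IS37FHDN.mjs";

const context = new AudioContext();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

const { node, processor, dispose } = await createDeepFilterNet3Node(context, {
  enabled: true,
  noiseReductionLevel: 60, // 0-100
});

// mic -> DeepFilterNet3 worklet -> MediaStream destination
const source = context.createMediaStreamSource(stream);
const destination = context.createMediaStreamDestination();
source.connect(node);
node.connect(destination);

// destination.stream now carries the denoised audio.
processor.setSuppressionLevel(80);           // raise reduction strength later
processor.setNoiseSuppressionEnabled(false); // or bypass without tearing down

dispose(); // destroys the processor when finished
```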
package/dist/chunk-K4J3UUOR.mjs
ADDED

@@ -0,0 +1,178 @@
+import {
+  getAudioContext,
+  registerPipeline,
+  unregisterPipeline
+} from "./chunk-OZ7KMC4S.mjs";
+import {
+  createDeepFilterNet3Node
+} from "./chunk-IS37FHDN.mjs";
+import {
+  createLevelDetectorNode
+} from "./chunk-QNQK6QFB.mjs";
+import {
+  LevelBasedVAD
+} from "./chunk-AQ5RVY33.mjs";
+
+// src/pipeline/audio-pipeline.ts
+import mitt from "mitt";
+async function createAudioPipeline(sourceTrack, config = {}) {
+  const context = getAudioContext();
+  registerPipeline();
+  const nsConfig = {
+    enabled: config.noiseSuppression?.enabled ?? true,
+    noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
+  };
+  if (config.noiseSuppression?.assetConfig) {
+    nsConfig.assetConfig = config.noiseSuppression.assetConfig;
+  }
+  const fullConfig = {
+    noiseSuppression: nsConfig,
+    speaking: {
+      minDb: config.speaking?.minDb ?? -60,
+      maxDb: config.speaking?.maxDb ?? -20,
+      speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
+      speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
+      hangoverMs: config.speaking?.hangoverMs ?? 350,
+      attackMs: config.speaking?.attackMs ?? 50,
+      releaseMs: config.speaking?.releaseMs ?? 120
+    },
+    output: {
+      speechGain: config.output?.speechGain ?? 1,
+      silenceGain: config.output?.silenceGain ?? 0,
+      gainRampTime: config.output?.gainRampTime ?? 0.015,
+      maxGainDb: config.output?.maxGainDb ?? 6,
+      smoothTransitions: config.output?.smoothTransitions ?? true
+    },
+    muteWhenSilent: config.muteWhenSilent ?? false
+  };
+  if (!sourceTrack || sourceTrack.kind !== "audio") {
+    throw new Error(
+      "createAudioPipeline requires a valid audio MediaStreamTrack"
+    );
+  }
+  if (sourceTrack.readyState === "ended") {
+    throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
+  }
+  const sourceStream = new MediaStream([sourceTrack]);
+  const sourceNode = context.createMediaStreamSource(sourceStream);
+  const emitter = mitt();
+  const vad = new LevelBasedVAD(fullConfig.speaking);
+  let lastState = { speaking: false, levelDb: -Infinity };
+  const nsHandle = await createDeepFilterNet3Node(
+    context,
+    fullConfig.noiseSuppression
+  );
+  const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
+    try {
+      const timestamp = context.currentTime * 1e3;
+      const nextState = vad.process(levelDb, timestamp);
+      const speakingChanged = nextState.speaking !== lastState.speaking;
+      const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
+      if (speakingChanged || levelChanged) {
+        lastState = nextState;
+        updateGain(nextState);
+        emitter.emit("speakingChange", nextState);
+      }
+    } catch (error) {
+      const err = error instanceof Error ? error : new Error(String(error));
+      emitter.emit("error", err);
+    }
+  });
+  const splitter = context.createGain();
+  sourceNode.connect(nsHandle.node);
+  nsHandle.node.connect(splitter);
+  splitter.connect(levelHandle.node);
+  const gainNode = context.createGain();
+  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
+  splitter.connect(gainNode);
+  const destination = context.createMediaStreamDestination();
+  gainNode.connect(destination);
+  function updateGain(state) {
+    const {
+      speechGain = 1,
+      silenceGain = 0,
+      gainRampTime = 0.015,
+      smoothTransitions = true,
+      maxGainDb = 6
+    } = fullConfig.output ?? {};
+    const maxGainLinear = Math.pow(10, maxGainDb / 20);
+    const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
+    const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
+    const now = context.currentTime;
+    gainNode.gain.cancelScheduledValues(now);
+    gainNode.gain.setValueAtTime(gainNode.gain.value, now);
+    if (smoothTransitions) {
+      gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
+    } else {
+      gainNode.gain.setValueAtTime(target, now);
+    }
+  }
+  const audioTracks = destination.stream.getAudioTracks();
+  if (audioTracks.length === 0) {
+    nsHandle.dispose();
+    levelHandle.dispose();
+    unregisterPipeline();
+    throw new Error("Failed to create processed audio track");
+  }
+  const processedTrack = audioTracks[0];
+  function dispose() {
+    try {
+      sourceNode.disconnect();
+      nsHandle.node.disconnect();
+      splitter.disconnect();
+      levelHandle.node.disconnect();
+      gainNode.disconnect();
+      destination.stream.getTracks().forEach((t) => t.stop());
+      levelHandle.dispose();
+      nsHandle.dispose();
+    } catch (error) {
+      console.error("Error during pipeline disposal", error);
+    } finally {
+      unregisterPipeline();
+    }
+  }
+  const handle = {
+    processedTrack,
+    events: emitter,
+    get state() {
+      return lastState;
+    },
+    setConfig: (next) => {
+      try {
+        if (next.speaking) {
+          vad.updateConfig(next.speaking);
+          fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
+        }
+        if (next.output) {
+          fullConfig.output = { ...fullConfig.output, ...next.output };
+          updateGain(lastState);
+        }
+        if (next.noiseSuppression) {
+          const ns = next.noiseSuppression;
+          fullConfig.noiseSuppression = {
+            ...fullConfig.noiseSuppression,
+            ...ns
+          };
+          if (typeof ns.noiseReductionLevel === "number") {
+            nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
+          }
+          if (typeof ns.enabled === "boolean") {
+            nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
+          }
+        }
+        if (typeof next.muteWhenSilent === "boolean") {
+          fullConfig.muteWhenSilent = next.muteWhenSilent;
+        }
+      } catch (error) {
+        const err = error instanceof Error ? error : new Error(String(error));
+        emitter.emit("error", err);
+      }
+    },
+    dispose
+  };
+  return handle;
+}
+
+export {
+  createAudioPipeline
+};
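`createAudioPipeline` is the core of this release: source node, DeepFilterNet3 worklet, level detector, speaking-gated gain, and a `MediaStreamDestination`, with a `mitt` emitter for `speakingChange` and `error`. Below is a sketch of driving it directly, without LiveKit, assuming the function is still re-exported from the package root as it was in 0.1.x.

```ts
// Sketch only: assumes createAudioPipeline remains exported from the package root.
import { createAudioPipeline } from "@tensamin/audio";

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const [micTrack] = stream.getAudioTracks();

const pipeline = await createAudioPipeline(micTrack, {
  speaking: { minDb: -60, maxDb: -20 },
  noiseSuppression: { enabled: true, noiseReductionLevel: 60 },
  output: { silenceGain: 0.1 }, // duck silence instead of hard-muting it
});

// The processed track is denoised and gain-gated by the speaking state.
const processed = new MediaStream([pipeline.processedTrack]);

pipeline.events.on("speakingChange", ({ speaking, levelDb }) => {
  console.log(speaking ? "speaking" : "silent", "at", levelDb, "dB");
});
pipeline.events.on("error", (err) => console.error("pipeline error", err));

// Runtime tweak, then cleanup (stops the processed track; the mic track stays live).
pipeline.setConfig({ output: { speechGain: 1.2 } });
pipeline.dispose();
```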