@tensamin/audio 0.1.14 → 0.2.0

Files changed (49)
  1. package/README.md +48 -231
  2. package/dist/chunk-6BJ4XGSA.mjs +80 -0
  3. package/dist/chunk-AQ5RVY33.mjs +74 -0
  4. package/dist/chunk-IS37FHDN.mjs +33 -0
  5. package/dist/chunk-K4J3UUOR.mjs +178 -0
  6. package/dist/chunk-QNQK6QFB.mjs +71 -0
  7. package/dist/context/audio-context.d.mts +0 -24
  8. package/dist/context/audio-context.d.ts +0 -24
  9. package/dist/index.d.mts +2 -8
  10. package/dist/index.d.ts +2 -8
  11. package/dist/index.js +285 -680
  12. package/dist/index.mjs +8 -43
  13. package/dist/livekit/integration.d.mts +3 -7
  14. package/dist/livekit/integration.d.ts +3 -7
  15. package/dist/livekit/integration.js +280 -626
  16. package/dist/livekit/integration.mjs +7 -8
  17. package/dist/noise-suppression/deepfilternet-node.d.mts +12 -0
  18. package/dist/noise-suppression/deepfilternet-node.d.ts +12 -0
  19. package/dist/noise-suppression/deepfilternet-node.js +57 -0
  20. package/dist/noise-suppression/deepfilternet-node.mjs +6 -0
  21. package/dist/pipeline/audio-pipeline.d.mts +2 -2
  22. package/dist/pipeline/audio-pipeline.d.ts +2 -2
  23. package/dist/pipeline/audio-pipeline.js +219 -554
  24. package/dist/pipeline/audio-pipeline.mjs +4 -5
  25. package/dist/types.d.mts +42 -257
  26. package/dist/types.d.ts +42 -257
  27. package/dist/vad/vad-node.d.mts +7 -9
  28. package/dist/vad/vad-node.d.ts +7 -9
  29. package/dist/vad/vad-node.js +47 -156
  30. package/dist/vad/vad-node.mjs +3 -3
  31. package/dist/vad/vad-state.d.mts +9 -11
  32. package/dist/vad/vad-state.d.ts +9 -11
  33. package/dist/vad/vad-state.js +50 -79
  34. package/dist/vad/vad-state.mjs +3 -3
  35. package/package.json +21 -21
  36. package/dist/chunk-2G2JFHJY.mjs +0 -180
  37. package/dist/chunk-6F2HZUYO.mjs +0 -91
  38. package/dist/chunk-K4YLH73B.mjs +0 -103
  39. package/dist/chunk-R5M2DGAQ.mjs +0 -311
  40. package/dist/chunk-UFKIAMG3.mjs +0 -47
  41. package/dist/chunk-XO6B3D4A.mjs +0 -67
  42. package/dist/extensibility/plugins.d.mts +0 -9
  43. package/dist/extensibility/plugins.d.ts +0 -9
  44. package/dist/extensibility/plugins.js +0 -320
  45. package/dist/extensibility/plugins.mjs +0 -14
  46. package/dist/noise-suppression/rnnoise-node.d.mts +0 -10
  47. package/dist/noise-suppression/rnnoise-node.d.ts +0 -10
  48. package/dist/noise-suppression/rnnoise-node.js +0 -101
  49. package/dist/noise-suppression/rnnoise-node.mjs +0 -6
package/README.md CHANGED
@@ -1,16 +1,13 @@
  # @tensamin/audio

- Audio processing library for the web with RNNoise-based noise suppression and Voice Activity Detection (VAD). Designed for voice communication applications with LiveKit integration support.
+ DeepFilterNet3-based noise suppression and realtime speaking detection for LiveKit.

  ## Features

- - Configurable Voice Activity Detection with energy-based algorithm
- - RNNoise noise suppression via `@sapphi-red/web-noise-suppressor`
- - Automatic audio gating based on voice detection
- - Runtime configuration updates
- - LiveKit `LocalAudioTrack` integration
- - Plugin system for custom audio processors
- - Optional dynamic range compression
+ - DeepFilterNet3 WASM noise suppression
+ - Realtime `speaking` boolean + dB level
+ - Automatic mute/unmute for LiveKit tracks
+ - Simple min/max dB speaking thresholds

  ## Installation

@@ -18,261 +15,81 @@ Audio processing library for the web with RNNoise-based noise suppression and Vo
  npm install @tensamin/audio livekit-client
  ```

- ## Requirements
-
- For noise suppression, the following files must be provided:
-
- - `rnnoise.wasm`
- - `rnnoise_simd.wasm`
- - `worklet.js`
-
- Available at: `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`
-
- Place these files in a publicly accessible directory (e.g., `public/audio-processor/`).
-
- ## Usage
-
- ### Basic Example
+ ## Quick Start (LiveKit)

  ```ts
- import { createAudioPipeline } from "@tensamin/audio";
-
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
- const track = stream.getAudioTracks()[0];
-
- const pipeline = await createAudioPipeline(track, {
-   noiseSuppression: {
-     enabled: true,
-     wasmUrl: "/audio-processor/rnnoise.wasm",
-     simdUrl: "/audio-processor/rnnoise_simd.wasm",
-     workletUrl: "/audio-processor/worklet.js",
-   },
-   vad: { enabled: true },
- });
-
- const processedStream = new MediaStream([pipeline.processedTrack]);
- ```
-
- ### LiveKit Integration
-
- ```ts
- import { attachProcessingToTrack } from "@tensamin/audio";
  import { LocalAudioTrack } from "livekit-client";
+ import { attachSpeakingDetectionToTrack } from "@tensamin/audio";

  const localTrack = await LocalAudioTrack.create();

- const pipeline = await attachProcessingToTrack(localTrack, {
+ const controller = await attachSpeakingDetectionToTrack(localTrack, {
+   speaking: {
+     minDb: -60,
+     maxDb: -20,
+   },
    noiseSuppression: {
      enabled: true,
-     wasmUrl: "/audio-processor/rnnoise.wasm",
-     simdUrl: "/audio-processor/rnnoise_simd.wasm",
-     workletUrl: "/audio-processor/worklet.js",
    },
-   vad: { enabled: true },
-   livekit: { manageTrackMute: true },
+   muteWhenSilent: true,
  });

- await room.localParticipant.publishTrack(localTrack);
- ```
-
- ### Monitoring VAD State
-
- ```ts
- pipeline.events.on("vadChange", (state) => {
-   console.log("Speaking:", state.isSpeaking);
-   console.log("Probability:", state.probability);
-   console.log("State:", state.state);
+ controller.onChange((state) => {
+   console.log("speaking", state.speaking);
+   console.log("levelDb", state.levelDb);
  });
- ```

- ## Configuration
-
- ### Voice Activity Detection
-
- ```ts
- vad: {
-   enabled: boolean;
-   startThreshold: number; // Default: 0.6 (range: 0-1)
-   stopThreshold: number; // Default: 0.45 (range: 0-1)
-   hangoverMs: number; // Default: 400
-   preRollMs: number; // Default: 250
-   minSpeechDurationMs: number; // Default: 100
-   minSilenceDurationMs: number; // Default: 150
-   energyVad?: {
-     smoothing: number; // Default: 0.95
-     initialNoiseFloor: number; // Default: 0.001
-     noiseFloorAdaptRateQuiet: number; // Default: 0.002
-     noiseFloorAdaptRateLoud: number; // Default: 0.02
-     minSNR: number; // Default: 12.0 (dB)
-     snrRange: number; // Default: 10.0 (dB)
-     minEnergy: number; // Default: 0.003
-   };
- }
+ await room.localParticipant.publishTrack(localTrack);
  ```

- **Threshold Parameters:**
-
- - `startThreshold`: Probability threshold to unmute audio (Default: 0.8, ~18dB SNR)
- - `stopThreshold`: Probability threshold to mute audio (Default: 0.3, ~13dB SNR)
- - `hangoverMs`: Delay before muting after speech stops (Default: 300ms)
- - `preRollMs`: Audio buffer duration before speech onset
- - `minSpeechDurationMs`: Minimum duration to consider as valid speech (Default: 250ms)
- - `minSilenceDurationMs`: Minimum silence duration between speech segments
-
- **Energy VAD Parameters:**
+ ## Configuration

- - `smoothing`: Energy calculation smoothing factor (0-1)
- - `minSNR`: Minimum signal-to-noise ratio in dB for speech detection
- - `snrRange`: Range in dB for probability scaling from minSNR
- - `minEnergy`: Minimum absolute RMS energy to consider as speech (Default: 0.003, ~-50dB)
+ All options are passed via `LivekitSpeakingOptions` to `attachSpeakingDetectionToTrack`.

- ### Output Control
+ ### Noise suppression (DeepFilterNet3)

  ```ts
- output: {
-   speechGain: number; // Default: 1.0
-   silenceGain: number; // Default: 0.0
-   gainRampTime: number; // Default: 0.015 (seconds)
-   smoothTransitions: boolean; // Default: true
-   maxGainDb: number; // Default: 6.0
-   enableCompression: boolean; // Default: false
-   compression?: {
-     threshold: number; // Default: -24.0 (dB)
-     ratio: number; // Default: 3.0
-     attack: number; // Default: 0.003 (seconds)
-     release: number; // Default: 0.05 (seconds)
+ noiseSuppression: {
+   enabled?: boolean; // default: true
+   noiseReductionLevel?: number; // 0-100, default: 60
+   assetConfig?: {
+     cdnUrl?: string;
    };
  }
  ```

- **Gain Parameters:**
-
- - `speechGain`: Gain multiplier when speaking (1.0 = unity)
- - `silenceGain`: Gain multiplier when silent (0.0 = mute)
- - `gainRampTime`: Transition duration for gain changes
- - `maxGainDb`: Maximum gain limit to prevent clipping
-
- **Compression Parameters:**
-
- - `threshold`: Level above which compression is applied
- - `ratio`: Compression ratio (e.g., 3.0 = 3:1)
- - `attack`: Time to reach full compression
- - `release`: Time to release compression
-
- ### Runtime Configuration Updates
-
- ```ts
- pipeline.setConfig({
-   vad: {
-     startThreshold: 0.7,
-     stopThreshold: 0.55,
-   },
-   output: {
-     speechGain: 1.3,
-   },
- });
- ```
-
- ## Configuration Examples
-
- ### Noisy Environment
-
- ```ts
- {
-   vad: {
-     startThreshold: 0.7,
-     stopThreshold: 0.55,
-     minSpeechDurationMs: 150,
-     energyVad: { minSNR: 3.0 }
-   }
- }
- ```
-
- ### Quiet Speaker
+ ### Speaking detection (dB-based)

  ```ts
- {
-   vad: {
-     startThreshold: 0.4,
-     stopThreshold: 0.25,
-     energyVad: { minSNR: 1.5 }
-   },
-   output: {
-     speechGain: 1.5
-   }
+ speaking: {
+   minDb: number; // e.g. -60
+   maxDb: number; // e.g. -20
+   speakOnRatio?: number; // default: 0.6
+   speakOffRatio?: number; // default: 0.3
+   hangoverMs?: number; // default: 350
+   attackMs?: number; // default: 50
+   releaseMs?: number; // default: 120
  }
  ```

- ### Natural Conversation
+ `minDb` / `maxDb` define the dynamic range used for level normalization. `speakOnRatio` and `speakOffRatio` (0–1) control when speech starts/stops within that range (see the worked sketch after this README diff).

- ```ts
- {
-   vad: {
-     startThreshold: 0.5,
-     stopThreshold: 0.3,
-     hangoverMs: 600,
-   },
-   output: {
-     silenceGain: 0.2
-   }
- }
- ```
-
- ## API Reference
-
- ### `createAudioPipeline(track, config)`
-
- Creates an audio processing pipeline from a MediaStreamTrack.
-
- **Parameters:**
+ ### Output gain control

- - `track`: MediaStreamTrack - Source audio track
- - `config`: AudioProcessingConfig - Configuration object
-
- **Returns:** `Promise<AudioPipelineHandle>`
-
- ### AudioPipelineHandle
-
- ```ts
- interface AudioPipelineHandle {
-   processedTrack: MediaStreamTrack;
-   events: Emitter<AudioPipelineEvents>;
-   state: VADState;
-   setConfig(config: Partial<AudioProcessingConfig>): void;
-   dispose(): void;
+ ```ts
+ output: {
+   speechGain?: number; // default: 1.0
+   silenceGain?: number; // default: 0.0
+   gainRampTime?: number; // default: 0.015 (s)
+   maxGainDb?: number; // default: 6.0
+   smoothTransitions?: boolean; // default: true
  }
- ```
+ ```

- ### AudioPipelineEvents
+ ### LiveKit mute handling

  ```ts
- type AudioPipelineEvents = {
-   vadChange: VADState;
-   error: Error;
- };
- ```
-
- ### VADState
-
- ```ts
- interface VADState {
-   isSpeaking: boolean;
-   probability: number;
-   state: "silent" | "speech_starting" | "speaking" | "speech_ending";
- }
- ```
-
- ## Default Values
+ muteWhenSilent?: boolean; // default: false
+ ```

- | Parameter              | Default | Description                      |
- | ---------------------- | ------- | -------------------------------- |
- | `startThreshold`       | 0.6     | Unmute at 60% confidence         |
- | `stopThreshold`        | 0.45    | Mute below 45% confidence        |
- | `hangoverMs`           | 400     | Wait 400ms before muting         |
- | `preRollMs`            | 250     | Buffer 250ms before speech       |
- | `minSpeechDurationMs`  | 100     | Minimum valid speech duration    |
- | `minSilenceDurationMs` | 150     | Minimum silence between speech   |
- | `silenceGain`          | 0.0     | Complete mute when silent        |
- | `speechGain`           | 1.0     | Unity gain when speaking         |
- | `minSNR`               | 2.0     | Voice must be 2x noise floor     |
- | `snrRange`             | 8.0     | Probability scales over SNR 2-10 |
+ When `muteWhenSilent` is `true`, the library automatically calls `track.mute()` when silence is detected and `track.unmute()` when speech resumes (only if it muted the track itself).
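The new speaking detection reduces to a clamp, a normalization, and two hysteresis thresholds. A minimal standalone sketch of that arithmetic, mirroring the `LevelBasedVAD` logic visible in the chunk diffs below (illustration only, not a public API):

```ts
// Levels are clamped to [minDb, maxDb], normalized to 0..1, then compared
// against speakOnRatio / speakOffRatio (hysteresis: on threshold > off threshold).
const minDb = -60;
const maxDb = -20;

function normalize(levelDb: number): number {
  const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
  // Math.max(1, ...) guards against a zero or inverted dB range.
  return (clamped - minDb) / Math.max(1, maxDb - minDb);
}

// With the defaults (speakOnRatio 0.6, speakOffRatio 0.3) over a -60..-20 dB
// range: speech starts once the level holds at or above -60 + 0.6 * 40 = -36 dB
// for attackMs, and stops once it holds at or below -48 dB for the release window.
console.log(normalize(-36)); // 0.6 -> at the speak-on threshold
console.log(normalize(-48)); // 0.3 -> at the speak-off threshold
```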
package/dist/chunk-6BJ4XGSA.mjs ADDED
@@ -0,0 +1,80 @@
+ import {
+   createAudioPipeline
+ } from "./chunk-K4J3UUOR.mjs";
+
+ // src/livekit/integration.ts
+ import "mitt";
+ async function attachSpeakingDetectionToTrack(track, options = {}) {
+   if (!track) {
+     throw new Error(
+       "attachSpeakingDetectionToTrack requires a valid LocalAudioTrack"
+     );
+   }
+   const originalTrack = track.mediaStreamTrack;
+   if (!originalTrack || originalTrack.readyState === "ended") {
+     throw new Error("LocalAudioTrack has no live MediaStreamTrack to process");
+   }
+   const pipeline = await createAudioPipeline(originalTrack, options);
+   await track.replaceTrack(pipeline.processedTrack);
+   const listeners = /* @__PURE__ */ new Set();
+   let mutedByController = false;
+   let currentState = pipeline.state;
+   const speakingHandler = (state) => {
+     currentState = state;
+     listeners.forEach((listener) => listener(state));
+     if (options.muteWhenSilent) {
+       if (!state.speaking && !track.isMuted) {
+         track.mute().catch((error) => console.error("mute failed", error));
+         mutedByController = true;
+       }
+       if (state.speaking && mutedByController) {
+         track.unmute().catch((error) => console.error("unmute failed", error));
+         mutedByController = false;
+       }
+     }
+   };
+   pipeline.events.on("speakingChange", speakingHandler);
+   const errorHandler = (error) => {
+     console.error("Audio pipeline error", error);
+   };
+   pipeline.events.on("error", errorHandler);
+   const controller = {
+     get speaking() {
+       return currentState.speaking;
+     },
+     get levelDb() {
+       return currentState.levelDb;
+     },
+     onChange: (listener) => {
+       listeners.add(listener);
+       listener(currentState);
+       return () => listeners.delete(listener);
+     },
+     setConfig: (config) => {
+       pipeline.setConfig(config);
+       if (typeof config.muteWhenSilent === "boolean") {
+         options.muteWhenSilent = config.muteWhenSilent;
+       }
+     },
+     dispose: () => {
+       pipeline.events.off("speakingChange", speakingHandler);
+       pipeline.events.off("error", errorHandler);
+       listeners.clear();
+       if (mutedByController && !track.isMuted) {
+         track.unmute().catch((error) => console.error("unmute failed", error));
+         mutedByController = false;
+       }
+       pipeline.dispose();
+       if (originalTrack.readyState === "live") {
+         track.replaceTrack(originalTrack).catch((error) => {
+           console.error("Failed to restore original track", error);
+         });
+       }
+     }
+   };
+   return controller;
+ }
+
+ export {
+   attachSpeakingDetectionToTrack
+ };
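Taken together with the Quick Start above, the controller shape in this chunk implies the following lifecycle. A sketch, assuming it runs inside an async function with an active LiveKit session:

```ts
import { LocalAudioTrack } from "livekit-client";
import { attachSpeakingDetectionToTrack } from "@tensamin/audio";

const track = await LocalAudioTrack.create();
const controller = await attachSpeakingDetectionToTrack(track, {
  speaking: { minDb: -60, maxDb: -20 },
  muteWhenSilent: true,
});

// onChange invokes the listener immediately with the current state and
// returns an unsubscribe function (listeners are kept in a Set above).
const unsubscribe = controller.onChange((state) => {
  console.log("speaking:", state.speaking, "levelDb:", state.levelDb);
});

// dispose() unmutes the track only if the controller itself muted it, tears
// down the pipeline, and swaps the original MediaStreamTrack back in if it
// is still live.
unsubscribe();
controller.dispose();
```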
package/dist/chunk-AQ5RVY33.mjs ADDED
@@ -0,0 +1,74 @@
+ // src/vad/vad-state.ts
+ var LevelBasedVAD = class {
+   config;
+   speaking = false;
+   pendingSpeechSince = null;
+   pendingSilenceSince = null;
+   constructor(config) {
+     this.config = {
+       minDb: config.minDb,
+       maxDb: config.maxDb,
+       speakOnRatio: config.speakOnRatio ?? 0.6,
+       speakOffRatio: config.speakOffRatio ?? 0.3,
+       hangoverMs: config.hangoverMs ?? 350,
+       attackMs: config.attackMs ?? 50,
+       releaseMs: config.releaseMs ?? 120
+     };
+   }
+   updateConfig(config) {
+     this.config = {
+       ...this.config,
+       ...config,
+       speakOnRatio: config.speakOnRatio ?? this.config.speakOnRatio,
+       speakOffRatio: config.speakOffRatio ?? this.config.speakOffRatio,
+       hangoverMs: config.hangoverMs ?? this.config.hangoverMs,
+       attackMs: config.attackMs ?? this.config.attackMs,
+       releaseMs: config.releaseMs ?? this.config.releaseMs
+     };
+   }
+   process(levelDb, timestampMs) {
+     const {
+       minDb,
+       maxDb,
+       speakOnRatio,
+       speakOffRatio,
+       hangoverMs,
+       attackMs,
+       releaseMs
+     } = this.config;
+     const clamped = Math.min(maxDb, Math.max(minDb, levelDb));
+     const norm = (clamped - minDb) / Math.max(1, maxDb - minDb);
+     if (!this.speaking) {
+       if (norm >= speakOnRatio) {
+         this.pendingSpeechSince = this.pendingSpeechSince ?? timestampMs;
+         if (timestampMs - this.pendingSpeechSince >= attackMs) {
+           this.speaking = true;
+           this.pendingSpeechSince = null;
+           this.pendingSilenceSince = null;
+         }
+       } else {
+         this.pendingSpeechSince = null;
+       }
+     } else {
+       if (norm <= speakOffRatio) {
+         this.pendingSilenceSince = this.pendingSilenceSince ?? timestampMs;
+         const releaseWindow = Math.max(releaseMs, hangoverMs);
+         if (timestampMs - this.pendingSilenceSince >= releaseWindow) {
+           this.speaking = false;
+           this.pendingSilenceSince = null;
+           this.pendingSpeechSince = null;
+         }
+       } else {
+         this.pendingSilenceSince = null;
+       }
+     }
+     return {
+       speaking: this.speaking,
+       levelDb: clamped
+     };
+   }
+ };
+
+ export {
+   LevelBasedVAD
+ };
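To see the attack and hangover timing in action, `process` can be driven directly with synthetic levels and timestamps. Illustration only: `LevelBasedVAD` lives in a hash-named internal chunk, so the import path below is not a stable entry point:

```ts
// Not a documented entry point; hash-named chunks can change between releases.
import { LevelBasedVAD } from "@tensamin/audio/dist/chunk-AQ5RVY33.mjs";

const vad = new LevelBasedVAD({ minDb: -60, maxDb: -20 });

// A loud level (-30 dB, norm 0.75 >= 0.6) must persist for attackMs
// (default 50 ms) before `speaking` flips on:
console.log(vad.process(-30, 0).speaking);   // false (attack window opens)
console.log(vad.process(-30, 60).speaking);  // true  (60 ms >= 50 ms)

// A quiet level (-55 dB, norm 0.125 <= 0.3) must persist for
// max(releaseMs, hangoverMs) = 350 ms before it flips off:
console.log(vad.process(-55, 100).speaking); // true  (release window opens)
console.log(vad.process(-55, 460).speaking); // false (360 ms >= 350 ms)
```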
package/dist/chunk-IS37FHDN.mjs ADDED
@@ -0,0 +1,33 @@
+ // src/noise-suppression/deepfilternet-node.ts
+ import { DeepFilterNet3Processor } from "deepfilternet3-noise-filter";
+ async function createDeepFilterNet3Node(context, config) {
+   const processorConfig = {
+     sampleRate: context.sampleRate,
+     noiseReductionLevel: config?.noiseReductionLevel ?? 60
+   };
+   if (config?.assetConfig) {
+     processorConfig.assetConfig = config.assetConfig;
+   }
+   const processor = new DeepFilterNet3Processor(processorConfig);
+   await processor.initialize();
+   const node = await processor.createAudioWorkletNode(context);
+   const enabled = config?.enabled ?? true;
+   if (!enabled) {
+     processor.setNoiseSuppressionEnabled(false);
+   }
+   return {
+     node,
+     processor,
+     dispose: () => {
+       try {
+         processor.destroy();
+       } catch (error) {
+         console.error("Failed to dispose DeepFilterNet3 processor", error);
+       }
+     }
+   };
+ }
+
+ export {
+   createDeepFilterNet3Node
+ };
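The handle returned above can also be wired into a bare Web Audio graph. A sketch under two assumptions: that `createDeepFilterNet3Node` is reachable (it is an internal chunk export, not part of the documented surface), and that the `/df3/` asset path exists (it is purely illustrative):

```ts
// Route the microphone through the DeepFilterNet3 worklet node and out to
// the speakers, outside the package's own pipeline.
const context = new AudioContext();
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const source = context.createMediaStreamSource(stream);

const ns = await createDeepFilterNet3Node(context, {
  noiseReductionLevel: 80,          // 0-100, default 60
  assetConfig: { cdnUrl: "/df3/" }, // hypothetical self-hosted WASM location
});

source.connect(ns.node);
ns.node.connect(context.destination);

// Later: ns.dispose() destroys the underlying WASM processor.
```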
package/dist/chunk-K4J3UUOR.mjs ADDED
@@ -0,0 +1,178 @@
+ import {
+   getAudioContext,
+   registerPipeline,
+   unregisterPipeline
+ } from "./chunk-OZ7KMC4S.mjs";
+ import {
+   createDeepFilterNet3Node
+ } from "./chunk-IS37FHDN.mjs";
+ import {
+   createLevelDetectorNode
+ } from "./chunk-QNQK6QFB.mjs";
+ import {
+   LevelBasedVAD
+ } from "./chunk-AQ5RVY33.mjs";
+
+ // src/pipeline/audio-pipeline.ts
+ import mitt from "mitt";
+ async function createAudioPipeline(sourceTrack, config = {}) {
+   const context = getAudioContext();
+   registerPipeline();
+   const nsConfig = {
+     enabled: config.noiseSuppression?.enabled ?? true,
+     noiseReductionLevel: config.noiseSuppression?.noiseReductionLevel ?? 60
+   };
+   if (config.noiseSuppression?.assetConfig) {
+     nsConfig.assetConfig = config.noiseSuppression.assetConfig;
+   }
+   const fullConfig = {
+     noiseSuppression: nsConfig,
+     speaking: {
+       minDb: config.speaking?.minDb ?? -60,
+       maxDb: config.speaking?.maxDb ?? -20,
+       speakOnRatio: config.speaking?.speakOnRatio ?? 0.6,
+       speakOffRatio: config.speaking?.speakOffRatio ?? 0.3,
+       hangoverMs: config.speaking?.hangoverMs ?? 350,
+       attackMs: config.speaking?.attackMs ?? 50,
+       releaseMs: config.speaking?.releaseMs ?? 120
+     },
+     output: {
+       speechGain: config.output?.speechGain ?? 1,
+       silenceGain: config.output?.silenceGain ?? 0,
+       gainRampTime: config.output?.gainRampTime ?? 0.015,
+       maxGainDb: config.output?.maxGainDb ?? 6,
+       smoothTransitions: config.output?.smoothTransitions ?? true
+     },
+     muteWhenSilent: config.muteWhenSilent ?? false
+   };
+   if (!sourceTrack || sourceTrack.kind !== "audio") {
+     throw new Error(
+       "createAudioPipeline requires a valid audio MediaStreamTrack"
+     );
+   }
+   if (sourceTrack.readyState === "ended") {
+     throw new Error("Cannot create pipeline from an ended MediaStreamTrack");
+   }
+   const sourceStream = new MediaStream([sourceTrack]);
+   const sourceNode = context.createMediaStreamSource(sourceStream);
+   const emitter = mitt();
+   const vad = new LevelBasedVAD(fullConfig.speaking);
+   let lastState = { speaking: false, levelDb: -Infinity };
+   const nsHandle = await createDeepFilterNet3Node(
+     context,
+     fullConfig.noiseSuppression
+   );
+   const levelHandle = await createLevelDetectorNode(context, (levelDb) => {
+     try {
+       const timestamp = context.currentTime * 1e3;
+       const nextState = vad.process(levelDb, timestamp);
+       const speakingChanged = nextState.speaking !== lastState.speaking;
+       const levelChanged = Math.abs(nextState.levelDb - lastState.levelDb) > 0.5;
+       if (speakingChanged || levelChanged) {
+         lastState = nextState;
+         updateGain(nextState);
+         emitter.emit("speakingChange", nextState);
+       }
+     } catch (error) {
+       const err = error instanceof Error ? error : new Error(String(error));
+       emitter.emit("error", err);
+     }
+   });
+   const splitter = context.createGain();
+   sourceNode.connect(nsHandle.node);
+   nsHandle.node.connect(splitter);
+   splitter.connect(levelHandle.node);
+   const gainNode = context.createGain();
+   gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
+   splitter.connect(gainNode);
+   const destination = context.createMediaStreamDestination();
+   gainNode.connect(destination);
+   function updateGain(state) {
+     const {
+       speechGain = 1,
+       silenceGain = 0,
+       gainRampTime = 0.015,
+       smoothTransitions = true,
+       maxGainDb = 6
+     } = fullConfig.output ?? {};
+     const maxGainLinear = Math.pow(10, maxGainDb / 20);
+     const limitedSpeechGain = Math.min(speechGain ?? 1, maxGainLinear);
+     const target = state.speaking ? limitedSpeechGain : silenceGain ?? 0;
+     const now = context.currentTime;
+     gainNode.gain.cancelScheduledValues(now);
+     gainNode.gain.setValueAtTime(gainNode.gain.value, now);
+     if (smoothTransitions) {
+       gainNode.gain.setTargetAtTime(target, now, gainRampTime / 3);
+     } else {
+       gainNode.gain.setValueAtTime(target, now);
+     }
+   }
+   const audioTracks = destination.stream.getAudioTracks();
+   if (audioTracks.length === 0) {
+     nsHandle.dispose();
+     levelHandle.dispose();
+     unregisterPipeline();
+     throw new Error("Failed to create processed audio track");
+   }
+   const processedTrack = audioTracks[0];
+   function dispose() {
+     try {
+       sourceNode.disconnect();
+       nsHandle.node.disconnect();
+       splitter.disconnect();
+       levelHandle.node.disconnect();
+       gainNode.disconnect();
+       destination.stream.getTracks().forEach((t) => t.stop());
+       levelHandle.dispose();
+       nsHandle.dispose();
+     } catch (error) {
+       console.error("Error during pipeline disposal", error);
+     } finally {
+       unregisterPipeline();
+     }
+   }
+   const handle = {
+     processedTrack,
+     events: emitter,
+     get state() {
+       return lastState;
+     },
+     setConfig: (next) => {
+       try {
+         if (next.speaking) {
+           vad.updateConfig(next.speaking);
+           fullConfig.speaking = { ...fullConfig.speaking, ...next.speaking };
+         }
+         if (next.output) {
+           fullConfig.output = { ...fullConfig.output, ...next.output };
+           updateGain(lastState);
+         }
+         if (next.noiseSuppression) {
+           const ns = next.noiseSuppression;
+           fullConfig.noiseSuppression = {
+             ...fullConfig.noiseSuppression,
+             ...ns
+           };
+           if (typeof ns.noiseReductionLevel === "number") {
+             nsHandle.processor.setSuppressionLevel(ns.noiseReductionLevel);
+           }
+           if (typeof ns.enabled === "boolean") {
+             nsHandle.processor.setNoiseSuppressionEnabled(ns.enabled);
+           }
+         }
+         if (typeof next.muteWhenSilent === "boolean") {
+           fullConfig.muteWhenSilent = next.muteWhenSilent;
+         }
+       } catch (error) {
+         const err = error instanceof Error ? error : new Error(String(error));
+         emitter.emit("error", err);
+       }
+     },
+     dispose
+   };
+   return handle;
+ }
+
+ export {
+   createAudioPipeline
+ };
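The pipeline can also be used without LiveKit. A sketch, assuming `createAudioPipeline` is still re-exported from the package root (as it was in 0.1.x) and that this runs in an async browser context:

```ts
import { createAudioPipeline } from "@tensamin/audio";

const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const [micTrack] = stream.getAudioTracks();

const pipeline = await createAudioPipeline(micTrack, {
  speaking: { minDb: -60, maxDb: -20 },
  output: { silenceGain: 0.1 }, // duck silence instead of hard-muting it
});

pipeline.events.on("speakingChange", (state) => {
  console.log("speaking:", state.speaking, "levelDb:", state.levelDb);
});

// The processed track behaves like any other MediaStreamTrack.
const audioEl = new Audio();
audioEl.srcObject = new MediaStream([pipeline.processedTrack]);
await audioEl.play();

// Thresholds can be adjusted at runtime without rebuilding the graph.
pipeline.setConfig({ speaking: { speakOnRatio: 0.7 } });

// pipeline.dispose() stops the processed track and tears down the nodes.
```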