@tensamin/audio 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,90 +1,56 @@
1
1
  # @tensamin/audio
2
2
 
3
- A audio processing library for the web, featuring RNNoise-based noise suppression and robust Voice Activity Detection (VAD). Designed for seamless integration with LiveKit.
3
+ Audio processing library for the web with RNNoise-based noise suppression and Voice Activity Detection (VAD). Designed for voice communication applications with LiveKit integration support.
4
4
 
5
5
  ## Features
6
6
 
7
- - **Noise Suppression**: Uses `@sapphi-red/web-noise-suppressor` (RNNoise) for high-quality noise reduction.
8
- - **Robust VAD**: Energy-based VAD with hysteresis, hangover, and pre-roll buffering to prevent cutting off speech onset.
9
- - **Intelligent Muting**: Automatically gates audio or mutes LiveKit tracks when silent.
10
- - **LiveKit Integration**: Good support for `LocalAudioTrack`.
11
- - **Extensible**: Plugin system for custom WASM/Worklet processors.
7
+ - Configurable Voice Activity Detection with energy-based algorithm
8
+ - RNNoise noise suppression via `@sapphi-red/web-noise-suppressor`
9
+ - Automatic audio gating based on voice detection
10
+ - Runtime configuration updates
11
+ - LiveKit `LocalAudioTrack` integration
12
+ - Plugin system for custom audio processors
13
+ - Optional dynamic range compression
12
14
 
13
15
  ## Installation
14
16
 
15
17
  ```bash
16
18
  npm install @tensamin/audio livekit-client
17
- bun add @tensamin/audio livekit-client
18
- pnpm install @tensamin/audio livekit-client
19
19
  ```
20
20
 
21
- ## Setup Assets
21
+ ## Requirements
22
22
 
23
- This library uses WASM and AudioWorklets for processing. **Asset setup is optional** - the pipeline can run in passthrough mode without them.
24
-
25
- ### For Noise Suppression (Optional)
26
-
27
- If you want to enable noise suppression, download these files from `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`:
23
+ For noise suppression, the following files must be provided:
28
24
 
29
25
  - `rnnoise.wasm`
30
26
  - `rnnoise_simd.wasm`
31
- - `noise-suppressor-worklet.min.js`
27
+ - `worklet.js`
32
28
 
33
- Place them in your project's public directory (e.g., `public/audio-processor/`).
29
+ Available at: `https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/`
34
30
 
35
- **Note:** The pipeline will automatically disable noise suppression if these URLs are not provided, and will use passthrough mode instead.
31
+ Place these files in a publicly accessible directory (e.g., `public/audio-processor/`).
36
32
 
37
33
  ## Usage
38
34
 
39
- ### Minimal Setup (Passthrough Mode)
40
-
41
- If you want to use the pipeline without noise suppression or VAD (e.g., for testing or when features are not needed), you can disable them:
35
+ ### Basic Example
42
36
 
43
37
  ```ts
44
38
  import { createAudioPipeline } from "@tensamin/audio";
45
39
 
46
- // Get a stream
47
40
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
48
41
  const track = stream.getAudioTracks()[0];
49
42
 
50
- // Create pipeline
51
- const pipeline = await createAudioPipeline(track, {
52
- noiseSuppression: { enabled: false },
53
- vad: { enabled: false },
54
- });
55
-
56
- // Use the processed track
57
- const processedStream = new MediaStream([pipeline.processedTrack]);
58
- ```
59
-
60
- ### Basic Usage (Raw MediaStream)
61
-
62
- ```ts
63
- import { createAudioPipeline } from "@tensamin/audio";
64
-
65
- // Get a stream
66
- const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
67
- const track = stream.getAudioTracks()[0];
68
-
69
- // Create pipeline
70
43
  const pipeline = await createAudioPipeline(track, {
71
44
  noiseSuppression: {
72
45
  enabled: true,
73
46
  wasmUrl: "/audio-processor/rnnoise.wasm",
74
47
  simdUrl: "/audio-processor/rnnoise_simd.wasm",
75
- workletUrl: "/audio-processor/noise-suppressor-worklet.min.js",
48
+ workletUrl: "/audio-processor/worklet.js",
76
49
  },
77
50
  vad: { enabled: true },
78
51
  });
79
52
 
80
- // Use the processed track
81
53
  const processedStream = new MediaStream([pipeline.processedTrack]);
82
- // audioElement.srcObject = processedStream;
83
-
84
- // Listen to VAD events
85
- pipeline.events.on("vadChange", (state) => {
86
- console.log("Is Speaking:", state.isSpeaking);
87
- });
88
54
  ```
89
55
 
90
56
  ### LiveKit Integration
@@ -93,21 +59,218 @@ pipeline.events.on("vadChange", (state) => {
93
59
  import { attachProcessingToTrack } from "@tensamin/audio";
94
60
  import { LocalAudioTrack } from "livekit-client";
95
61
 
96
- // Assume you have a LocalAudioTrack
97
62
  const localTrack = await LocalAudioTrack.create();
98
63
 
99
- // Attach processing (replaces the underlying track)
100
64
  const pipeline = await attachProcessingToTrack(localTrack, {
101
65
  noiseSuppression: {
102
66
  enabled: true,
103
67
  wasmUrl: "/audio-processor/rnnoise.wasm",
104
68
  simdUrl: "/audio-processor/rnnoise_simd.wasm",
105
- workletUrl: "/audio-processor/noise-suppressor-worklet.min.js",
69
+ workletUrl: "/audio-processor/worklet.js",
106
70
  },
107
71
  vad: { enabled: true },
108
- livekit: { manageTrackMute: true }, // Optional: mute the track object itself
72
+ livekit: { manageTrackMute: true },
109
73
  });
110
74
 
111
- // Publish the track
112
75
  await room.localParticipant.publishTrack(localTrack);
113
76
  ```
77
+
78
+ ### Monitoring VAD State
79
+
80
+ ```ts
81
+ pipeline.events.on("vadChange", (state) => {
82
+ console.log("Speaking:", state.isSpeaking);
83
+ console.log("Probability:", state.probability);
84
+ console.log("State:", state.state);
85
+ });
86
+ ```
87
+
88
+ ## Configuration
89
+
90
+ ### Voice Activity Detection
91
+
92
+ ```ts
93
+ vad: {
94
+ enabled: boolean;
95
+ startThreshold: number; // Default: 0.6 (range: 0-1)
96
+ stopThreshold: number; // Default: 0.45 (range: 0-1)
97
+ hangoverMs: number; // Default: 400
98
+ preRollMs: number; // Default: 250
99
+ minSpeechDurationMs: number; // Default: 100
100
+ minSilenceDurationMs: number; // Default: 150
101
+ energyVad?: {
102
+ smoothing: number; // Default: 0.95
103
+ initialNoiseFloor: number; // Default: 0.001
104
+ noiseFloorAdaptRateQuiet: number; // Default: 0.01
105
+ noiseFloorAdaptRateLoud: number; // Default: 0.001
106
+ minSNR: number; // Default: 2.0
107
+ snrRange: number; // Default: 8.0
108
+ };
109
+ }
110
+ ```
111
+
112
+ **Threshold Parameters:**
113
+
114
+ - `startThreshold`: Probability threshold to unmute audio
115
+ - `stopThreshold`: Probability threshold to mute audio (after hangover)
116
+ - `hangoverMs`: Delay before muting after speech stops
117
+ - `preRollMs`: Audio buffer duration before speech onset
118
+ - `minSpeechDurationMs`: Minimum duration to consider as valid speech
119
+ - `minSilenceDurationMs`: Minimum silence duration between speech segments
120
+
121
+ **Energy VAD Parameters:**
122
+
123
+ - `smoothing`: Energy calculation smoothing factor (0-1)
124
+ - `minSNR`: Minimum signal-to-noise ratio for speech detection
125
+ - `snrRange`: Range for probability scaling from minSNR
126
+
127
+ ### Output Control
128
+
129
+ ```ts
130
+ output: {
131
+ speechGain: number; // Default: 1.0
132
+ silenceGain: number; // Default: 0.0
133
+ gainRampTime: number; // Default: 0.015 (seconds)
134
+ smoothTransitions: boolean; // Default: true
135
+ maxGainDb: number; // Default: 6.0
136
+ enableCompression: boolean; // Default: false
137
+ compression?: {
138
+ threshold: number; // Default: -24.0 (dB)
139
+ ratio: number; // Default: 3.0
140
+ attack: number; // Default: 0.003 (seconds)
141
+ release: number; // Default: 0.05 (seconds)
142
+ };
143
+ }
144
+ ```
145
+
146
+ **Gain Parameters:**
147
+
148
+ - `speechGain`: Gain multiplier when speaking (1.0 = unity)
149
+ - `silenceGain`: Gain multiplier when silent (0.0 = mute)
150
+ - `gainRampTime`: Transition duration for gain changes
151
+ - `maxGainDb`: Maximum gain limit to prevent clipping
152
+
153
+ **Compression Parameters:**
154
+
155
+ - `threshold`: Level above which compression is applied
156
+ - `ratio`: Compression ratio (e.g., 3.0 = 3:1)
157
+ - `attack`: Time to reach full compression
158
+ - `release`: Time to release compression
159
+
160
+ ### Runtime Configuration Updates
161
+
162
+ ```ts
163
+ pipeline.setConfig({
164
+ vad: {
165
+ startThreshold: 0.7,
166
+ stopThreshold: 0.55,
167
+ },
168
+ output: {
169
+ speechGain: 1.3,
170
+ },
171
+ });
172
+ ```
173
+
174
+ ## Configuration Examples
175
+
176
+ ### Noisy Environment
177
+
178
+ ```ts
179
+ {
180
+ vad: {
181
+ startThreshold: 0.7,
182
+ stopThreshold: 0.55,
183
+ minSpeechDurationMs: 150,
184
+ energyVad: { minSNR: 3.0 }
185
+ }
186
+ }
187
+ ```
188
+
189
+ ### Quiet Speaker
190
+
191
+ ```ts
192
+ {
193
+ vad: {
194
+ startThreshold: 0.4,
195
+ stopThreshold: 0.25,
196
+ energyVad: { minSNR: 1.5 }
197
+ },
198
+ output: {
199
+ speechGain: 1.5
200
+ }
201
+ }
202
+ ```
203
+
204
+ ### Natural Conversation
205
+
206
+ ```ts
207
+ {
208
+ vad: {
209
+ startThreshold: 0.5,
210
+ stopThreshold: 0.3,
211
+ hangoverMs: 600,
212
+ },
213
+ output: {
214
+ silenceGain: 0.2
215
+ }
216
+ }
217
+ ```
218
+
219
+ ## API Reference
220
+
221
+ ### `createAudioPipeline(track, config)`
222
+
223
+ Creates an audio processing pipeline from a MediaStreamTrack.
224
+
225
+ **Parameters:**
226
+
227
+ - `track`: MediaStreamTrack - Source audio track
228
+ - `config`: AudioProcessingConfig - Configuration object
229
+
230
+ **Returns:** `Promise<AudioPipelineHandle>`
231
+
232
+ ### AudioPipelineHandle
233
+
234
+ ```ts
235
+ interface AudioPipelineHandle {
236
+ processedTrack: MediaStreamTrack;
237
+ events: Emitter<AudioPipelineEvents>;
238
+ state: VADState;
239
+ setConfig(config: Partial<AudioProcessingConfig>): void;
240
+ dispose(): void;
241
+ }
242
+ ```
243
+
244
+ ### AudioPipelineEvents
245
+
246
+ ```ts
247
+ type AudioPipelineEvents = {
248
+ vadChange: VADState;
249
+ error: Error;
250
+ };
251
+ ```
252
+
253
+ ### VADState
254
+
255
+ ```ts
256
+ interface VADState {
257
+ isSpeaking: boolean;
258
+ probability: number;
259
+ state: "silent" | "speech_starting" | "speaking" | "speech_ending";
260
+ }
261
+ ```
262
+
263
+ ## Default Values
264
+
265
+ | Parameter | Default | Description |
266
+ | ---------------------- | ------- | -------------------------------- |
267
+ | `startThreshold` | 0.6 | Unmute at 60% confidence |
268
+ | `stopThreshold` | 0.45 | Mute below 45% confidence |
269
+ | `hangoverMs` | 400 | Wait 400ms before muting |
270
+ | `preRollMs` | 250 | Buffer 250ms before speech |
271
+ | `minSpeechDurationMs` | 100 | Minimum valid speech duration |
272
+ | `minSilenceDurationMs` | 150 | Minimum silence between speech |
273
+ | `silenceGain` | 0.0 | Complete mute when silent |
274
+ | `speechGain` | 1.0 | Unity gain when speaking |
275
+ | `minSNR` | 2.0 | Voice must be 2x noise floor |
276
+ | `snrRange` | 8.0 | Probability scales over SNR 2-10 |
@@ -1,6 +1,6 @@
1
1
  import {
2
2
  createAudioPipeline
3
- } from "./chunk-EXH2PNUE.mjs";
3
+ } from "./chunk-XXTNAUYX.mjs";
4
4
 
5
5
  // src/livekit/integration.ts
6
6
  async function attachProcessingToTrack(track, config = {}) {
@@ -3,7 +3,7 @@ import {
3
3
  } from "./chunk-XO6B3D4A.mjs";
4
4
  import {
5
5
  EnergyVADPlugin
6
- } from "./chunk-R5JVHKWA.mjs";
6
+ } from "./chunk-VEJXAEMM.mjs";
7
7
 
8
8
  // src/extensibility/plugins.ts
9
9
  var nsPlugins = /* @__PURE__ */ new Map();
@@ -0,0 +1,93 @@
1
+ // src/vad/vad-state.ts
2
/**
 * Hysteresis state machine that turns a stream of per-frame speech
 * probabilities into a speaking / silent decision.
 *
 * Timestamps passed to `processFrame` are compared against each other with
 * plain subtraction and against the `*Ms` config values, so they are expected
 * to be in milliseconds. The constructor seeds `lastSilenceTime` from
 * `Date.now()`; if callers use a different clock, the first frame below
 * `startThreshold` re-seeds it from the caller's timestamp.
 */
var VADStateMachine = class {
  config;
  currentState = "silent";
  lastSpeechTime = 0;
  speechStartTime = 0;
  lastSilenceTime = 0;
  // Nominal frame duration in ms. No method of this class reads or updates it;
  // it is kept only so the object shape stays unchanged.
  frameDurationMs = 20;

  /**
   * @param {object} [config] Partial VAD configuration; every missing
   *   (null/undefined) field falls back to the default listed below.
   */
  constructor(config) {
    const energy = config?.energyVad;
    this.config = {
      enabled: config?.enabled ?? true,
      pluginName: config?.pluginName ?? "energy-vad",
      // Probability at or above which silence may turn into speech.
      startThreshold: config?.startThreshold ?? 0.6,
      // Probability at or above which ongoing speech is kept alive.
      stopThreshold: config?.stopThreshold ?? 0.45,
      // How long speech is held after the last frame >= stopThreshold.
      hangoverMs: config?.hangoverMs ?? 400,
      preRollMs: config?.preRollMs ?? 250,
      minSpeechDurationMs: config?.minSpeechDurationMs ?? 100,
      minSilenceDurationMs: config?.minSilenceDurationMs ?? 150,
      energyVad: {
        smoothing: energy?.smoothing ?? 0.95,
        initialNoiseFloor: energy?.initialNoiseFloor ?? 1e-3,
        noiseFloorAdaptRateQuiet: energy?.noiseFloorAdaptRateQuiet ?? 0.01,
        noiseFloorAdaptRateLoud: energy?.noiseFloorAdaptRateLoud ?? 1e-3,
        minSNR: energy?.minSNR ?? 2,
        snrRange: energy?.snrRange ?? 8
      }
    };
    this.lastSilenceTime = Date.now();
  }

  /**
   * Merges a partial configuration into the current one.
   *
   * Unlike a plain object spread, the nested `energyVad` object is merged
   * key by key, so `updateConfig({ energyVad: { minSNR: 3 } })` keeps the
   * other energy parameters. Keys whose value is `undefined` are ignored so
   * they cannot erase an existing setting.
   *
   * @param {object} [config] Partial VAD configuration.
   */
  updateConfig(config) {
    if (config == null) return;
    const { energyVad, ...topLevel } = config;
    const next = {
      ...this.config,
      energyVad: { ...this.config.energyVad }
    };
    for (const [key, value] of Object.entries(topLevel)) {
      if (value !== undefined) next[key] = value;
    }
    if (energyVad != null) {
      for (const [key, value] of Object.entries(energyVad)) {
        if (value !== undefined) next.energyVad[key] = value;
      }
    }
    this.config = next;
  }

  /**
   * Advances the state machine by one frame.
   *
   * The transitional values "speech_starting" and "speech_ending" are
   * collapsed before returning, so the reported `state` is always either
   * "speaking" or "silent".
   *
   * @param {number} probability Speech probability for this frame.
   * @param {number} timestamp Time of this frame (same clock for all calls).
   * @returns {{isSpeaking: boolean, probability: number, state: string}}
   */
  processFrame(probability, timestamp) {
    const {
      startThreshold,
      stopThreshold,
      hangoverMs,
      minSpeechDurationMs,
      minSilenceDurationMs
    } = this.config;
    let newState = this.currentState;
    if (this.currentState === "silent" || this.currentState === "speech_ending") {
      if (probability >= startThreshold) {
        // Only start speech once enough silence has passed since the last
        // silent frame; lastSilenceTime is deliberately NOT refreshed here so
        // sustained loud input eventually satisfies the check.
        const silenceDuration = timestamp - this.lastSilenceTime;
        if (silenceDuration >= minSilenceDurationMs) {
          newState = "speech_starting";
          this.speechStartTime = timestamp;
          this.lastSpeechTime = timestamp;
        } else {
          newState = "silent";
        }
      } else {
        newState = "silent";
        this.lastSilenceTime = timestamp;
      }
    } else if (this.currentState === "speech_starting" || this.currentState === "speaking") {
      if (probability >= stopThreshold) {
        newState = "speaking";
        this.lastSpeechTime = timestamp;
      } else {
        const timeSinceSpeech = timestamp - this.lastSpeechTime;
        const speechDuration = timestamp - this.speechStartTime;
        if (timeSinceSpeech < hangoverMs) {
          // Hangover: keep speaking through short dips.
          newState = "speaking";
        } else if (speechDuration < minSpeechDurationMs) {
          newState = "silent";
          this.lastSilenceTime = timestamp;
        } else {
          newState = "speech_ending";
          this.lastSilenceTime = timestamp;
        }
      }
    }
    // Collapse transitional states into their steady-state equivalents.
    if (newState === "speech_starting") newState = "speaking";
    if (newState === "speech_ending") newState = "silent";
    this.currentState = newState;
    return {
      isSpeaking: newState === "speaking",
      probability,
      state: newState
    };
  }
};
90
+
91
+ export {
92
+ VADStateMachine
93
+ };
@@ -0,0 +1,136 @@
1
+ // src/vad/vad-node.ts
2
/**
 * Builds the source text of the `energy-vad-processor` AudioWorklet module.
 *
 * The energy parameters are baked into the generated source as numeric
 * literals. Each value is therefore validated first: only finite numbers
 * (or strings that parse to one) are accepted, anything else falls back to
 * the default. This keeps arbitrary text out of the generated code and
 * guarantees the module always parses.
 *
 * @param {object} [vadConfig] VAD configuration; only `energyVad` is read.
 * @returns {string} JavaScript source to load via `audioWorklet.addModule`.
 */
var createEnergyVadWorkletCode = (vadConfig) => {
  const energyParams = vadConfig?.energyVad || {};
  // Returns `value` as a finite number, or `fallback` when it is not one.
  const numericParam = (value, fallback) => {
    if (typeof value !== "number" && typeof value !== "string") return fallback;
    if (typeof value === "string" && value.trim() === "") return fallback;
    const parsed = Number(value);
    return Number.isFinite(parsed) ? parsed : fallback;
  };
  const smoothing = numericParam(energyParams.smoothing, 0.95);
  const initialNoiseFloor = numericParam(energyParams.initialNoiseFloor, 1e-3);
  const noiseFloorAdaptRateQuiet = numericParam(energyParams.noiseFloorAdaptRateQuiet, 0.01);
  const noiseFloorAdaptRateLoud = numericParam(energyParams.noiseFloorAdaptRateLoud, 1e-3);
  const minSNR = numericParam(energyParams.minSNR, 2);
  const snrRange = numericParam(energyParams.snrRange, 8);
  return `
    class EnergyVadProcessor extends AudioWorkletProcessor {
      constructor() {
        super();
        // NOTE: smoothing and energy are stored but process() does not read them.
        this.smoothing = ${smoothing};
        this.energy = 0;
        this.noiseFloor = ${initialNoiseFloor};
        this.noiseFloorAdaptRateQuiet = ${noiseFloorAdaptRateQuiet};
        this.noiseFloorAdaptRateLoud = ${noiseFloorAdaptRateLoud};
        this.minSNR = ${minSNR};
        this.snrRange = ${snrRange};
        this.isSpeaking = false;

        // The main thread reports the current speaking decision back so the
        // noise floor can be frozen while speech is active.
        this.port.onmessage = (event) => {
          if (event.data && event.data.isSpeaking !== undefined) {
            this.isSpeaking = event.data.isSpeaking;
          }
        };
      }

      process(inputs, outputs, parameters) {
        const input = inputs[0];
        if (!input || !input.length) return true;
        const channel = input[0];

        // Calculate RMS (Root Mean Square) energy of the first channel
        let sum = 0;
        for (let i = 0; i < channel.length; i++) {
          sum += channel[i] * channel[i];
        }
        const rms = Math.sqrt(sum / channel.length);

        // A non-finite RMS (empty frame or NaN samples) would permanently
        // turn the noise floor into NaN, so skip such frames entirely.
        if (!Number.isFinite(rms)) return true;

        // Adaptive noise floor estimation - ONLY while not speaking,
        // so the floor does not rise during speech.
        if (!this.isSpeaking) {
          if (rms < this.noiseFloor) {
            this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateQuiet) + rms * this.noiseFloorAdaptRateQuiet;
          } else {
            // Louder than the floor but not (yet) classified as speech:
            // adapt with the separate "loud" rate.
            this.noiseFloor = this.noiseFloor * (1 - this.noiseFloorAdaptRateLoud) + rms * this.noiseFloorAdaptRateLoud;
          }
        }

        // Signal-to-Noise Ratio; the epsilon avoids division by zero.
        const snr = rms / (this.noiseFloor + 1e-6);

        // Map SNR to probability (0-1):
        //   0 when SNR <= minSNR, 1 when SNR >= minSNR + snrRange,
        //   linear in between. With a non-positive snrRange the ramp is
        //   undefined (division by zero), so use a hard step at minSNR.
        const probability = this.snrRange > 0
          ? Math.min(1, Math.max(0, (snr - this.minSNR) / this.snrRange))
          : (snr >= this.minSNR ? 1 : 0);

        this.port.postMessage({ probability, snr, noiseFloor: this.noiseFloor, rms });

        return true;
      }
    }
    registerProcessor('energy-vad-processor', EnergyVadProcessor);
  `;
};
72
/**
 * VAD plugin that runs the energy-based detector inside an AudioWorklet and
 * forwards each frame's speech probability to a callback.
 */
var EnergyVADPlugin = class {
  name = "energy-vad";
  // The most recently created worklet node, or null if none was created.
  workletNode = null;

  /**
   * Creates the audio node for this plugin.
   *
   * When `config.enabled` is falsy a plain GainNode is returned and no
   * worklet is loaded. Otherwise the worklet source is loaded from a
   * temporary blob URL and an `energy-vad-processor` node is created.
   *
   * @param {BaseAudioContext} context Audio context that owns the node.
   * @param {object} config VAD configuration (`enabled`, `energyVad`, ...).
   * @param {(probability: number) => void} onDecision Called with every
   *   valid (numeric, non-NaN) probability posted by the worklet.
   * @returns {Promise<AudioNode>} The worklet node, or a GainNode if disabled.
   * @throws {Error} If the module cannot be loaded or the node cannot be
   *   constructed; the original exception is available as `error.cause`.
   */
  async createNode(context, config, onDecision) {
    if (!config?.enabled) {
      console.log("VAD disabled, using passthrough node");
      return context.createGain();
    }
    const workletCode = createEnergyVadWorkletCode(config);
    const blob = new Blob([workletCode], {
      type: "application/javascript"
    });
    const url = URL.createObjectURL(blob);
    try {
      await context.audioWorklet.addModule(url);
      console.log("Energy VAD worklet loaded successfully");
    } catch (e) {
      const error = new Error(
        `Failed to load Energy VAD worklet: ${e instanceof Error ? e.message : String(e)}`,
        { cause: e }
      );
      console.error(error.message);
      throw error;
    } finally {
      // The blob URL is only needed while addModule runs; release it on
      // both the success and the failure path.
      URL.revokeObjectURL(url);
    }
    let node;
    try {
      node = new AudioWorkletNode(context, "energy-vad-processor");
      this.workletNode = node;
      console.log("Energy VAD node created successfully");
    } catch (e) {
      const error = new Error(
        `Failed to create Energy VAD node: ${e instanceof Error ? e.message : String(e)}`,
        { cause: e }
      );
      console.error(error.message);
      throw error;
    }
    node.port.onmessage = (event) => {
      // Errors thrown by onDecision are logged here rather than propagated
      // out of the message handler.
      try {
        const { probability } = event.data;
        if (typeof probability === "number" && !Number.isNaN(probability)) {
          onDecision(probability);
        } else {
          console.warn("Invalid VAD probability received:", event.data);
        }
      } catch (error) {
        console.error("Error in VAD message handler:", error);
      }
    };
    node.port.onmessageerror = (event) => {
      console.error("VAD port message error:", event);
    };
    return node;
  }

  /**
   * Forwards the current speaking decision to the worklet node, if one has
   * been created. Does nothing otherwise.
   *
   * @param {boolean} isSpeaking Current speaking state.
   */
  updateSpeakingState(isSpeaking) {
    if (this.workletNode) {
      this.workletNode.port.postMessage({ isSpeaking });
    }
  }
};
133
+
134
+ export {
135
+ EnergyVADPlugin
136
+ };