@tensamin/audio 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +23 -0
- package/LICENSE +21 -0
- package/README.md +66 -0
- package/package.json +44 -0
- package/src/context/audio-context.ts +69 -0
- package/src/extensibility/plugins.ts +45 -0
- package/src/index.ts +8 -0
- package/src/livekit/integration.ts +61 -0
- package/src/noise-suppression/rnnoise-node.ts +62 -0
- package/src/pipeline/audio-pipeline.ts +154 -0
- package/src/types.ts +167 -0
- package/src/vad/vad-node.ts +78 -0
- package/src/vad/vad-state.ts +71 -0
- package/tsconfig.json +29 -0
package/.github/workflows/publish.yml
ADDED
@@ -0,0 +1,23 @@
+name: Publish
+
+on:
+  workflow_dispatch:
+
+permissions:
+  contents: write
+  id-token: write
+
+jobs:
+  publish:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-node@v4
+
+      - name: Publish
+        run: |
+          npm ci
+          npm run build
+          npm pack
+          echo "//registry.npmjs.org/:_authToken=${{ secrets.NPM_TOKEN }}" > ~/.npmrc
+          npm publish --access public
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) [2025] [Methanium]
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,66 @@
+# @tensamin/audio
+
+An audio processing library for the web, featuring RNNoise-based noise suppression and robust Voice Activity Detection (VAD). Designed for seamless integration with LiveKit.
+
+## Features
+
+- **Noise Suppression**: Uses `@sapphi-red/web-noise-suppressor` (RNNoise) for high-quality noise reduction.
+- **Robust VAD**: Energy-based VAD with hysteresis, hangover, and pre-roll buffering to prevent cutting off speech onset.
+- **Intelligent Muting**: Automatically gates audio or mutes LiveKit tracks when silent.
+- **LiveKit Integration**: First-class support for `LocalAudioTrack`.
+- **Extensible**: Plugin system for custom WASM/Worklet processors.
+
+## Installation
+
+```bash
+npm install @tensamin/audio livekit-client
+bun add @tensamin/audio livekit-client
+pnpm add @tensamin/audio livekit-client
+```
+
+## Usage
+
+### Basic Usage (Raw MediaStream)
+
+```ts
+import { createAudioPipeline } from "@tensamin/audio";
+
+// Get a stream
+const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+const track = stream.getAudioTracks()[0];
+
+// Create pipeline
+const pipeline = await createAudioPipeline(track, {
+  noiseSuppression: { enabled: true },
+  vad: { enabled: true },
+});
+
+// Use the processed track
+const processedStream = new MediaStream([pipeline.processedTrack]);
+// audioElement.srcObject = processedStream;
+
+// Listen to VAD events
+pipeline.events.on("vadChange", (state) => {
+  console.log("Is Speaking:", state.isSpeaking);
+});
+```
+
+### LiveKit Integration
+
+```ts
+import { attachProcessingToTrack } from "@tensamin/audio";
+import { createLocalAudioTrack } from "livekit-client";
+
+// Create a LocalAudioTrack
+const localTrack = await createLocalAudioTrack();
+
+// Attach processing (replaces the underlying track)
+const pipeline = await attachProcessingToTrack(localTrack, {
+  noiseSuppression: { enabled: true },
+  vad: { enabled: true },
+  livekit: { manageTrackMute: true }, // Optional: mute the track object itself
+});
+
+// Publish the track
+await room.localParticipant.publishTrack(localTrack);
+```
package/package.json
ADDED
@@ -0,0 +1,44 @@
+{
+  "name": "@tensamin/audio",
+  "version": "0.1.0",
+  "module": "dist/index.mjs",
+  "types": "dist/index.d.ts",
+  "type": "module",
+  "author": {
+    "email": "aloisianer@proton.me",
+    "name": "Alois"
+  },
+  "publishConfig": {
+    "registry": "https://registry.npmjs.org",
+    "access": "public"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/Tensamin/Audio"
+  },
+  "exports": {
+    ".": {
+      "import": "./dist/index.mjs",
+      "types": "./dist/index.d.ts"
+    }
+  },
+  "scripts": {
+    "build": "tsup src/index.ts --format esm --dts --clean",
+    "dev": "tsup src/index.ts --format esm --dts --watch",
+    "format": "bunx prettier --write ."
+  },
+  "dependencies": {
+    "@sapphi-red/web-noise-suppressor": "^0.3.5",
+    "mitt": "^3.0.1"
+  },
+  "peerDependencies": {
+    "livekit-client": "^2.0.0"
+  },
+  "devDependencies": {
+    "@types/bun": "latest",
+    "@types/web": "^0.0.298",
+    "livekit-client": "^2.16.0",
+    "tsup": "^8.5.1",
+    "typescript": "^5.9.3"
+  }
+}
package/src/context/audio-context.ts
ADDED
@@ -0,0 +1,69 @@
+/**
+ * Manages a shared AudioContext for the application.
+ */
+
+let sharedContext: AudioContext | null = null;
+let activePipelines = 0;
+
+/**
+ * Gets the shared AudioContext, creating it if necessary.
+ * @param options Optional AudioContextOptions
+ */
+export function getAudioContext(options?: AudioContextOptions): AudioContext {
+  if (typeof window === "undefined" || typeof AudioContext === "undefined") {
+    throw new Error(
+      "AudioContext is not supported in this environment (browser only).",
+    );
+  }
+
+  if (!sharedContext || sharedContext.state === "closed") {
+    sharedContext = new AudioContext(options);
+  }
+
+  return sharedContext;
+}
+
+/**
+ * Registers a pipeline usage. Keeps track of active users.
+ */
+export function registerPipeline(): void {
+  activePipelines++;
+}
+
+/**
+ * Unregisters a pipeline usage.
+ * Optionally closes the context if no pipelines are active (not implemented by default to avoid churn).
+ */
+export function unregisterPipeline(): void {
+  activePipelines = Math.max(0, activePipelines - 1);
+}
+
+/**
+ * Resumes the shared AudioContext.
+ * Should be called in response to a user gesture.
+ */
+export async function resumeAudioContext(): Promise<void> {
+  if (sharedContext && sharedContext.state === "suspended") {
+    await sharedContext.resume();
+  }
+}
+
+/**
+ * Suspends the shared AudioContext.
+ */
+export async function suspendAudioContext(): Promise<void> {
+  if (sharedContext && sharedContext.state === "running") {
+    await sharedContext.suspend();
+  }
+}
+
+/**
+ * Closes the shared AudioContext and releases resources.
+ */
+export async function closeAudioContext(): Promise<void> {
+  if (sharedContext && sharedContext.state !== "closed") {
+    await sharedContext.close();
+  }
+  sharedContext = null;
+  activePipelines = 0;
+}
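Note: browsers keep a freshly created AudioContext suspended until a user gesture, which is what `resumeAudioContext` above is for. A minimal sketch of wiring it up (the button id is a placeholder, not part of the package):

```ts
import { resumeAudioContext } from "@tensamin/audio";

// Hypothetical UI hook: browsers generally refuse to start audio
// processing until the user has interacted with the page.
document.getElementById("join-call")?.addEventListener("click", async () => {
  await resumeAudioContext(); // no-op unless the shared context is suspended
});
```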
package/src/extensibility/plugins.ts
ADDED
@@ -0,0 +1,45 @@
+import type { NoiseSuppressionPlugin, VADPlugin } from "../types.js";
+import { RNNoisePlugin } from "../noise-suppression/rnnoise-node.js";
+import { EnergyVADPlugin } from "../vad/vad-node.js";
+
+const nsPlugins = new Map<string, NoiseSuppressionPlugin>();
+const vadPlugins = new Map<string, VADPlugin>();
+
+// Register defaults
+const defaultNs = new RNNoisePlugin();
+nsPlugins.set(defaultNs.name, defaultNs);
+
+const defaultVad = new EnergyVADPlugin();
+vadPlugins.set(defaultVad.name, defaultVad);
+
+export function registerNoiseSuppressionPlugin(plugin: NoiseSuppressionPlugin) {
+  nsPlugins.set(plugin.name, plugin);
+}
+
+export function registerVADPlugin(plugin: VADPlugin) {
+  vadPlugins.set(plugin.name, plugin);
+}
+
+export function getNoiseSuppressionPlugin(
+  name?: string,
+): NoiseSuppressionPlugin {
+  if (!name) return defaultNs;
+  const plugin = nsPlugins.get(name);
+  if (!plugin) {
+    console.warn(
+      `Noise suppression plugin '${name}' not found, falling back to default.`,
+    );
+    return defaultNs;
+  }
+  return plugin;
+}
+
+export function getVADPlugin(name?: string): VADPlugin {
+  if (!name) return defaultVad;
+  const plugin = vadPlugins.get(name);
+  if (!plugin) {
+    console.warn(`VAD plugin '${name}' not found, falling back to default.`);
+    return defaultVad;
+  }
+  return plugin;
+}
```
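Note: the registry above is how custom processors slot in. A hedged sketch of registering and selecting a custom VAD plugin against the `VADPlugin` interface; the `AnalyserNode`-based detector is purely illustrative (and never stops its polling timer, so a real plugin would add teardown):

```ts
import { registerVADPlugin, createAudioPipeline } from "@tensamin/audio";
import type { VADPlugin } from "@tensamin/audio";

// Illustrative plugin: derives a crude speech probability from an AnalyserNode.
const analyserVad: VADPlugin = {
  name: "analyser-vad",
  async createNode(context, _config, onDecision) {
    const analyser = context.createAnalyser();
    const buf = new Float32Array(analyser.fftSize);
    const tick = () => {
      analyser.getFloatTimeDomainData(buf);
      const rms = Math.sqrt(buf.reduce((s, x) => s + x * x, 0) / buf.length);
      onDecision(Math.min(1, rms * 20)); // crude energy -> probability mapping
      setTimeout(tick, 20); // poll roughly every audio frame
    };
    tick();
    return analyser;
  },
};

registerVADPlugin(analyserVad);

// Select it by name; unknown names fall back to the default with a warning.
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const pipeline = await createAudioPipeline(stream.getAudioTracks()[0]!, {
  vad: { enabled: true, pluginName: "analyser-vad" },
});
```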
package/src/index.ts
ADDED
@@ -0,0 +1,8 @@
+export * from "./types.js";
+export * from "./context/audio-context.js";
+export * from "./pipeline/audio-pipeline.js";
+export * from "./livekit/integration.js";
+export * from "./extensibility/plugins.js";
+export * from "./noise-suppression/rnnoise-node.js";
+export * from "./vad/vad-node.js";
+export * from "./vad/vad-state.js";
package/src/livekit/integration.ts
ADDED
@@ -0,0 +1,61 @@
+import type { LocalAudioTrack } from "livekit-client";
+import { createAudioPipeline } from "../pipeline/audio-pipeline.js";
+import type { AudioPipelineHandle, AudioProcessingConfig } from "../types.js";
+
+/**
+ * Attaches the audio processing pipeline to a LiveKit LocalAudioTrack.
+ * This replaces the underlying MediaStreamTrack with the processed one.
+ */
+export async function attachProcessingToTrack(
+  track: LocalAudioTrack,
+  config: AudioProcessingConfig = {},
+): Promise<AudioPipelineHandle> {
+  // 1. Get the original track
+  const originalTrack = track.mediaStreamTrack;
+
+  // 2. Create pipeline
+  const pipeline = await createAudioPipeline(originalTrack, config);
+
+  // 3. Replace the track in LiveKit
+  // Use replaceTrack, the public API for swapping the underlying MediaStreamTrack.
+  await track.replaceTrack(pipeline.processedTrack);
+
+  // 4. Handle intelligent muting if enabled
+  if (config.livekit?.manageTrackMute) {
+    let isVadMuted = false;
+
+    pipeline.events.on("vadChange", async (state) => {
+      if (state.isSpeaking) {
+        if (isVadMuted) {
+          // Only unmute if we were the ones who muted the track.
+          // If the user muted manually, track.isMuted is also true,
+          // so the separate isVadMuted flag tells the two cases apart
+          // and guarantees the VAD never overrides a manual mute:
+          // we only unmute what we muted.
+          await track.unmute();
+          isVadMuted = false;
+        }
+      } else {
+        // Silence
+        if (!track.isMuted) {
+          await track.mute();
+          isVadMuted = true;
+        }
+      }
+    });
+  }
+
+  // 5. Handle cleanup
+  const originalDispose = pipeline.dispose;
+  pipeline.dispose = () => {
+    // Restore the original track if it is still live, so the
+    // LocalAudioTrack keeps working after the pipeline is removed;
+    // otherwise there is nothing to restore and we only clean up.
+    if (originalTrack.readyState === "live") {
+      track.replaceTrack(originalTrack).catch(console.error);
+    }
+    originalDispose();
+  };
+
+  return pipeline;
+}
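Note: because `dispose` is wrapped above, removing processing does not require unpublishing. A short usage sketch (`localTrack` is a LocalAudioTrack as in the README example):

```ts
// Attach, use, then detach cleanly.
const pipeline = await attachProcessingToTrack(localTrack, {
  vad: { enabled: true },
});

// ...later: restores the original mic track onto the LocalAudioTrack
// (if it is still live) and releases the pipeline's audio nodes.
pipeline.dispose();
```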
package/src/noise-suppression/rnnoise-node.ts
ADDED
@@ -0,0 +1,62 @@
+import {
+  RnnoiseWorkletNode,
+  loadRnnoise,
+} from "@sapphi-red/web-noise-suppressor";
+import type {
+  AudioProcessingConfig,
+  NoiseSuppressionPlugin,
+} from "../types.js";
+
+// Default URLs (can be overridden by config)
+// These defaults assume the assets are served from the same origin or a known CDN.
+// In a real package, we might want to bundle them or require the user to provide them.
+const DEFAULT_WASM_URL =
+  "https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/rnnoise.wasm";
+const DEFAULT_SIMD_WASM_URL =
+  "https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/rnnoise_simd.wasm";
+const DEFAULT_WORKLET_URL =
+  "https://unpkg.com/@sapphi-red/web-noise-suppressor@0.3.5/dist/noise-suppressor-worklet.min.js";
+
+export class RNNoisePlugin implements NoiseSuppressionPlugin {
+  name = "rnnoise-ns";
+  private wasmBuffer: ArrayBuffer | null = null;
+
+  async createNode(
+    context: AudioContext,
+    config: AudioProcessingConfig["noiseSuppression"],
+  ): Promise<AudioNode> {
+    if (!config?.enabled) {
+      // Return a passthrough gain node if disabled but requested (though the pipeline usually handles this)
+      const pass = context.createGain();
+      return pass;
+    }
+
+    // 1. Load WASM if not loaded
+    // The library's loader handles SIMD detection when given both URLs.
+    // Note: loadRnnoise resolves to an ArrayBuffer containing the selected binary.
+    if (!this.wasmBuffer) {
+      this.wasmBuffer = await loadRnnoise({
+        url: config.wasmUrl || DEFAULT_WASM_URL,
+        simdUrl: DEFAULT_SIMD_WASM_URL, // TODO: make this configurable too; the default is fine for now.
+      });
+    }
+
+    // 2. Load Worklet
+    const workletUrl = config.workletUrl || DEFAULT_WORKLET_URL;
+
+    try {
+      await context.audioWorklet.addModule(workletUrl);
+    } catch (e) {
+      console.warn("Failed to add RNNoise worklet module:", e);
+      // Proceeding, assuming it might be already loaded.
+    }
+
+    // 3. Create Node
+    const node = new RnnoiseWorkletNode(context, {
+      wasmBinary: this.wasmBuffer,
+      maxChannels: 1, // Mono for now
+    });
+
+    return node;
+  }
+}
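Note: since the defaults above fetch WASM and worklet assets from unpkg at runtime, deployments that disallow third-party CDNs can point `wasmUrl`/`workletUrl` at self-hosted copies. A sketch (the `/assets/...` paths are placeholders; `track` is a microphone MediaStreamTrack as in the README):

```ts
import { createAudioPipeline } from "@tensamin/audio";

const pipeline = await createAudioPipeline(track, {
  noiseSuppression: {
    enabled: true,
    // Placeholder paths: copy the files from
    // @sapphi-red/web-noise-suppressor/dist into your static assets.
    wasmUrl: "/assets/rnnoise.wasm",
    workletUrl: "/assets/noise-suppressor-worklet.min.js",
  },
});
```

Per the TODO in the source, the SIMD binary URL is not yet configurable, so on SIMD-capable browsers the loader still resolves it against unpkg.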
package/src/pipeline/audio-pipeline.ts
ADDED
@@ -0,0 +1,154 @@
+import mitt from "mitt";
+import {
+  getAudioContext,
+  registerPipeline,
+  unregisterPipeline,
+} from "../context/audio-context.js";
+import {
+  getNoiseSuppressionPlugin,
+  getVADPlugin,
+} from "../extensibility/plugins.js";
+import { VADStateMachine } from "../vad/vad-state.js";
+import type {
+  AudioPipelineEvents,
+  AudioPipelineHandle,
+  AudioProcessingConfig,
+  VADState,
+} from "../types.js";
+
+export async function createAudioPipeline(
+  sourceTrack: MediaStreamTrack,
+  config: AudioProcessingConfig = {},
+): Promise<AudioPipelineHandle> {
+  const context = getAudioContext();
+  registerPipeline();
+
+  // Defaults
+  const fullConfig: AudioProcessingConfig = {
+    noiseSuppression: { enabled: true, ...config.noiseSuppression },
+    vad: { enabled: true, ...config.vad },
+    output: {
+      speechGain: 1.0,
+      silenceGain: 0.0,
+      gainRampTime: 0.02,
+      ...config.output,
+    },
+    livekit: { manageTrackMute: false, ...config.livekit },
+  };
+
+  // 1. Source
+  const sourceStream = new MediaStream([sourceTrack]);
+  const sourceNode = context.createMediaStreamSource(sourceStream);
+
+  // 2. Noise Suppression
+  const nsPlugin = getNoiseSuppressionPlugin(
+    fullConfig.noiseSuppression?.pluginName,
+  );
+  const nsNode = await nsPlugin.createNode(
+    context,
+    fullConfig.noiseSuppression,
+  );
+
+  // 3. VAD
+  const vadPlugin = getVADPlugin(fullConfig.vad?.pluginName);
+  const vadStateMachine = new VADStateMachine(fullConfig.vad);
+  const emitter = mitt<AudioPipelineEvents>();
+
+  const vadNode = await vadPlugin.createNode(
+    context,
+    fullConfig.vad,
+    (prob) => {
+      const timestamp = context.currentTime * 1000;
+      const newState = vadStateMachine.processFrame(prob, timestamp);
+
+      // Emit only on a state change or a significant probability jump
+      // to keep the event volume manageable.
+      if (
+        newState.state !== lastVadState.state ||
+        Math.abs(newState.probability - lastVadState.probability) > 0.1
+      ) {
+        emitter.emit("vadChange", newState);
+        lastVadState = newState;
+        updateGain(newState);
+      }
+    },
+  );
+
+  let lastVadState: VADState = {
+    isSpeaking: false,
+    probability: 0,
+    state: "silent",
+  };
+
+  // 4. Pipeline Wiring
+  // Source -> NS -> Splitter
+  // Splitter -> VAD
+  // Splitter -> Delay -> Gain -> Destination
+
+  const splitter = context.createGain(); // Using Gain as splitter (fan-out)
+
+  sourceNode.connect(nsNode);
+  nsNode.connect(splitter);
+
+  // Path 1: VAD
+  splitter.connect(vadNode);
+  // The VAD node only analyses; it is never connected to the destination.
+
+  // Path 2: Audio Output
+  const delayNode = context.createDelay(1.0); // Max 1 sec
+  const preRollSeconds = (fullConfig.vad?.preRollMs ?? 200) / 1000;
+  delayNode.delayTime.value = preRollSeconds;
+
+  const gainNode = context.createGain();
+  gainNode.gain.value = fullConfig.output?.silenceGain ?? 0;
+
+  const destination = context.createMediaStreamDestination();
+
+  splitter.connect(delayNode);
+  delayNode.connect(gainNode);
+  gainNode.connect(destination);
+
+  // Helper to update gain
+  function updateGain(state: VADState) {
+    const { speechGain, silenceGain, gainRampTime } = fullConfig.output!;
+    const targetGain = state.isSpeaking
+      ? (speechGain ?? 1.0)
+      : (silenceGain ?? 0.0);
+
+    // Ramp to target
+    const now = context.currentTime;
+    gainNode.gain.setTargetAtTime(targetGain, now, gainRampTime ?? 0.02);
+  }
+
+  // Handle disposal
+  function dispose() {
+    sourceNode.disconnect();
+    nsNode.disconnect();
+    splitter.disconnect();
+    vadNode.disconnect();
+    delayNode.disconnect();
+    gainNode.disconnect();
+
+    // Stop tracks? No, we don't own the source track.
+    // But we own the destination track.
+    destination.stream.getTracks().forEach((t) => t.stop());
+
+    unregisterPipeline();
+  }
+
+  return {
+    processedTrack: destination.stream.getAudioTracks()[0]!,
+    events: emitter,
+    get state() {
+      return lastVadState;
+    },
+    setConfig: (newConfig) => {
+      // TODO: Implement runtime config updates
+      // For now, just update the VAD state machine config
+      if (newConfig.vad) {
+        vadStateMachine.updateConfig(newConfig.vad);
+      }
+    },
+    dispose,
+  };
+}
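Note: as the `setConfig` stub above shows, only the `vad` block is applied at runtime today; `output` and `noiseSuppression` changes would require rebuilding the pipeline. A sketch (`track` as in the README):

```ts
import { createAudioPipeline } from "@tensamin/audio";

const pipeline = await createAudioPipeline(track, { vad: { enabled: true } });

// Tighten the VAD at runtime: raise the onset threshold and lengthen
// the hangover. Only these vad options take effect (see the TODO above).
pipeline.setConfig({
  vad: { enabled: true, startThreshold: 0.6, hangoverMs: 500 },
});
```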
package/src/types.ts
ADDED
@@ -0,0 +1,167 @@
+import type { LocalAudioTrack, TrackPublication } from "livekit-client";
+import type { Emitter } from "mitt";
+
+/**
+ * Configuration for the audio processing pipeline.
+ */
+export interface AudioProcessingConfig {
+  /**
+   * Noise suppression configuration.
+   */
+  noiseSuppression?: {
+    enabled: boolean;
+    /**
+     * Path or URL to the RNNoise WASM binary.
+     * If not provided, the default from @sapphi-red/web-noise-suppressor will be used (if bundler supports it).
+     */
+    wasmUrl?: string;
+    /**
+     * Path or URL to the RNNoise worklet script.
+     */
+    workletUrl?: string;
+    /**
+     * Plugin name to use. Defaults to 'rnnoise-ns'.
+     */
+    pluginName?: string;
+  };
+
+  /**
+   * Voice Activity Detection (VAD) configuration.
+   */
+  vad?: {
+    enabled: boolean;
+    /**
+     * Plugin name to use. Defaults to 'rnnoise-vad' or 'energy-vad'.
+     */
+    pluginName?: string;
+    /**
+     * Probability threshold for speech onset (0-1).
+     * Default: 0.5
+     */
+    startThreshold?: number;
+    /**
+     * Probability threshold for speech offset (0-1).
+     * Default: 0.4
+     */
+    stopThreshold?: number;
+    /**
+     * Time in ms to wait after speech stops before considering it silent.
+     * Default: 300ms
+     */
+    hangoverMs?: number;
+    /**
+     * Time in ms of audio to buffer before speech onset to avoid cutting the start.
+     * Default: 200ms
+     */
+    preRollMs?: number;
+  };
+
+  /**
+   * Output gain and muting configuration.
+   */
+  output?: {
+    /**
+     * Gain to apply when speaking (0-1+). Default: 1.0
+     */
+    speechGain?: number;
+    /**
+     * Gain to apply when silent (0-1). Default: 0.0 (mute)
+     */
+    silenceGain?: number;
+    /**
+     * Time in seconds to ramp gain changes. Default: 0.02
+     */
+    gainRampTime?: number;
+  };
+
+  /**
+   * LiveKit integration configuration.
+   */
+  livekit?: {
+    /**
+     * Whether to call track.mute()/unmute() on the LocalAudioTrack based on VAD.
+     * This saves bandwidth but has more signaling overhead.
+     * Default: false (uses gain gating only)
+     */
+    manageTrackMute?: boolean;
+  };
+}
+
+/**
+ * Represents the state of Voice Activity Detection.
+ */
+export interface VADState {
+  /**
+   * Whether speech is currently detected (after hysteresis).
+   */
+  isSpeaking: boolean;
+  /**
+   * Raw probability of speech from the VAD model (0-1).
+   */
+  probability: number;
+  /**
+   * Current state enum.
+   */
+  state: "silent" | "speech_starting" | "speaking" | "speech_ending";
+}
+
+/**
+ * Events emitted by the audio pipeline.
+ */
+export type AudioPipelineEvents = {
+  vadChange: VADState;
+  error: Error;
+};
+
+/**
+ * Handle to a running audio processing pipeline.
+ */
+export interface AudioPipelineHandle {
+  /**
+   * The processed MediaStreamTrack.
+   */
+  readonly processedTrack: MediaStreamTrack;
+
+  /**
+   * Event emitter for VAD state and errors.
+   */
+  readonly events: Emitter<AudioPipelineEvents>;
+
+  /**
+   * Current VAD state.
+   */
+  readonly state: VADState;
+
+  /**
+   * Update configuration at runtime.
+   */
+  setConfig(config: Partial<AudioProcessingConfig>): void;
+
+  /**
+   * Stop processing and release resources.
+   */
+  dispose(): void;
+}
+
+/**
+ * Interface for a Noise Suppression Plugin.
+ */
+export interface NoiseSuppressionPlugin {
+  name: string;
+  createNode(
+    context: AudioContext,
+    config: AudioProcessingConfig["noiseSuppression"],
+  ): Promise<AudioNode>;
+}
+
+/**
+ * Interface for a VAD Plugin.
+ */
+export interface VADPlugin {
+  name: string;
+  createNode(
+    context: AudioContext,
+    config: AudioProcessingConfig["vad"],
+    onDecision: (probability: number) => void,
+  ): Promise<AudioNode>;
+}
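Note: pulling the documented defaults together in one place, this config spells out every default explicitly and so behaves the same as passing `{}` (values taken from the doc comments above and the pipeline defaults):

```ts
import type { AudioProcessingConfig } from "@tensamin/audio";

// Every value matches a documented default.
const defaults: AudioProcessingConfig = {
  noiseSuppression: { enabled: true, pluginName: "rnnoise-ns" },
  vad: {
    enabled: true,
    startThreshold: 0.5, // speech onset (hysteresis upper bound)
    stopThreshold: 0.4, // speech offset (hysteresis lower bound)
    hangoverMs: 300, // keep "speaking" briefly after speech stops
    preRollMs: 200, // delay-line length so speech onsets aren't clipped
  },
  output: { speechGain: 1.0, silenceGain: 0.0, gainRampTime: 0.02 },
  livekit: { manageTrackMute: false },
};
```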
package/src/vad/vad-node.ts
ADDED
@@ -0,0 +1,78 @@
+import type { AudioProcessingConfig, VADPlugin } from "../types.js";
+
+// Inline AudioWorklet processor for Energy VAD
+const energyVadWorkletCode = `
+class EnergyVadProcessor extends AudioWorkletProcessor {
+  constructor() {
+    super();
+    this.smoothing = 0.95;
+    this.energy = 0;
+    this.noiseFloor = 0.001;
+  }
+
+  process(inputs, outputs, parameters) {
+    const input = inputs[0];
+    if (!input || !input.length) return true;
+    const channel = input[0];
+
+    // Calculate RMS
+    let sum = 0;
+    for (let i = 0; i < channel.length; i++) {
+      sum += channel[i] * channel[i];
+    }
+    const rms = Math.sqrt(sum / channel.length);
+
+    // Simple adaptive noise floor (very basic)
+    if (rms < this.noiseFloor) {
+      this.noiseFloor = this.noiseFloor * 0.99 + rms * 0.01;
+    } else {
+      this.noiseFloor = this.noiseFloor * 0.999 + rms * 0.001;
+    }
+
+    // Calculate "probability" based on SNR
+    // This is a heuristic mapping from energy to 0-1
+    const snr = rms / (this.noiseFloor + 1e-6);
+    const probability = Math.min(1, Math.max(0, (snr - 1.5) / 10)); // Arbitrary scaling
+
+    this.port.postMessage({ probability });
+
+    return true;
+  }
+}
+registerProcessor('energy-vad-processor', EnergyVadProcessor);
+`;
+
+export class EnergyVADPlugin implements VADPlugin {
+  name = "energy-vad";
+
+  async createNode(
+    context: AudioContext,
+    config: AudioProcessingConfig["vad"],
+    onDecision: (probability: number) => void,
+  ): Promise<AudioNode> {
+    // 1. Create Worklet
+    const blob = new Blob([energyVadWorkletCode], {
+      type: "application/javascript",
+    });
+    const url = URL.createObjectURL(blob);
+
+    try {
+      await context.audioWorklet.addModule(url);
+    } catch (e) {
+      console.warn("Failed to add Energy VAD worklet:", e);
+      throw e;
+    } finally {
+      URL.revokeObjectURL(url);
+    }
+
+    // 2. Create Node
+    const node = new AudioWorkletNode(context, "energy-vad-processor");
+
+    node.port.onmessage = (event) => {
+      const { probability } = event.data;
+      onDecision(probability);
+    };
+
+    return node;
+  }
+}
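Note: to make the SNR heuristic above concrete, here is the same arithmetic pulled out as a standalone function with a few worked values (same constants as the worklet):

```ts
// Same mapping as EnergyVadProcessor, isolated for illustration.
function energyProbability(rms: number, noiseFloor: number): number {
  const snr = rms / (noiseFloor + 1e-6);
  return Math.min(1, Math.max(0, (snr - 1.5) / 10));
}

// With the initial noise floor of 0.001:
energyProbability(0.001, 0.001); // snr ~= 1   -> 0     (at the floor: silence)
energyProbability(0.01, 0.001);  // snr ~= 10  -> ~0.85 (well above the floor)
energyProbability(0.12, 0.001);  // snr >= 11.5 -> 1    (clamped: clearly speech)
```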
package/src/vad/vad-state.ts
ADDED
@@ -0,0 +1,71 @@
+import type { AudioProcessingConfig, VADState } from "../types.js";
+
+export class VADStateMachine {
+  private config: Required<NonNullable<AudioProcessingConfig["vad"]>>;
+  private currentState: VADState["state"] = "silent";
+  private lastSpeechTime = 0;
+  private speechStartTime = 0;
+  private frameDurationMs = 20; // Assumed frame duration, updated by calls
+
+  constructor(config: AudioProcessingConfig["vad"]) {
+    this.config = {
+      enabled: config?.enabled ?? true,
+      pluginName: config?.pluginName ?? "energy-vad",
+      startThreshold: config?.startThreshold ?? 0.5,
+      stopThreshold: config?.stopThreshold ?? 0.4,
+      hangoverMs: config?.hangoverMs ?? 300,
+      preRollMs: config?.preRollMs ?? 200,
+    };
+  }
+
+  updateConfig(config: Partial<AudioProcessingConfig["vad"]>) {
+    this.config = { ...this.config, ...config };
+  }
+
+  processFrame(probability: number, timestamp: number): VADState {
+    const { startThreshold, stopThreshold, hangoverMs } = this.config;
+
+    let newState = this.currentState;
+
+    if (
+      this.currentState === "silent" ||
+      this.currentState === "speech_ending"
+    ) {
+      if (probability >= startThreshold) {
+        newState = "speech_starting";
+        this.speechStartTime = timestamp;
+        this.lastSpeechTime = timestamp;
+      } else {
+        newState = "silent";
+      }
+    } else if (
+      this.currentState === "speech_starting" ||
+      this.currentState === "speaking"
+    ) {
+      if (probability >= stopThreshold) {
+        newState = "speaking";
+        this.lastSpeechTime = timestamp;
+      } else {
+        // Check hangover
+        const timeSinceSpeech = timestamp - this.lastSpeechTime;
+        if (timeSinceSpeech < hangoverMs) {
+          newState = "speaking"; // Still in hangover
+        } else {
+          newState = "speech_ending";
+        }
+      }
+    }
+
+    // Transition from starting/ending to stable states
+    if (newState === "speech_starting") newState = "speaking";
+    if (newState === "speech_ending") newState = "silent";
+
+    this.currentState = newState;
+
+    return {
+      isSpeaking: newState === "speaking",
+      probability,
+      state: newState,
+    };
+  }
+}
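Note: a small sketch exercising the hysteresis and hangover above with synthetic 20 ms frames, using the constructor defaults (start 0.5, stop 0.4, hangover 300 ms):

```ts
import { VADStateMachine } from "@tensamin/audio";

const vad = new VADStateMachine({ enabled: true });

// Feed synthetic frames 20 ms apart. A probability of 0.45 sits between
// the stop (0.4) and start (0.5) thresholds, so it sustains speech but
// can never start it: that is the hysteresis.
const frames = [0.1, 0.6, 0.45, 0.45, 0.1, 0.1];
frames.forEach((p, i) => {
  const state = vad.processFrame(p, i * 20);
  console.log(p, "->", state.state);
});
// 0.1 -> silent; 0.6 -> speaking; 0.45 -> speaking (above stop threshold);
// the trailing 0.1 frames stay "speaking" until 300 ms of hangover elapse.
```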
package/tsconfig.json
ADDED
@@ -0,0 +1,29 @@
+{
+  "compilerOptions": {
+    // Environment setup & latest features
+    "lib": ["ESNext", "DOM", "DOM.Iterable"],
+    "target": "ESNext",
+    "module": "Preserve",
+    "moduleDetection": "force",
+    "jsx": "react-jsx",
+    "allowJs": true,
+
+    // Bundler mode
+    "moduleResolution": "bundler",
+    "allowImportingTsExtensions": true,
+    "verbatimModuleSyntax": true,
+    "noEmit": true,
+
+    // Best practices
+    "strict": true,
+    "skipLibCheck": true,
+    "noFallthroughCasesInSwitch": true,
+    "noUncheckedIndexedAccess": true,
+    "noImplicitOverride": true,
+
+    // Some stricter flags (disabled by default)
+    "noUnusedLocals": false,
+    "noUnusedParameters": false,
+    "noPropertyAccessFromIndexSignature": false
+  }
+}