@newgameplusinc/odyssey-audio-video-sdk-dev 1.0.56 → 1.0.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -255
- package/dist/index.d.ts +0 -15
- package/dist/index.js +1 -67
- package/package.json +3 -4
- package/dist/MLNoiseSuppressor.d.ts +0 -76
- package/dist/MLNoiseSuppressor.js +0 -439
- package/dist/UltimateMLNoiseSuppressor.d.ts +0 -74
- package/dist/UltimateMLNoiseSuppressor.js +0 -309
package/README.md
CHANGED

@@ -11,69 +11,12 @@ It mirrors the production SDK used by Odyssey V2 and ships ready-to-drop into an
 ## Feature Highlights
 - 🔌 **One class to rule it all** – `OdysseySpatialComms` wires transports, producers, consumers, and room state.
 - 🧭 **Accurate pose propagation** – `updatePosition()` streams listener pose to the SFU while `participant-position-updated` keeps the local store in sync.
-- 🤖 **AI-Powered Noise Suppression** – Deep learning model (TensorFlow.js) runs client-side to remove background noise BEFORE audio reaches MediaSoup. Uses trained LSTM-based mask prediction for superior noise cancellation without affecting voice quality.
 - 🎧 **Studio-grade spatial audio** – each remote participant gets a dedicated Web Audio graph: denoiser → high-pass → low-pass → HRTF `PannerNode` → adaptive gain → master compressor. Uses Web Audio API's HRTF panning model for accurate left/right/front/back positioning based on distance and direction, with custom AudioWorklet processors for noise cancellation and voice tuning.
 - 🎥 **Camera-ready streams** – video tracks are exposed separately so UI layers can render muted `<video>` tags while audio stays inside Web Audio.
 - 🔁 **EventEmitter contract** – subscribe to `room-joined`, `consumer-created`, `participant-position-updated`, etc., without touching Socket.IO directly.

 ## Quick Start

-### With ML Noise Suppression (Recommended)
-
-```ts
-import {
-  OdysseySpatialComms,
-  Direction,
-  Position,
-} from "@newgameplusinc/odyssey-audio-video-sdk-dev";
-
-const sdk = new OdysseySpatialComms("https://mediasoup-server.example.com");
-
-// 1) Initialize ML noise suppression (place model files in public/models/)
-await sdk.initializeMLNoiseSuppression(
-  '/models/odyssey_noise_suppressor_v1/model.json'
-);
-
-// 2) Join a room
-await sdk.joinRoom({
-  roomId: "demo-room",
-  userId: "user-123",
-  deviceId: "device-123",
-  position: { x: 0, y: 0, z: 0 },
-  direction: { x: 0, y: 1, z: 0 },
-});
-
-// 3) Produce local media (ML cleaning applied automatically to audio)
-const stream = await navigator.mediaDevices.getUserMedia({
-  audio: {
-    echoCancellation: true,
-    noiseSuppression: false, // Disable browser NS, use ML instead!
-    autoGainControl: true,
-    sampleRate: 48000,
-  },
-  video: true
-});
-for (const track of stream.getTracks()) {
-  await sdk.produceTrack(track); // ML processes audio tracks automatically
-}
-
-// 4) Toggle ML noise suppression on/off
-sdk.toggleMLNoiseSuppression(true); // or false
-
-// 5) Handle remote tracks
-sdk.on("consumer-created", async ({ participant, track }) => {
-  if (track.kind === "video") {
-    attachVideo(track, participant.participantId);
-  }
-});
-
-// 6) Keep spatial audio honest
-sdk.updatePosition(currentPos, currentDir);
-sdk.setListenerFromLSD(listenerPos, cameraPos, lookAtPos);
-```
-
-### Without ML Noise Suppression (Legacy)
-
 ```ts
 import {
   OdysseySpatialComms,
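
For reference, a minimal sketch of the quick-start flow as it stands in 1.0.58, using only calls that survive this diff; `attachVideo` stands in for the host app's renderer, as in the original example.

```ts
import { OdysseySpatialComms } from "@newgameplusinc/odyssey-audio-video-sdk-dev";

declare function attachVideo(track: MediaStreamTrack, participantId: string): void; // host-app renderer

const sdk = new OdysseySpatialComms("https://mediasoup-server.example.com");

// Join, produce, and subscribe exactly as before, just without any ML setup step.
await sdk.joinRoom({
  roomId: "demo-room",
  userId: "user-123",
  deviceId: "device-123",
  position: { x: 0, y: 0, z: 0 },
  direction: { x: 0, y: 1, z: 0 },
});

const stream = await navigator.mediaDevices.getUserMedia({ audio: true, video: true });
for (const track of stream.getTracks()) {
  await sdk.produceTrack(track); // published as-is in 1.0.58
}

sdk.on("consumer-created", ({ participant, track }) => {
  if (track.kind === "video") attachVideo(track, participant.participantId);
});
```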
@@ -113,83 +56,23 @@ sdk.setListenerFromLSD(listenerPos, cameraPos, lookAtPos);
 ## Audio Flow (Server ↔ Browser)

 ```
-          │
-          │
-│ SDK: mediasoupManager.produce()          │
-│ (Sends clean track to server)            │
-└────────┬─────────────────────────────────┘
-         │
-         │ WebRTC/RTP
-         ▼
-┌─────────────────────────────────────────────┐
-│             SERVER-SIDE ROUTING             │
-└─────────────────────────────────────────────┘
-
-┌──────────────┐  update-position    ┌──────────────┐  route clean audio   ┌──────────────────┐
-│ Browser LSD  │ ──────────────────▶ │ MediaSoup SFU│ ───────────────────▶ │  Other Clients   │
-│ (Unreal data)│                     │ + Socket.IO  │                      │  (Receive RTP)   │
-└──────────────┘                     └──────┬───────┘                      └──────┬───────────┘
-                                            │                                    │
-                                            │ consumer-created event             │
-                                            ▼                                    ▼
-┌─────────────────────────────────────────────┐
-│            REMOTE AUDIO PLAYBACK            │
-└─────────────────────────────────────────────┘
-
-┌──────────────────┐
-│  SDK Event Bus   │
-│  (EventManager)  │
-└────────┬─────────┘
-         │ track + pose
-         ▼
-┌──────────────────┐
-│ SpatialAudioMgr  │
-│ (Web Audio API)  │
-│ • Denoiser       │◀─── Traditional noise reduction
-│ • HP/LP Filters  │     (runs on received audio)
-│ • HRTF Panner    │
-│ • Distance Gain  │
-│ • Compressor     │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│ Web Audio Graph  │
-└────────┬─────────┘
-         │
-         ▼
-  Listener ears (Left/Right)
-         │
-         ▼
-    System Output
-```
-
-### ML Noise Suppression Pipeline (Client-Side)
-```
-Mic → getUserMedia()
-        ↓
-Vue: sdk.produceTrack(audioTrack)
-        ↓
-SDK: mlNoiseSuppressor.processMediaStream()  [TensorFlow.js runs here]
-        ↓
-SDK: mediasoupManager.produce(cleanTrack)
-        ↓
-MediaSoup Server → Other participants hear clean audio ✅
+┌──────────────┐  update-position    ┌──────────────┐   pose + tracks    ┌──────────────────┐
+│ Browser LSD  │ ──────────────────▶ │ MediaSoup SFU│ ─────────────────▶ │  SDK Event Bus   │
+│ (Unreal data)│                     │ + Socket.IO  │                    │  (EventManager)  │
+└──────┬───────┘                     └──────┬───────┘                    └─────────┬────────┘
+       │                                    │                                     │ track + pose
+       │                                    │                                     ▼
+       │                           ┌────────▼────────┐                   ┌──────────────────┐
+       │  audio RTP                │ consumer-created│                   │ SpatialAudioMgr  │
+       └──────────────────────────▶│ setup per-user  │◀──────────────────│ (Web Audio API)  │
+                                   └────────┬────────┘                   │ - Denoiser       │
+                                            │                            │ - HP / LP        │
+                                            │                            │ - HRTF Panner    │
+                                            ▼                            │ - Gain + Comp    │
+                                     Web Audio Graph                     └─────────┬────────┘
+                                            │                                      │
+                                            ▼                                      ▼
+                              Listener ears (Left/Right)                    System Output
 ```

 ### Web Audio Algorithms
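
The per-participant chain the new diagram names (Denoiser → HP/LP → HRTF Panner → Gain + Comp) maps directly onto standard Web Audio nodes. A sketch of one such chain, with the SDK's custom AudioWorklet denoiser omitted and all filter values illustrative rather than taken from `SpatialAudioManager`:

```ts
// Minimal per-participant spatial chain from standard Web Audio nodes.
// The SDK's custom denoiser worklet is omitted; cutoffs are illustrative.
function buildSpatialChain(ctx: AudioContext, track: MediaStreamTrack): PannerNode {
  const source = ctx.createMediaStreamSource(new MediaStream([track]));

  const highPass = ctx.createBiquadFilter();
  highPass.type = "highpass";
  highPass.frequency.value = 80;    // trim rumble below the voice band

  const lowPass = ctx.createBiquadFilter();
  lowPass.type = "lowpass";
  lowPass.frequency.value = 8000;   // trim hiss above the voice band

  const panner = new PannerNode(ctx, {
    panningModel: "HRTF",           // head-related transfer functions give natural L/R cues
    distanceModel: "inverse",
  });

  const gain = ctx.createGain();                      // distance-aware gain, updated per pose event
  const compressor = ctx.createDynamicsCompressor();  // master level safety

  source
    .connect(highPass)
    .connect(lowPass)
    .connect(panner)
    .connect(gain)
    .connect(compressor)
    .connect(ctx.destination);

  return panner; // caller updates panner.positionX/Y/Z on participant-position-updated
}
```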
@@ -236,121 +119,7 @@ These layers run entirely in Web Audio, so you can ship “AirPods-style” back
 3. **Position + direction updates** – every `participant-position-updated` event calls `updateSpatialAudio(participantId, position, direction)`. The position feeds the panner’s XYZ, while the direction vector sets the source orientation so voices project forward relative to avatar facing.
 4. **Distance-aware gain** – the manager stores the latest listener pose and computes the Euclidean distance to each remote participant on every update. A custom rolloff curve adjusts gain before the compressor, giving the “someone on my left / far away” perception without blowing out master levels.
 5. **Left/right rendering** – because the panner uses `panningModel = "HRTF"`, browsers feed the processed signal into the user’s audio hardware with head-related transfer functions, producing natural interaural time/intensity differences.
-## ML Noise Suppression (Deep Learning Pre-Processing)
-
-**NEW:** The SDK now includes an optional **AI-powered noise suppression** layer that runs **BEFORE** audio reaches MediaSoup, using a trained TensorFlow.js model.
-
-### Why ML Noise Suppression?
-- **Superior noise removal** – Deep learning models learn complex noise patterns that traditional DSP can't handle (keyboard typing, paper rustling, traffic, etc.)
-- **Voice preservation** – LSTM-based mask prediction preserves natural voice quality while removing background noise
-- **Client-side processing** – Runs entirely in the browser using TensorFlow.js (WebGL/WebAssembly acceleration)
-- **Privacy-first** – Audio never leaves the user's device; processing happens locally
-- **Zero latency** – <10ms processing time per frame, suitable for real-time communication
-
-### Architecture
-```
-Raw Mic Audio → ML Model (TF.js) → Clean Audio → MediaSoup → Traditional Denoiser → Spatial Audio
-```
-
-The ML model applies **mask-based spectral subtraction** trained on diverse noise datasets:
-1. Extracts mel-spectrogram from raw audio
-2. Predicts a noise mask (0-1 per frequency bin) using Bidirectional LSTM
-3. Applies mask to remove noise while preserving speech
-4. Reconstructs clean audio waveform
-
-### Setup ML Noise Suppression
-
-**1. Place Model Files:**
-```
-YourApp/public/models/odyssey_noise_suppressor_v1/
-├── model.json                 # TF.js model architecture
-├── group1-shard*.bin          # Model weights (multiple files)
-├── normalization_stats.json   # Preprocessing parameters
-└── model_config.json          # Audio config (48kHz, n_mels, etc.)
-```
-
-**2. Initialize in Code:**
-```ts
-const sdk = new OdysseySpatialComms('wss://your-server.com');
-
-// Initialize ML noise suppression
-try {
-  await sdk.initializeMLNoiseSuppression(
-    '/models/odyssey_noise_suppressor_v1/model.json'
-  );
-  console.log('✅ ML Noise Suppression enabled');
-} catch (error) {
-  console.error('ML initialization failed:', error);
-  // Graceful degradation - SDK continues without ML
-}
-
-// Produce audio tracks (ML cleaning applied automatically)
-const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
-await sdk.produceTrack(stream.getAudioTracks()[0]);
-
-// Toggle ML on/off at runtime
-sdk.toggleMLNoiseSuppression(false); // Disable
-sdk.toggleMLNoiseSuppression(true);  // Re-enable
-
-// Check ML status
-if (sdk.isMLNoiseSuppressionEnabled()) {
-  console.log('ML is active');
-}
-```
-
-**3. Recommended Audio Constraints:**
-```ts
-const stream = await navigator.mediaDevices.getUserMedia({
-  audio: {
-    echoCancellation: true,   // Keep echo cancellation
-    noiseSuppression: false,  // Disable browser NS (ML replaces it)
-    autoGainControl: true,    // Keep AGC
-    sampleRate: 48000,        // Match model training (48kHz)
-  },
-});
-```
-
-### ML Model Details
-- **Architecture:** Bidirectional LSTM (2 layers, 256 units) + Dense layers
-- **Input:** 48kHz audio → Mel-spectrogram (128 bins, 8-frame sequences)
-- **Output:** Time-frequency mask (0-1 values per bin)
-- **Latency:** ~5-8ms per chunk (AudioWorklet processing)
-- **Model Size:** ~2-3 MB (quantized to uint8)
-- **Training:** LibriSpeech (clean speech) + AudioSet (noise) datasets
-
-### When to Use ML vs Traditional Denoiser
-
-| Feature | ML Noise Suppression | Traditional Denoiser (AudioWorklet) |
-|---------|---------------------|-------------------------------------|
-| **Noise Types** | Complex (keyboard, traffic, music) | Stationary (fan, HVAC, hiss) |
-| **Voice Quality** | Excellent (learned patterns) | Good (spectral shaping) |
-| **CPU Usage** | Medium (TF.js optimized) | Low (simple DSP) |
-| **Latency** | ~5-8ms | ~1-2ms |
-| **Use Case** | Noisy environments | Quiet rooms with constant noise |
-
-**Best Practice:** Enable **both** for maximum quality:
-- ML suppresses complex noise (pre-MediaSoup)
-- Traditional denoiser handles residual stationary noise (post-receive)
-
-### Troubleshooting
-
-**Model fails to load:**
-- Ensure model files are served as static assets (check browser Network tab)
-- Verify CORS headers if serving from CDN
-- Check browser console for TensorFlow.js errors
-
-**High CPU usage:**
-- TF.js automatically uses WebGL when available (much faster)
-- Disable ML on low-end devices: `sdk.toggleMLNoiseSuppression(false)`
-
-**Voice sounds muffled:**
-- Model trained on 48kHz audio; ensure mic uses same sample rate
-- Check if browser is downsampling to 16kHz (some mobile browsers do this)

-**Doesn't remove all noise:**
-- ML works best on noise types seen during training
-- Combine with traditional denoiser for residual cleanup
-- Extremely loud noise (>30 dB SNR) may leak through
 ## Video Flow (Capture ↔ Rendering)

 ```
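
Step 4's distance-aware gain boils down to a rolloff curve over the listener-to-speaker distance. A hedged sketch of one plausible curve; the SDK's actual rolloff is internal to `SpatialAudioManager` and may differ, so the constants here are illustrative:

```ts
// Illustrative distance rolloff in the spirit of step 4 above.
interface Vec3 { x: number; y: number; z: number; }

function distanceGain(listener: Vec3, speaker: Vec3, refDistance = 1, maxDistance = 30): number {
  const dx = speaker.x - listener.x;
  const dy = speaker.y - listener.y;
  const dz = speaker.z - listener.z;
  const d = Math.sqrt(dx * dx + dy * dy + dz * dz); // Euclidean distance per update

  if (d <= refDistance) return 1;    // full level inside the reference radius
  if (d >= maxDistance) return 0.05; // distant voices stay faintly audible, never blown out
  return refDistance / d;            // inverse rolloff, normalized to 1 at refDistance
}

// e.g. gainNode.gain.setTargetAtTime(distanceGain(me, them), ctx.currentTime, 0.05);
```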
@@ -368,19 +137,17 @@ const stream = await navigator.mediaDevices.getUserMedia({
 ```

 ## Core Classes
-- `src/index.ts` – `OdysseySpatialComms` (socket lifecycle, producers/consumers, event surface
+- `src/index.ts` – `OdysseySpatialComms` (socket lifecycle, producers/consumers, event surface).
 - `src/MediasoupManager.ts` – transport helpers for produce/consume/resume.
 - `src/SpatialAudioManager.ts` – Web Audio orchestration (listener transforms, per-participant chains, denoiser, distance math).
-- `src/MLNoiseSuppressor.ts` – TensorFlow.js-based deep learning noise suppression (mel-spectrogram extraction, LSTM inference, mask application).
 - `src/EventManager.ts` – lightweight EventEmitter used by the entire SDK.

 ## Integration Checklist
 1. **Instantiate once** per page/tab and keep it in a store (Vuex, Redux, Zustand, etc.).
-2. **
-3. **
-4. **
-5. **
-6. **Monitor logs** – browser console shows `🎧 SDK`, `📍 SDK`, `🎚️ [Spatial Audio]`, and `🎤 ML` statements for every critical hop.
+2. **Pipe LSD/Lap data** from your rendering engine into `updatePosition()` + `setListenerFromLSD()` at ~10 Hz.
+3. **Render videos muted** – never attach remote audio tracks straight to DOM; let `SpatialAudioManager` own playback.
+4. **Push avatar telemetry back to Unreal** so `remoteSpatialData` can render minimaps/circles (see Odyssey V2 `sendMediaSoupParticipantsToUnreal`).
+5. **Monitor logs** – browser console shows `🎧 SDK`, `📍 SDK`, and `🎚️ [Spatial Audio]` statements for every critical hop.

 ## Server Contract (Socket.IO events)
 | Event | Direction | Payload |
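
Checklist item 2 in code: a hedged sketch of a ~10 Hz pose pump, where `getEnginePose()` is a placeholder for however the host app reads LSD/Lap data and `sdk` is the instance from the quick start:

```ts
// Forward engine pose to the SDK at ~10 Hz (checklist item 2).
// getEnginePose() is hypothetical; wire it to your rendering engine's pose source.
declare function getEnginePose(): {
  position: { x: number; y: number; z: number };
  direction: { x: number; y: number; z: number };
  camera: { x: number; y: number; z: number };
  lookAt: { x: number; y: number; z: number };
};

const POSE_INTERVAL_MS = 100; // ~10 Hz

const poseTimer = setInterval(() => {
  const pose = getEnginePose();
  sdk.updatePosition(pose.position, pose.direction);
  sdk.setListenerFromLSD(pose.position, pose.camera, pose.lookAt);
}, POSE_INTERVAL_MS);

// Remember to clearInterval(poseTimer) when leaving the room.
```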
package/dist/index.d.ts
CHANGED

@@ -10,8 +10,6 @@ export declare class OdysseySpatialComms extends EventManager {
     private localParticipant;
     private mediasoupManager;
     private spatialAudioManager;
-    private mlNoiseSuppressor;
-    private mlNoiseSuppressionEnabled;
     constructor(serverUrl: string, spatialOptions?: SpatialAudioOptions);
     on(event: OdysseyEvent, listener: (...args: any[]) => void): this;
     emit(event: OdysseyEvent, ...args: any[]): boolean;
@@ -30,19 +28,6 @@ export declare class OdysseySpatialComms extends EventManager {
     leaveRoom(): void;
     resumeAudio(): Promise<void>;
     getAudioContextState(): AudioContextState;
-    /**
-     * Initialize ML noise suppression
-     * @param modelUrl - URL to model.json (e.g., '/models/odyssey_noise_suppressor_v1/model.json')
-     */
-    initializeMLNoiseSuppression(modelUrl: string): Promise<void>;
-    /**
-     * Toggle ML noise suppression on/off
-     */
-    toggleMLNoiseSuppression(enabled: boolean): void;
-    /**
-     * Check if ML noise suppression is enabled
-     */
-    isMLNoiseSuppressionEnabled(): boolean;
     produceTrack(track: MediaStreamTrack, appData?: {
         isScreenshare?: boolean;
     }): Promise<any>;
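
With these three methods gone from the public surface, a 1.0.56 caller upgrading to 1.0.58 should delete the calls and, plausibly, hand noise suppression back to the browser (the removed docs had disabled it in the ML layer's favor). A sketch:

```ts
// Migration sketch for 1.0.56 → 1.0.58: remove initializeMLNoiseSuppression /
// toggleMLNoiseSuppression / isMLNoiseSuppressionEnabled calls, then re-enable
// the browser's built-in noise suppression that the old docs turned off.
const stream = await navigator.mediaDevices.getUserMedia({
  audio: {
    echoCancellation: true,
    noiseSuppression: true,  // was `false` while the SDK's ML layer replaced it
    autoGainControl: true,
    sampleRate: 48000,
  },
  video: true,
});
```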
package/dist/index.js
CHANGED

@@ -5,14 +5,11 @@ const socket_io_client_1 = require("socket.io-client");
 const EventManager_1 = require("./EventManager");
 const MediasoupManager_1 = require("./MediasoupManager");
 const SpatialAudioManager_1 = require("./SpatialAudioManager");
-const MLNoiseSuppressor_1 = require("./MLNoiseSuppressor");
 class OdysseySpatialComms extends EventManager_1.EventManager {
     constructor(serverUrl, spatialOptions) {
         super(); // Initialize the EventEmitter base class
         this.room = null;
         this.localParticipant = null;
-        this.mlNoiseSuppressor = null;
-        this.mlNoiseSuppressionEnabled = false;
         this.socket = (0, socket_io_client_1.io)(serverUrl, {
             transports: ["websocket"],
         });
@@ -104,71 +101,8 @@ class OdysseySpatialComms extends EventManager_1.EventManager {
     getAudioContextState() {
         return this.spatialAudioManager.getAudioContextState();
     }
-    /**
-     * Initialize ML noise suppression
-     * @param modelUrl - URL to model.json (e.g., '/models/odyssey_noise_suppressor_v1/model.json')
-     */
-    async initializeMLNoiseSuppression(modelUrl) {
-        if (this.mlNoiseSuppressor) {
-            console.log('ML Noise Suppression already initialized');
-            return;
-        }
-        try {
-            console.log('🎤 Initializing ML Noise Suppression...');
-            this.mlNoiseSuppressor = new MLNoiseSuppressor_1.MLNoiseSuppressor();
-            await this.mlNoiseSuppressor.initialize(modelUrl, this.spatialAudioManager.getAudioContext());
-            this.mlNoiseSuppressionEnabled = true;
-            console.log('✅ ML Noise Suppression enabled');
-        }
-        catch (error) {
-            console.error('❌ Failed to initialize ML Noise Suppression:', error);
-            this.mlNoiseSuppressor = null;
-            this.mlNoiseSuppressionEnabled = false;
-            throw error;
-        }
-    }
-    /**
-     * Toggle ML noise suppression on/off
-     */
-    toggleMLNoiseSuppression(enabled) {
-        if (!this.mlNoiseSuppressor) {
-            console.warn('ML Noise Suppression not initialized. Call initializeMLNoiseSuppression() first.');
-            return;
-        }
-        this.mlNoiseSuppressionEnabled = enabled;
-        console.log(`🎤 ML Noise Suppression: ${enabled ? 'ON' : 'OFF'}`);
-    }
-    /**
-     * Check if ML noise suppression is enabled
-     */
-    isMLNoiseSuppressionEnabled() {
-        return this.mlNoiseSuppressionEnabled && this.mlNoiseSuppressor !== null;
-    }
     async produceTrack(track, appData) {
-
-        let processedTrack = track;
-        // Apply ML noise suppression to audio BEFORE sending to MediaSoup
-        if (track.kind === 'audio' && this.mlNoiseSuppressionEnabled && this.mlNoiseSuppressor) {
-            try {
-                console.log('🎤 [SDK] Applying ML noise suppression to audio...');
-                const inputStream = new MediaStream([track]);
-                console.log('🎤 [SDK] Created input stream with track');
-                const cleanedStream = await this.mlNoiseSuppressor.processMediaStream(inputStream);
-                console.log('🎤 [SDK] Got cleaned stream from ML');
-                processedTrack = cleanedStream.getAudioTracks()[0];
-                console.log(`✅ [SDK] ML noise suppression applied - processed track state: ${processedTrack.readyState}`);
-            }
-            catch (error) {
-                console.error('❌ [SDK] ML noise suppression failed, using original track:', error);
-                processedTrack = track; // Fallback to original track
-            }
-        }
-        else {
-            console.log(`ℹ️ [SDK] Skipping ML - kind: ${track.kind}, ML enabled: ${this.mlNoiseSuppressionEnabled}`);
-        }
-        console.log(`📤 [SDK] Producing track to MediaSoup - kind: ${processedTrack.kind}, state: ${processedTrack.readyState}`);
-        const producer = await this.mediasoupManager.produce(processedTrack, appData);
-        console.log(`✅ [SDK] Producer created - id: ${producer.id}, kind: ${producer.kind}`);
+        const producer = await this.mediasoupManager.produce(track, appData);
         if (this.localParticipant) {
             const isFirstProducer = this.localParticipant.producers.size === 0;
             this.localParticipant.producers.set(producer.id, producer);
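
`produceTrack` now publishes the given track untouched. An app that still wants a cleanup stage can reproduce the removed wrapper's shape outside the SDK: route the mic through its own Web Audio graph and produce the destination track. A sketch using only standard Web Audio; the filter stage is a stand-in for whatever processing the app owns:

```ts
// Since 1.0.58 produceTrack() sends the track as-is, so any custom pre-processing
// has to happen app-side before calling it.
async function produceProcessedAudio(
  sdk: { produceTrack(track: MediaStreamTrack): Promise<unknown> },
  micTrack: MediaStreamTrack,
  ctx: AudioContext,
): Promise<void> {
  const source = ctx.createMediaStreamSource(new MediaStream([micTrack]));
  const destination = ctx.createMediaStreamDestination();

  // Stand-in processing stage; swap in any filter graph or worklet you own.
  const highPass = ctx.createBiquadFilter();
  highPass.type = "highpass";
  highPass.frequency.value = 80;

  source.connect(highPass).connect(destination);
  await sdk.produceTrack(destination.stream.getAudioTracks()[0]);
}
```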
package/package.json
CHANGED

@@ -1,7 +1,7 @@
 {
   "name": "@newgameplusinc/odyssey-audio-video-sdk-dev",
-  "version": "1.0.56",
-  "description": "Odyssey Spatial Audio & Video SDK using MediaSoup for real-time communication",
+  "version": "1.0.58",
+  "description": "Odyssey Spatial Audio & Video SDK using MediaSoup for real-time communication",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
   "scripts": {
@@ -31,8 +31,7 @@
     "socket.io-client": "^4.7.2",
     "webrtc-adapter": "^8.2.3",
     "mediasoup-client": "^3.6.90",
-    "events": "^3.3.0",
-    "@tensorflow/tfjs": "^4.22.0"
+    "events": "^3.3.0"
   },
   "devDependencies": {
     "@types/node": "^20.0.0",
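
Dropping `@tensorflow/tfjs` from `dependencies` means any app that kept its own TF.js model must now declare the package itself. A hedged sketch of loading it lazily app-side; `loadOwnModel` is illustrative, not an SDK API:

```ts
// With @tensorflow/tfjs gone from the SDK's dependencies, an app that still runs
// its own model adds the package to its own package.json and loads it itself.
// Lazy import keeps the sizeable library out of the critical bundle path.
async function loadOwnModel(modelUrl: string) {
  const tf = await import("@tensorflow/tfjs"); // app-level dependency, not the SDK's
  return tf.loadLayersModel(modelUrl);         // same loader the removed code used
}
```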
package/dist/MLNoiseSuppressor.d.ts
REMOVED

@@ -1,76 +0,0 @@
-/**
- * ML-Based Noise Suppressor for Odyssey MediaSoup SDK
- * Uses trained TensorFlow.js BiLSTM model for real-time noise suppression
- *
- * Architecture: BiLSTM (256 units x 2) + Dense layers
- * Input: Mel-spectrogram features (16 frames x 128 mels)
- * Output: Noise suppression mask (0-1 per frequency bin)
- *
- * Trained on: LibriSpeech + UrbanSound8K + MS-SNSD datasets
- * Performance: val_loss=0.038, SNR improvement ~12dB
- */
-export declare class MLNoiseSuppressor {
-    private model;
-    private config;
-    private normStats;
-    private audioContext;
-    private isInitialized;
-    private processingNode;
-    private highPassFilter;
-    private frameBuffer;
-    private prevMask;
-    private readonly SMOOTHING_ALPHA;
-    private melFilterbank;
-    private fftSize;
-    private hannWindow;
-    /**
-     * Initialize the ML noise suppressor
-     * @param modelUrl URL to the model.json file
-     * @param audioContext Web Audio API AudioContext
-     */
-    initialize(modelUrl: string, audioContext: AudioContext): Promise<void>;
-    /**
-     * Create Hann window for FFT
-     */
-    private createHannWindow;
-    /**
-     * Create mel filterbank matrix
-     */
-    private createMelFilterbank;
-    /**
-     * Compute FFT magnitude spectrum (optimized DFT for real-time)
-     */
-    private computeFFT;
-    /**
-     * Compute mel-spectrogram features from audio frame
-     */
-    private computeMelFeatures;
-    /**
-     * Process audio buffer with ML noise suppression
-     * @param inputBuffer Audio buffer to process (Float32Array)
-     * @returns Processed audio buffer
-     */
-    processAudio(inputBuffer: Float32Array): Promise<Float32Array>;
-    /**
-     * Apply temporal smoothing to reduce artifacts (Apple-style)
-     */
-    private applyTemporalSmoothing;
-    /**
-     * Apply mask with voice frequency preservation
-     */
-    private applyMaskWithVoicePreservation;
-    /**
-     * Process MediaStream with ML noise suppression
-     * @param inputStream MediaStream to process
-     * @returns Cleaned MediaStream
-     */
-    processMediaStream(inputStream: MediaStream): Promise<MediaStream>;
-    /**
-     * Cleanup resources
-     */
-    dispose(): void;
-    /**
-     * Check if initialized
-     */
-    isReady(): boolean;
-}
package/dist/MLNoiseSuppressor.js
REMOVED

@@ -1,439 +0,0 @@
-"use strict";
-/**
- * ML-Based Noise Suppressor for Odyssey MediaSoup SDK
- * Uses trained TensorFlow.js BiLSTM model for real-time noise suppression
- *
- * Architecture: BiLSTM (256 units x 2) + Dense layers
- * Input: Mel-spectrogram features (16 frames x 128 mels)
- * Output: Noise suppression mask (0-1 per frequency bin)
- *
- * Trained on: LibriSpeech + UrbanSound8K + MS-SNSD datasets
- * Performance: val_loss=0.038, SNR improvement ~12dB
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-        desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.MLNoiseSuppressor = void 0;
-const tf = __importStar(require("@tensorflow/tfjs"));
-class MLNoiseSuppressor {
-    constructor() {
-        this.model = null;
-        this.config = null;
-        this.normStats = null;
-        this.audioContext = null;
-        this.isInitialized = false;
-        // Real-time processing state
-        this.processingNode = null;
-        this.highPassFilter = null;
-        // Frame buffer for sequence-based processing
-        this.frameBuffer = [];
-        this.prevMask = null;
-        // Temporal smoothing (CRITICAL for quality - like Apple!)
-        this.SMOOTHING_ALPHA = 0.85; // Higher = smoother transitions
-        // Mel filterbank cache
-        this.melFilterbank = null;
-        this.fftSize = 512;
-        // FFT workspace
-        this.hannWindow = null;
-    }
-    /**
-     * Initialize the ML noise suppressor
-     * @param modelUrl URL to the model.json file
-     * @param audioContext Web Audio API AudioContext
-     */
-    async initialize(modelUrl, audioContext) {
-        console.log("🚀 Initializing ML Noise Suppressor (BiLSTM v2)...");
-        this.audioContext = audioContext;
-        try {
-            // Load model
-            console.log(`📂 Loading model from ${modelUrl}`);
-            this.model = await tf.loadLayersModel(modelUrl);
-            console.log("✅ Model loaded successfully");
-            console.log(`   Parameters: ${this.model.countParams().toLocaleString()}`);
-            // Load config
-            const baseUrl = modelUrl.substring(0, modelUrl.lastIndexOf("/"));
-            const configUrl = `${baseUrl}/model_config.json`;
-            const configResponse = await fetch(configUrl);
-            this.config = await configResponse.json();
-            console.log("⚙️ Config loaded:", this.config);
-            // Load normalization stats
-            const normUrl = `${baseUrl}/normalization_stats.json`;
-            const normResponse = await fetch(normUrl);
-            this.normStats = await normResponse.json();
-            console.log(`📏 Normalization stats: mean=${this.normStats.mean.toFixed(4)}, std=${this.normStats.std.toFixed(4)}`);
-            // Initialize FFT workspace
-            this.fftSize = this.config.frame_size || 512;
-            this.hannWindow = this.createHannWindow(this.fftSize);
-            // Create mel filterbank
-            this.melFilterbank = this.createMelFilterbank(this.fftSize, this.config.sample_rate, this.config.n_mels, 20, // fmin
-            8000 // fmax for voice
-            );
-            this.isInitialized = true;
-            console.log("✅ ML Noise Suppressor initialized!");
-        }
-        catch (error) {
-            console.error("❌ Failed to initialize ML Noise Suppressor:", error);
-            throw error;
-        }
-    }
-    /**
-     * Create Hann window for FFT
-     */
-    createHannWindow(size) {
-        const window = new Float32Array(size);
-        for (let i = 0; i < size; i++) {
-            window[i] = 0.5 * (1 - Math.cos((2 * Math.PI * i) / (size - 1)));
-        }
-        return window;
-    }
-    /**
-     * Create mel filterbank matrix
-     */
-    createMelFilterbank(fftSize, sampleRate, nMels, fmin, fmax) {
-        const nFft = Math.floor(fftSize / 2) + 1;
-        // Convert Hz to Mel scale
-        const hzToMel = (hz) => 2595 * Math.log10(1 + hz / 700);
-        const melToHz = (mel) => 700 * (Math.pow(10, mel / 2595) - 1);
-        const melMin = hzToMel(fmin);
-        const melMax = hzToMel(fmax);
-        // Create mel center frequencies
-        const melPoints = [];
-        for (let i = 0; i < nMels + 2; i++) {
-            melPoints.push(melMin + ((melMax - melMin) * i) / (nMels + 1));
-        }
-        // Convert to Hz
-        const hzPoints = melPoints.map(melToHz);
-        // Convert to FFT bins
-        const binPoints = hzPoints.map((hz) => Math.floor(((fftSize + 1) * hz) / sampleRate));
-        // Create triangular filterbank
-        const filterbank = [];
-        for (let m = 0; m < nMels; m++) {
-            const filter = new Float32Array(nFft);
-            const left = binPoints[m];
-            const center = binPoints[m + 1];
-            const right = binPoints[m + 2];
-            // Rising slope
-            for (let k = left; k < center && k < nFft; k++) {
-                filter[k] = (k - left) / (center - left);
-            }
-            // Falling slope
-            for (let k = center; k < right && k < nFft; k++) {
-                filter[k] = (right - k) / (right - center);
-            }
-            filterbank.push(filter);
-        }
-        return filterbank;
-    }
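
The filterbank above uses the common 2595·log10(1 + f/700) mel mapping. A quick sanity check of those formulas (worked numbers, not from the package):

```ts
// Sanity check for the mel mapping used above: 1000 Hz sits at ~1000 mel by construction.
const hzToMel = (hz: number) => 2595 * Math.log10(1 + hz / 700);
const melToHz = (mel: number) => 700 * (10 ** (mel / 2595) - 1);

console.log(hzToMel(1000).toFixed(1));          // ≈ 999.99 → "1000.0"
console.log(melToHz(hzToMel(440)).toFixed(1));  // round-trips to "440.0"
```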
-    /**
-     * Compute FFT magnitude spectrum (optimized DFT for real-time)
-     */
-    computeFFT(frame) {
-        const N = frame.length;
-        const magnitude = new Float32Array(Math.floor(N / 2) + 1);
-        // Apply Hann window
-        const windowed = new Float32Array(N);
-        for (let i = 0; i < N; i++) {
-            windowed[i] = frame[i] * (this.hannWindow?.[i] || 1);
-        }
-        // Compute DFT for positive frequencies only
-        for (let k = 0; k <= N / 2; k++) {
-            let real = 0;
-            let imag = 0;
-            const twoPiKOverN = (2 * Math.PI * k) / N;
-            for (let n = 0; n < N; n++) {
-                const angle = twoPiKOverN * n;
-                real += windowed[n] * Math.cos(angle);
-                imag -= windowed[n] * Math.sin(angle);
-            }
-            magnitude[k] = Math.sqrt(real * real + imag * imag);
-        }
-        return magnitude;
-    }
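
Despite its comment, `computeFFT` above is a direct O(N²) DFT, roughly 130k trig evaluations per 512-sample frame. A standard iterative radix-2 Cooley-Tukey FFT yields the same one-sided magnitude spectrum in O(N log N); a sketch, assuming power-of-two frame sizes:

```ts
// O(N log N) alternative to the direct DFT above (iterative radix-2 Cooley-Tukey,
// in-place; N must be a power of two). Returns the same one-sided magnitudes.
function fftMagnitude(frame: Float32Array): Float32Array {
  const N = frame.length;
  const re = Float64Array.from(frame);
  const im = new Float64Array(N);

  // Bit-reversal permutation.
  for (let i = 1, j = 0; i < N; i++) {
    let bit = N >> 1;
    for (; j & bit; bit >>= 1) j ^= bit;
    j |= bit;
    if (i < j) {
      [re[i], re[j]] = [re[j], re[i]];
      [im[i], im[j]] = [im[j], im[i]];
    }
  }

  // Butterfly passes over doubling block lengths.
  for (let len = 2; len <= N; len <<= 1) {
    const ang = (-2 * Math.PI) / len;
    for (let i = 0; i < N; i += len) {
      for (let k = 0; k < len / 2; k++) {
        const wr = Math.cos(ang * k), wi = Math.sin(ang * k);
        const ur = re[i + k], ui = im[i + k];
        const vr = re[i + k + len / 2] * wr - im[i + k + len / 2] * wi;
        const vi = re[i + k + len / 2] * wi + im[i + k + len / 2] * wr;
        re[i + k] = ur + vr;           im[i + k] = ui + vi;
        re[i + k + len / 2] = ur - vr; im[i + k + len / 2] = ui - vi;
      }
    }
  }

  // Keep positive frequencies only, matching the removed method's output shape.
  const magnitude = new Float32Array(N / 2 + 1);
  for (let k = 0; k <= N / 2; k++) magnitude[k] = Math.hypot(re[k], im[k]);
  return magnitude;
}
```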
-    /**
-     * Compute mel-spectrogram features from audio frame
-     */
-    computeMelFeatures(audio) {
-        if (!this.config || !this.melFilterbank) {
-            throw new Error("Config or filterbank not loaded");
-        }
-        // Compute FFT magnitude
-        const spectrum = this.computeFFT(audio);
-        // Apply mel filterbank and log compression
-        const melFeatures = new Array(this.config.n_mels);
-        for (let m = 0; m < this.config.n_mels; m++) {
-            let sum = 0;
-            const filter = this.melFilterbank[m];
-            for (let k = 0; k < spectrum.length && k < filter.length; k++) {
-                sum += spectrum[k] * spectrum[k] * filter[k]; // Power spectrum
-            }
-            // Log compression (matching training)
-            melFeatures[m] = Math.log(Math.max(sum, 1e-10) + 1);
-        }
-        return melFeatures;
-    }
-    /**
-     * Process audio buffer with ML noise suppression
-     * @param inputBuffer Audio buffer to process (Float32Array)
-     * @returns Processed audio buffer
-     */
-    async processAudio(inputBuffer) {
-        if (!this.isInitialized || !this.model || !this.config || !this.normStats) {
-            return inputBuffer;
-        }
-        try {
-            const hopLength = this.config.hop_length;
-            const frameSize = this.config.frame_size || 512;
-            const numFrames = Math.floor((inputBuffer.length - frameSize) / hopLength) + 1;
-            if (numFrames < 1) {
-                return inputBuffer;
-            }
-            // Extract mel features for each frame
-            const features = [];
-            for (let i = 0; i < numFrames; i++) {
-                const start = i * hopLength;
-                const frame = inputBuffer.slice(start, start + frameSize);
-                const melFeatures = this.computeMelFeatures(frame);
-                features.push(melFeatures);
-            }
-            // Add to frame buffer for sequence processing
-            this.frameBuffer.push(...features);
-            // Keep only recent frames (2x sequence length for overlap)
-            const seqLength = this.config.sequence_length;
-            while (this.frameBuffer.length > seqLength * 2) {
-                this.frameBuffer.shift();
-            }
-            // Need enough frames for one sequence
-            if (this.frameBuffer.length < seqLength) {
-                return inputBuffer; // Not enough frames yet, pass through
-            }
-            // Create sequence from recent frames
-            const sequence = this.frameBuffer.slice(-seqLength);
-            // Normalize features (using training stats)
-            const normalizedSeq = sequence.map((frame) => frame.map((val) => (val - this.normStats.mean) / this.normStats.std));
-            // Run model inference
-            const mask = await tf.tidy(() => {
-                const inputTensor = tf.tensor3d([normalizedSeq]);
-                const output = this.model.predict(inputTensor);
-                return output.arraySync();
-            });
-            // Get mask for the last frame (most recent prediction)
-            const lastMaskFrame = mask[0][seqLength - 1];
-            const currentMask = new Float32Array(lastMaskFrame);
-            // Apply temporal smoothing (CRITICAL for Apple-quality audio!)
-            const smoothedMask = this.applyTemporalSmoothing(currentMask);
-            // Apply mask to audio with voice preservation
-            const output = this.applyMaskWithVoicePreservation(inputBuffer, smoothedMask, numFrames);
-            return output;
-        }
-        catch (error) {
-            console.error("❌ Error processing audio:", error);
-            return inputBuffer;
-        }
-    }
-    /**
-     * Apply temporal smoothing to reduce artifacts (Apple-style)
-     */
-    applyTemporalSmoothing(currentMask) {
-        if (!this.prevMask || this.prevMask.length !== currentMask.length) {
-            this.prevMask = new Float32Array(currentMask);
-            return currentMask;
-        }
-        const smoothed = new Float32Array(currentMask.length);
-        for (let i = 0; i < currentMask.length; i++) {
-            // Exponential moving average for smooth transitions
-            smoothed[i] =
-                this.SMOOTHING_ALPHA * currentMask[i] +
-                    (1 - this.SMOOTHING_ALPHA) * this.prevMask[i];
-            // Never completely mute (preserve minimum 3% - prevents artifacts)
-            smoothed[i] = Math.max(0.03, Math.min(1.0, smoothed[i]));
-        }
-        this.prevMask = smoothed;
-        return smoothed;
-    }
-    /**
-     * Apply mask with voice frequency preservation
-     */
-    applyMaskWithVoicePreservation(audio, mask, numFrames) {
-        const output = new Float32Array(audio.length);
-        const hopLength = this.config.hop_length;
-        const nMels = this.config.n_mels;
-        // Calculate frequency-weighted gain
-        // Voice fundamentals are in lower mel bins, preserve them more
-        let voiceGain = 0;
-        let noiseGain = 0;
-        // Lower 1/4 of mels = voice fundamentals (80-500Hz)
-        const voiceBins = Math.floor(nMels / 4);
-        for (let i = 0; i < voiceBins; i++) {
-            voiceGain += mask[i];
-        }
-        voiceGain /= voiceBins;
-        // Upper 3/4 = potentially noise
-        for (let i = voiceBins; i < nMels; i++) {
-            noiseGain += mask[i];
-        }
-        noiseGain /= nMels - voiceBins;
-        // Blend gains (favor voice preservation)
-        const avgGain = voiceGain * 0.7 + noiseGain * 0.3;
-        // Apply gain per sample
-        for (let i = 0; i < audio.length; i++) {
-            // Use smooth gain
-            let gain = avgGain;
-            // Boost if mask indicates strong voice (> 0.5)
-            if (avgGain > 0.5) {
-                gain = Math.min(1.0, avgGain * 1.05);
-            }
-            output[i] = audio[i] * gain;
-        }
-        // Apply soft fade at edges to prevent clicks
-        const fadeLen = Math.min(64, output.length / 10);
-        for (let i = 0; i < fadeLen; i++) {
-            const fade = i / fadeLen;
-            output[i] *= fade;
-            output[output.length - 1 - i] *= fade;
-        }
-        return output;
-    }
-    /**
-     * Process MediaStream with ML noise suppression
-     * @param inputStream MediaStream to process
-     * @returns Cleaned MediaStream
-     */
-    async processMediaStream(inputStream) {
-        if (!this.audioContext || !this.isInitialized) {
-            console.warn("⚠️ ML Noise Suppressor not initialized, returning original stream");
-            return inputStream;
-        }
-        try {
-            console.log("🎤 [ML] Setting up BiLSTM noise suppression pipeline...");
-            // Create MediaStreamSource from input
-            const source = this.audioContext.createMediaStreamSource(inputStream);
-            // Create high-pass filter (remove <80Hz rumble - like Apple)
-            this.highPassFilter = this.audioContext.createBiquadFilter();
-            this.highPassFilter.type = "highpass";
-            this.highPassFilter.frequency.value = 80;
-            this.highPassFilter.Q.value = 0.7;
-            // Create destination for output
-            const destination = this.audioContext.createMediaStreamDestination();
-            // Create ScriptProcessor for real-time ML processing
-            // Buffer size of 2048 = ~42ms latency at 48kHz (acceptable for real-time)
-            const bufferSize = 2048;
-            this.processingNode = this.audioContext.createScriptProcessor(bufferSize, 1, 1);
-            let frameCount = 0;
-            const startTime = performance.now();
-            // Double-buffering for async ML processing
-            // We store the PREVIOUS processed result and output it in the NEXT callback
-            // This adds one buffer of latency but ensures we never output zeros
-            let previousProcessedBuffer = null;
-            let processingInFlight = false;
-            // Process audio frames with ML model
-            // IMPORTANT: onaudioprocess is synchronous! We use double-buffering to handle async ML
-            this.processingNode.onaudioprocess = (event) => {
-                const inputData = event.inputBuffer.getChannelData(0);
-                const outputData = event.outputBuffer.getChannelData(0);
-                frameCount++;
-                // OUTPUT: Use previously processed audio (or passthrough if not ready yet)
-                if (previousProcessedBuffer) {
-                    outputData.set(previousProcessedBuffer);
-                }
-                else {
-                    // First frame or ML not ready - pass through original audio
-                    outputData.set(inputData);
-                }
-                // PROCESS: Start async ML processing for the NEXT frame
-                // Only start new processing if previous one is complete
-                if (!processingInFlight) {
-                    processingInFlight = true;
-                    const inputCopy = new Float32Array(inputData);
-                    // Fire-and-forget async processing
-                    this.processAudio(inputCopy)
-                        .then((processed) => {
-                        previousProcessedBuffer = processed;
-                        processingInFlight = false;
-                    })
-                        .catch((error) => {
-                        // On error, store the original audio for passthrough
-                        previousProcessedBuffer = inputCopy;
-                        processingInFlight = false;
-                    });
-                }
-                // Log performance every ~4 seconds
-                if (frameCount % 100 === 0) {
-                    const elapsed = (performance.now() - startTime) / 1000;
-                    const fps = frameCount / elapsed;
-                    console.log(`🎤 [ML] BiLSTM: ${frameCount} frames @ ${fps.toFixed(1)} fps`);
-                }
-            };
-            // Connect: source -> highpass -> BiLSTM processor -> destination
-            source.connect(this.highPassFilter);
-            this.highPassFilter.connect(this.processingNode);
-            this.processingNode.connect(destination);
-            console.log("✅ [ML] Pipeline: mic → highpass(80Hz) → BiLSTM(256x2) → output");
-            console.log("✅ [ML] Latency: ~42ms, Sample rate: 48kHz");
-            return destination.stream;
-        }
-        catch (error) {
-            console.error("❌ [ML] Failed to process MediaStream:", error);
-            return inputStream;
-        }
-    }
-    /**
-     * Cleanup resources
-     */
-    dispose() {
-        if (this.processingNode) {
-            this.processingNode.disconnect();
-            this.processingNode = null;
-        }
-        if (this.highPassFilter) {
-            this.highPassFilter.disconnect();
-            this.highPassFilter = null;
-        }
-        if (this.model) {
-            this.model.dispose();
-            this.model = null;
-        }
-        this.frameBuffer = [];
-        this.prevMask = null;
-        this.melFilterbank = null;
-        this.isInitialized = false;
-        console.log("🗑️ ML Noise Suppressor disposed");
-    }
-    /**
-     * Check if initialized
-     */
-    isReady() {
-        return this.isInitialized;
-    }
-}
-exports.MLNoiseSuppressor = MLNoiseSuppressor;
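
The removed `processMediaStream` is built on `createScriptProcessor`, which the Web Audio spec has deprecated in favor of `AudioWorklet`. A minimal sketch of the worklet half of a replacement; the processor name and the message-port gain protocol are illustrative, not part of this package:

```ts
// processor.ts — loaded via audioContext.audioWorklet.addModule("processor.js").
// Replaces the deprecated ScriptProcessorNode used above; runs on the audio
// rendering thread in 128-sample quanta. "gain-mask-processor" is an illustrative name.
class GainMaskProcessor extends AudioWorkletProcessor {
  private gain = 1; // updated from the main thread via the message port

  constructor() {
    super();
    this.port.onmessage = (e: MessageEvent<number>) => { this.gain = e.data; };
  }

  process(inputs: Float32Array[][], outputs: Float32Array[][]): boolean {
    const input = inputs[0]?.[0];
    const output = outputs[0]?.[0];
    if (input && output) {
      // Apply the latest gain/mask value computed off-thread.
      for (let i = 0; i < input.length; i++) output[i] = input[i] * this.gain;
    }
    return true; // keep the node alive
  }
}

registerProcessor("gain-mask-processor", GainMaskProcessor);
```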
package/dist/UltimateMLNoiseSuppressor.d.ts
REMOVED

@@ -1,74 +0,0 @@
-/**
- * ULTIMATE ML Noise Suppressor - Enhanced for Apple/Google Meet Quality
- * Features:
- * 1. Temporal smoothing (exponential moving average)
- * 2. Voice frequency preservation (80-500 Hz)
- * 3. Sub-bass filtering (remove < 80 Hz)
- * 4. Adaptive processing
- * 5. WebAssembly acceleration
- */
-export declare class UltimateMLNoiseSuppressor {
-    private model;
-    private config;
-    private normStats;
-    private audioContext;
-    private isInitialized;
-    private prevMask;
-    private readonly SMOOTHING_ALPHA;
-    private highPassFilter;
-    private voiceBandFilter;
-    private processingQueue;
-    private isProcessing;
-    /**
-     * Initialize with enhanced setup
-     */
-    initialize(modelUrl: string, audioContext: AudioContext): Promise<void>;
-    /**
-     * Setup filters for voice frequency preservation
-     */
-    private setupVoiceFilters;
-    /**
-     * Process audio with ULTIMATE quality
-     * NOTE: This runs in the AudioWorklet thread. It must be synchronous and fast.
-     * The heavy ML inference should ideally happen in a Worker, communicating via SharedArrayBuffer.
-     * For this implementation, we use a simplified frame-based approach.
-     */
-    processAudio(inputBuffer: Float32Array): Float32Array;
-    /**
-     * Placeholder for async processing (to be moved to a Web Worker)
-     */
-    processFrameAsync(inputBuffer: Float32Array): Promise<void>;
-    /**
-     * CRITICAL: Temporal smoothing (biggest quality improvement!)
-     */
-    private applyTemporalSmoothing;
-    /**
-     * Apply high-pass filter to remove rumble
-     */
-    private applyHighPassFilter;
-    /**
-     * Apply mask with voice frequency preservation
-     */
-    private applyMaskWithVoicePreservation;
-    /**
-     * Extract mel-spectrogram features
-     */
-    private extractMelFeatures;
-    /**
-     * Compute mel bin (simplified)
-     */
-    private computeMelBin;
-    /**
-     * Create sequences for LSTM input
-     */
-    private createSequences;
-    /**
-     * Reset processing state (call when switching audio streams)
-     */
-    reset(): void;
-    /**
-     * Get processing latency
-     */
-    getLatency(): number;
-}
-export default UltimateMLNoiseSuppressor;
package/dist/UltimateMLNoiseSuppressor.js
REMOVED

@@ -1,309 +0,0 @@
-"use strict";
-/**
- * ULTIMATE ML Noise Suppressor - Enhanced for Apple/Google Meet Quality
- * Features:
- * 1. Temporal smoothing (exponential moving average)
- * 2. Voice frequency preservation (80-500 Hz)
- * 3. Sub-bass filtering (remove < 80 Hz)
- * 4. Adaptive processing
- * 5. WebAssembly acceleration
- */
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-        desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || (function () {
-    var ownKeys = function(o) {
-        ownKeys = Object.getOwnPropertyNames || function (o) {
-            var ar = [];
-            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
-            return ar;
-        };
-        return ownKeys(o);
-    };
-    return function (mod) {
-        if (mod && mod.__esModule) return mod;
-        var result = {};
-        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
-        __setModuleDefault(result, mod);
-        return result;
-    };
-})();
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.UltimateMLNoiseSuppressor = void 0;
-const tf = __importStar(require("@tensorflow/tfjs"));
-class UltimateMLNoiseSuppressor {
-    constructor() {
-        this.model = null;
-        this.config = null;
-        this.normStats = null;
-        this.audioContext = null;
-        this.isInitialized = false;
-        // CRITICAL: Temporal smoothing state
-        this.prevMask = null;
-        this.SMOOTHING_ALPHA = 0.85; // 85% current, 15% previous
-        // Voice frequency preservation
-        this.highPassFilter = null;
-        this.voiceBandFilter = null;
-        // Processing optimization
-        this.processingQueue = [];
-        this.isProcessing = false;
-    }
-    /**
-     * Initialize with enhanced setup
-     */
-    async initialize(modelUrl, audioContext) {
-        console.log("🚀 Initializing ULTIMATE ML Noise Suppressor...");
-        this.audioContext = audioContext;
-        try {
-            // Load model
-            console.log(`📂 Loading model from ${modelUrl}`);
-            this.model = await tf.loadLayersModel(modelUrl);
-            console.log("✅ Model loaded");
-            // Load config
-            const baseUrl = modelUrl.substring(0, modelUrl.lastIndexOf("/"));
-            const configResponse = await fetch(`${baseUrl}/model_config.json`);
-            this.config = await configResponse.json();
-            // Load normalization stats
-            const normResponse = await fetch(`${baseUrl}/normalization_stats.json`);
-            this.normStats = await normResponse.json();
-            // Setup voice frequency filters
-            this.setupVoiceFilters();
-            this.isInitialized = true;
-            console.log("✅ ULTIMATE ML Noise Suppressor initialized!");
-        }
-        catch (error) {
-            console.error("❌ Failed to initialize:", error);
-            throw error;
-        }
-    }
-    /**
-     * Setup filters for voice frequency preservation
-     */
-    setupVoiceFilters() {
-        if (!this.audioContext)
-            return;
-        // High-pass filter: Remove sub-bass rumble (< 80 Hz)
-        this.highPassFilter = this.audioContext.createBiquadFilter();
-        this.highPassFilter.type = "highpass";
-        this.highPassFilter.frequency.value = 80; // 80 Hz cutoff
-        this.highPassFilter.Q.value = 0.7;
-        // Bandpass filter: Enhance voice fundamentals (100-300 Hz)
-        this.voiceBandFilter = this.audioContext.createBiquadFilter();
-        this.voiceBandFilter.type = "bandpass";
-        this.voiceBandFilter.frequency.value = 200; // Center at 200 Hz
-        this.voiceBandFilter.Q.value = 1.4;
-    }
-    /**
-     * Process audio with ULTIMATE quality
-     * NOTE: This runs in the AudioWorklet thread. It must be synchronous and fast.
-     * The heavy ML inference should ideally happen in a Worker, communicating via SharedArrayBuffer.
-     * For this implementation, we use a simplified frame-based approach.
-     */
-    processAudio(inputBuffer) {
-        if (!this.isInitialized || !this.model || !this.config || !this.normStats) {
-            return inputBuffer;
-        }
-        // 1. Pre-processing: Remove sub-bass rumble (High-pass)
-        // Note: In a real AudioWorklet, filters should be applied per-sample or per-block, not on the whole buffer at once if it's a stream.
-        // But assuming inputBuffer is a processing block (e.g. 128 samples):
-        const filtered = this.applyHighPassFilter(inputBuffer);
-        // ⚠️ CRITICAL ARCHITECTURE NOTE ⚠️
-        // We cannot await this.model.predict() here because this function must return immediately for real-time audio.
-        // The correct architecture is:
-        // 1. AudioWorklet writes audio to a RingBuffer (SharedArrayBuffer).
-        // 2. Web Worker reads RingBuffer, runs TFJS inference (async), writes Mask to another RingBuffer.
-        // 3. AudioWorklet reads latest Mask from RingBuffer and applies it.
-        // For now, we will return the filtered audio.
-        // To enable ML, you must implement the Worker architecture described above.
-        // Running TFJS on the main audio thread will cause stuttering.
-        return filtered;
-    }
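
The architecture note inside `processAudio` calls for a SharedArrayBuffer ring buffer between the worklet and a worker. A minimal single-producer/single-consumer sketch of that primitive; the layout and the absence of overflow handling are deliberate simplifications:

```ts
// Minimal SPSC ring buffer over a SharedArrayBuffer, in the spirit of the
// architecture note above. Capacity, layout, and lack of overflow checks are
// illustrative simplifications.
class RingBuffer {
  private readonly data: Float32Array;
  private readonly idx: Int32Array; // idx[0] = write index, idx[1] = read index

  constructor(sab: SharedArrayBuffer, capacity: number) {
    this.idx = new Int32Array(sab, 0, 2);
    this.data = new Float32Array(sab, 8, capacity);
  }

  static bytesNeeded(capacity: number): number {
    return 8 + capacity * Float32Array.BYTES_PER_ELEMENT;
  }

  // Worklet side: append a quantum of samples.
  push(samples: Float32Array): void {
    let w = Atomics.load(this.idx, 0);
    for (const s of samples) {
      this.data[w] = s;
      w = (w + 1) % this.data.length;
    }
    Atomics.store(this.idx, 0, w);
  }

  // Worker side: drain whatever is available, up to out.length samples.
  pull(out: Float32Array): number {
    let r = Atomics.load(this.idx, 1);
    const w = Atomics.load(this.idx, 0);
    let n = 0;
    while (r !== w && n < out.length) {
      out[n++] = this.data[r];
      r = (r + 1) % this.data.length;
    }
    Atomics.store(this.idx, 1, r);
    return n; // samples actually read
  }
}
```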
-    /**
-     * Placeholder for async processing (to be moved to a Web Worker)
-     */
-    async processFrameAsync(inputBuffer) {
-        // This logic belongs in a Web Worker
-        try {
-            const features = await this.extractMelFeatures(inputBuffer);
-            const normalizedFeatures = tf.tidy(() => {
-                const tensor = tf.tensor2d(features);
-                return tensor.sub(this.normStats.mean).div(this.normStats.std);
-            });
-            const featuresArray = await normalizedFeatures.array();
-            const sequences = this.createSequences(featuresArray, this.config.sequence_length);
-            if (sequences.length > 0) {
-                const sequenceTensor = tf.tensor3d([sequences[0]]);
-                const maskTensor = this.model.predict(sequenceTensor);
-                const maskData = await maskTensor.data();
-                const flatMask = Array.from(maskData);
-                // Update the current mask for the AudioWorklet to use
-                this.prevMask = this.applyTemporalSmoothing(flatMask);
-                normalizedFeatures.dispose();
-                sequenceTensor.dispose();
-                maskTensor.dispose();
-            }
-        }
-        catch (e) {
-            console.error(e);
-        }
-    }
-    /**
-     * CRITICAL: Temporal smoothing (biggest quality improvement!)
-     */
-    applyTemporalSmoothing(currentMask) {
-        const smoothed = new Float32Array(currentMask.length);
-        if (!this.prevMask || this.prevMask.length !== currentMask.length) {
-            // First frame - no smoothing
-            this.prevMask = new Float32Array(currentMask);
-            return this.prevMask;
-        }
-        // Exponential moving average
-        for (let i = 0; i < currentMask.length; i++) {
-            smoothed[i] =
-                this.SMOOTHING_ALPHA * currentMask[i] +
-                    (1 - this.SMOOTHING_ALPHA) * this.prevMask[i];
-            // Clamp to valid range [0.02, 1.0]
-            // Never completely mute (min 2%)
-            smoothed[i] = Math.max(0.02, Math.min(1.0, smoothed[i]));
-        }
-        this.prevMask = smoothed;
-        return smoothed;
-    }
-    /**
-     * Apply high-pass filter to remove rumble
-     */
-    applyHighPassFilter(input) {
-        // Simple IIR high-pass filter (80 Hz @ 48kHz)
-        const output = new Float32Array(input.length);
-        const alpha = 0.98; // Filter coefficient
-        output[0] = input[0];
-        for (let i = 1; i < input.length; i++) {
-            output[i] = alpha * (output[i - 1] + input[i] - input[i - 1]);
-        }
-        return output;
-    }
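
A side note on `applyHighPassFilter`: for the one-pole form y[n] = α·(y[n−1] + x[n] − x[n−1]), the usual RC mapping gives f_c = (1 − α)·fs/(2π·α), so α = 0.98 at 48 kHz sits near 156 Hz rather than the commented 80 Hz. Worked out under that standard mapping:

```ts
// One-pole high-pass y[n] = α·(y[n-1] + x[n] − x[n-1]) with α = RC / (RC + 1/fs),
// hence f_c = (1 − α)·fs / (2π·α). Checking the constants used above:
const fs = 48000;
const cutoffHz = (alpha: number) => ((1 - alpha) * fs) / (2 * Math.PI * alpha);
const alphaFor = (fc: number) => 1 / (1 + (2 * Math.PI * fc) / fs);

console.log(cutoffHz(0.98).toFixed(1)); // ≈ 155.9 — not the commented 80 Hz
console.log(alphaFor(80).toFixed(4));   // ≈ 0.9896 — coefficient for a true 80 Hz cutoff
```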
-    /**
-     * Apply mask with voice frequency preservation
-     */
-    applyMaskWithVoicePreservation(audio, mask, numFrames) {
-        const output = new Float32Array(audio.length);
-        // Simple overlap-add (proper implementation would use ISTFT)
-        const hopLength = Math.floor(audio.length / numFrames);
-        for (let i = 0; i < audio.length; i++) {
-            const frameIdx = Math.floor(i / hopLength);
-            const maskIdx = Math.min(frameIdx, numFrames - 1);
-            // Apply mask
-            let gain = 1.0;
-            if (maskIdx < mask.length / this.config.n_mels) {
-                // Average mask across frequency bins for this frame
-                let maskSum = 0;
-                const startBin = maskIdx * this.config.n_mels;
-                for (let j = 0; j < this.config.n_mels; j++) {
-                    maskSum += mask[startBin + j];
-                }
-                gain = maskSum / this.config.n_mels;
-            }
-            // Apply gain with minimum threshold
-            output[i] = audio[i] * Math.max(0.02, gain);
-        }
-        // Apply fade-in/out to prevent clicks
-        const fadeLength = Math.min(256, output.length / 10);
-        for (let i = 0; i < fadeLength; i++) {
-            const fade = i / fadeLength;
-            output[i] *= fade;
-            output[output.length - 1 - i] *= fade;
-        }
-        return output;
-    }
-    /**
-     * Extract mel-spectrogram features
-     */
-    async extractMelFeatures(audio) {
-        if (!this.config)
-            throw new Error("Config not loaded");
-        // Simplified feature extraction
-        // In production, use proper STFT + Mel filterbank
-        const frameLength = this.config.n_fft;
-        const hopLength = this.config.hop_length;
-        const numFrames = Math.floor((audio.length - frameLength) / hopLength) + 1;
-        const features = [];
-        for (let i = 0; i < numFrames; i++) {
-            const start = i * hopLength;
-            const frame = audio.slice(start, start + frameLength);
-            // Compute mel bins (simplified)
-            const frameFeatures = [];
-            for (let j = 0; j < this.config.n_mels; j++) {
-                const melBin = this.computeMelBin(frame, j);
-                frameFeatures.push(melBin);
-            }
-            features.push(frameFeatures);
-        }
-        return features;
-    }
-    /**
-     * Compute mel bin (simplified)
-     */
-    computeMelBin(frame, binIndex) {
-        const start = Math.floor((binIndex / this.config.n_mels) * frame.length);
-        const end = Math.floor(((binIndex + 1) / this.config.n_mels) * frame.length);
-        let sum = 0;
-        for (let i = start; i < end && i < frame.length; i++) {
-            sum += Math.abs(frame[i]);
-        }
-        const avg = sum / (end - start);
-        // Convert to log scale (dB-like)
-        return Math.log10(avg + 1e-8) * 10;
-    }
-    /**
-     * Create sequences for LSTM input
-     */
-    createSequences(features, seqLength) {
-        const sequences = [];
-        for (let i = 0; i <= features.length - seqLength; i++) {
-            sequences.push(features.slice(i, i + seqLength));
-        }
-        // If not enough frames, pad with last frame
-        if (sequences.length === 0 && features.length > 0) {
-            const paddedSeq = [];
-            for (let i = 0; i < seqLength; i++) {
-                paddedSeq.push(features[Math.min(i, features.length - 1)]);
-            }
-            sequences.push(paddedSeq);
-        }
-        return sequences;
-    }
-    /**
-     * Reset processing state (call when switching audio streams)
-     */
-    reset() {
-        this.prevMask = null;
-        this.processingQueue = [];
-    }
-    /**
-     * Get processing latency
-     */
-    getLatency() {
-        if (!this.config)
-            return 0;
-        // Approximate latency in milliseconds
-        const bufferLatency = (this.config.n_fft / this.config.sample_rate) * 1000;
-        const processingLatency = 10; // Model inference ~10ms
-        return bufferLatency + processingLatency;
-    }
-}
-exports.UltimateMLNoiseSuppressor = UltimateMLNoiseSuppressor;
-// Export for use in AudioWorklet
-exports.default = UltimateMLNoiseSuppressor;