@newgameplusinc/odyssey-audio-video-sdk-dev 1.0.57 → 1.0.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,439 +0,0 @@
1
- "use strict";
2
- /**
3
- * ML-Based Noise Suppressor for Odyssey MediaSoup SDK
4
- * Uses trained TensorFlow.js BiLSTM model for real-time noise suppression
5
- *
6
- * Architecture: BiLSTM (256 units x 2) + Dense layers
7
- * Input: Mel-spectrogram features (16 frames x 128 mels)
8
- * Output: Noise suppression mask (0-1 per frequency bin)
9
- *
10
- * Trained on: LibriSpeech + UrbanSound8K + MS-SNSD datasets
11
- * Performance: val_loss=0.038, SNR improvement ~12dB
12
- */
13
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
14
- if (k2 === undefined) k2 = k;
15
- var desc = Object.getOwnPropertyDescriptor(m, k);
16
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
17
- desc = { enumerable: true, get: function() { return m[k]; } };
18
- }
19
- Object.defineProperty(o, k2, desc);
20
- }) : (function(o, m, k, k2) {
21
- if (k2 === undefined) k2 = k;
22
- o[k2] = m[k];
23
- }));
24
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
25
- Object.defineProperty(o, "default", { enumerable: true, value: v });
26
- }) : function(o, v) {
27
- o["default"] = v;
28
- });
29
- var __importStar = (this && this.__importStar) || (function () {
30
- var ownKeys = function(o) {
31
- ownKeys = Object.getOwnPropertyNames || function (o) {
32
- var ar = [];
33
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
34
- return ar;
35
- };
36
- return ownKeys(o);
37
- };
38
- return function (mod) {
39
- if (mod && mod.__esModule) return mod;
40
- var result = {};
41
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
42
- __setModuleDefault(result, mod);
43
- return result;
44
- };
45
- })();
46
- Object.defineProperty(exports, "__esModule", { value: true });
47
- exports.MLNoiseSuppressor = void 0;
48
- const tf = __importStar(require("@tensorflow/tfjs"));
49
/**
 * Noise suppressor that drives a TensorFlow.js layers model with log-mel
 * features and applies the predicted mask as a broadband gain.
 *
 * Lifecycle: `initialize()` loads the model plus two JSON side files, after
 * which either `processAudio()` (buffer in, buffer out) or
 * `processMediaStream()` (Web Audio pipeline) can be used. `dispose()` tears
 * everything down.
 *
 * Config keys read from `model_config.json`: `frame_size`, `hop_length`,
 * `sample_rate`, `n_mels`, `sequence_length`. Keys read from
 * `normalization_stats.json`: `mean`, `std` (both numbers).
 */
class MLNoiseSuppressor {
    constructor() {
        this.model = null;
        this.config = null;
        this.normStats = null;
        this.audioContext = null;
        this.isInitialized = false;
        // Web Audio nodes of the currently running pipeline (if any).
        this.sourceNode = null;
        this.processingNode = null;
        this.highPassFilter = null;
        // Most recent mel frames, used to assemble the model's input sequence.
        this.frameBuffer = [];
        // Last smoothed mask, blended into the next one.
        this.prevMask = null;
        // Weight of the NEWEST mask in the moving average: values close to 1
        // follow the model quickly, lower values smooth more.
        this.SMOOTHING_ALPHA = 0.85;
        // Gain applied at the end of the previous buffer; the next buffer ramps
        // from it so gain changes do not click. 1 = unprocessed pass-through.
        this.lastGain = 1;
        // Mel filterbank cache
        this.melFilterbank = null;
        this.fftSize = 512;
        // DFT workspace: analysis window and cached cos/sin tables.
        this.hannWindow = null;
        this.twiddleCos = null;
        this.twiddleSin = null;
    }
    /**
     * Load the model, its config and its normalization statistics, then build
     * the analysis window and the mel filterbank.
     * @param modelUrl URL of the model.json file; `model_config.json` and
     *   `normalization_stats.json` are fetched from the same directory.
     * @param audioContext Web Audio API AudioContext used by processMediaStream.
     * @throws Re-throws any load/validation error after logging it.
     */
    async initialize(modelUrl, audioContext) {
        console.log("🚀 Initializing ML Noise Suppressor (BiLSTM v2)...");
        this.audioContext = audioContext;
        try {
            console.log(`📂 Loading model from ${modelUrl}`);
            this.model = await tf.loadLayersModel(modelUrl);
            console.log("✅ Model loaded successfully");
            console.log(`   Parameters: ${this.model.countParams().toLocaleString()}`);
            const baseUrl = modelUrl.substring(0, modelUrl.lastIndexOf("/"));
            this.config = await this.fetchJson(`${baseUrl}/model_config.json`);
            console.log("⚙️ Config loaded:", this.config);
            this.normStats = await this.fetchJson(`${baseUrl}/normalization_stats.json`);
            const mean = this.normStats?.mean;
            const std = this.normStats?.std;
            // Features are later computed as (x - mean) / std, so both must be
            // usable numbers and std must not be zero.
            if (!Number.isFinite(mean) || !Number.isFinite(std) || std === 0) {
                throw new Error("normalization_stats.json must contain finite numeric mean and non-zero std");
            }
            console.log(`📏 Normalization stats: mean=${mean.toFixed(4)}, std=${std.toFixed(4)}`);
            this.fftSize = this.config.frame_size || 512;
            this.hannWindow = this.createHannWindow(this.fftSize);
            this.melFilterbank = this.createMelFilterbank(this.fftSize, this.config.sample_rate, this.config.n_mels, 20, // fmin
            8000 // fmax for voice
            );
            this.isInitialized = true;
            console.log("✅ ML Noise Suppressor initialized!");
        }
        catch (error) {
            console.error("❌ Failed to initialize ML Noise Suppressor:", error);
            throw error;
        }
    }
    /**
     * Fetch a URL and parse its body as JSON.
     * @throws Error when the HTTP status is not in the 2xx range.
     */
    async fetchJson(url) {
        const response = await fetch(url);
        if (!response.ok) {
            throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
        }
        return response.json();
    }
    /**
     * Create a symmetric Hann window of the given length.
     * Both end points of the window are exactly/approximately zero.
     */
    createHannWindow(size) {
        const window = new Float32Array(size);
        if (size === 1) {
            // The general formula divides by (size - 1); a one-sample window is 1.
            window[0] = 1;
            return window;
        }
        for (let i = 0; i < size; i++) {
            window[i] = 0.5 * (1 - Math.cos((2 * Math.PI * i) / (size - 1)));
        }
        return window;
    }
    /**
     * Create a triangular mel filterbank.
     * @returns Array of `nMels` Float32Array filters, each `fftSize / 2 + 1` long.
     */
    createMelFilterbank(fftSize, sampleRate, nMels, fmin, fmax) {
        const nFft = Math.floor(fftSize / 2) + 1;
        const hzToMel = (hz) => 2595 * Math.log10(1 + hz / 700);
        const melToHz = (mel) => 700 * (Math.pow(10, mel / 2595) - 1);
        const melMin = hzToMel(fmin);
        const melMax = hzToMel(fmax);
        // nMels + 2 equally spaced points on the mel scale, mapped to FFT bins.
        const binPoints = [];
        for (let i = 0; i < nMels + 2; i++) {
            const mel = melMin + ((melMax - melMin) * i) / (nMels + 1);
            binPoints.push(Math.floor(((fftSize + 1) * melToHz(mel)) / sampleRate));
        }
        const filterbank = [];
        for (let m = 0; m < nMels; m++) {
            const filter = new Float32Array(nFft);
            const left = binPoints[m];
            const center = binPoints[m + 1];
            const right = binPoints[m + 2];
            // Rising slope (empty when left === center, so no division by zero).
            for (let k = left; k < center && k < nFft; k++) {
                filter[k] = (k - left) / (center - left);
            }
            // Falling slope
            for (let k = center; k < right && k < nFft; k++) {
                filter[k] = (right - k) / (right - center);
            }
            filterbank.push(filter);
        }
        return filterbank;
    }
    /**
     * Build (or reuse) cos/sin tables for an N-point DFT.
     */
    ensureTwiddles(N) {
        if (this.twiddleCos && this.twiddleCos.length === N) {
            return;
        }
        this.twiddleCos = new Float64Array(N);
        this.twiddleSin = new Float64Array(N);
        for (let i = 0; i < N; i++) {
            const angle = (2 * Math.PI * i) / N;
            this.twiddleCos[i] = Math.cos(angle);
            this.twiddleSin[i] = Math.sin(angle);
        }
    }
    /**
     * Compute the magnitude spectrum of one frame with a direct DFT.
     * The frame is multiplied by the Hann window first (samples without a
     * window value are left unweighted).
     * @returns Magnitudes for bins 0..floor(N / 2).
     */
    computeFFT(frame) {
        const N = frame.length;
        const half = Math.floor(N / 2);
        const magnitude = new Float32Array(half + 1);
        if (N === 0) {
            return magnitude;
        }
        const windowed = new Float32Array(N);
        for (let i = 0; i < N; i++) {
            // `??` rather than `||`: a window value of 0 (the Hann end points)
            // is valid and must not be replaced by 1.
            windowed[i] = frame[i] * (this.hannWindow?.[i] ?? 1);
        }
        this.ensureTwiddles(N);
        const cosTable = this.twiddleCos;
        const sinTable = this.twiddleSin;
        for (let k = 0; k <= half; k++) {
            let real = 0;
            let imag = 0;
            // idx tracks (k * n) mod N; k <= N / 2 so one subtraction suffices.
            let idx = 0;
            for (let n = 0; n < N; n++) {
                real += windowed[n] * cosTable[idx];
                imag -= windowed[n] * sinTable[idx];
                idx += k;
                if (idx >= N) {
                    idx -= N;
                }
            }
            magnitude[k] = Math.sqrt(real * real + imag * imag);
        }
        return magnitude;
    }
    /**
     * Compute log-compressed mel energies for one audio frame.
     * @returns Array with `config.n_mels` values.
     * @throws Error when config or filterbank are not loaded.
     */
    computeMelFeatures(audio) {
        if (!this.config || !this.melFilterbank) {
            throw new Error("Config or filterbank not loaded");
        }
        const spectrum = this.computeFFT(audio);
        const melFeatures = new Array(this.config.n_mels);
        for (let m = 0; m < this.config.n_mels; m++) {
            let sum = 0;
            const filter = this.melFilterbank[m];
            for (let k = 0; k < spectrum.length && k < filter.length; k++) {
                sum += spectrum[k] * spectrum[k] * filter[k]; // Power spectrum
            }
            // Log compression (matching training)
            melFeatures[m] = Math.log(Math.max(sum, 1e-10) + 1);
        }
        return melFeatures;
    }
    /**
     * Process an audio buffer with ML noise suppression.
     * Returns the input unchanged when the suppressor is not ready, when the
     * buffer is shorter than one frame, while fewer than `sequence_length`
     * frames have been collected, or when processing fails.
     * @param inputBuffer Audio buffer to process (Float32Array)
     * @returns Processed audio buffer
     */
    async processAudio(inputBuffer) {
        if (!this.isInitialized || !this.model || !this.config || !this.normStats) {
            return inputBuffer;
        }
        try {
            const hopLength = this.config.hop_length;
            const frameSize = this.config.frame_size || 512;
            const seqLength = this.config.sequence_length;
            // A zero/negative hop would make the frame count infinite.
            if (!(hopLength > 0)) {
                return inputBuffer;
            }
            const numFrames = Math.floor((inputBuffer.length - frameSize) / hopLength) + 1;
            if (numFrames < 1) {
                return inputBuffer;
            }
            const features = [];
            for (let i = 0; i < numFrames; i++) {
                const start = i * hopLength;
                features.push(this.computeMelFeatures(inputBuffer.slice(start, start + frameSize)));
            }
            // Keep only recent frames (2x sequence length for overlap)
            this.frameBuffer.push(...features);
            const overflow = this.frameBuffer.length - seqLength * 2;
            if (overflow > 0) {
                this.frameBuffer.splice(0, overflow);
            }
            if (this.frameBuffer.length < seqLength) {
                return inputBuffer; // Not enough frames yet, pass through
            }
            const sequence = this.frameBuffer.slice(-seqLength);
            const { mean, std } = this.normStats;
            const normalizedSeq = sequence.map((frame) => frame.map((val) => (val - mean) / std));
            // tf.tidy is synchronous; arraySync() copies the result out before
            // the intermediate tensors are released.
            const mask = tf.tidy(() => {
                const inputTensor = tf.tensor3d([normalizedSeq]);
                const output = this.model.predict(inputTensor);
                return output.arraySync();
            });
            // Mask for the last (most recent) frame of the sequence.
            const lastMaskFrame = mask?.[0]?.[seqLength - 1];
            if (!Array.isArray(lastMaskFrame) || lastMaskFrame.length === 0) {
                throw new Error("Model output has no mask for the last sequence frame");
            }
            const smoothedMask = this.applyTemporalSmoothing(new Float32Array(lastMaskFrame));
            return this.applyMaskWithVoicePreservation(inputBuffer, smoothedMask, numFrames);
        }
        catch (error) {
            console.error("❌ Error processing audio:", error);
            return inputBuffer;
        }
    }
    /**
     * Blend the new mask with the previous one (exponential moving average)
     * and clamp every value to [0.03, 1] so no bin is ever fully muted.
     * The clamp is applied to the very first mask as well.
     * @returns The smoothed mask, which is also stored for the next call.
     */
    applyTemporalSmoothing(currentMask) {
        const previous = this.prevMask;
        const hasHistory = Boolean(previous) && previous.length === currentMask.length;
        const alpha = this.SMOOTHING_ALPHA;
        const smoothed = new Float32Array(currentMask.length);
        for (let i = 0; i < currentMask.length; i++) {
            const blended = hasHistory
                ? alpha * currentMask[i] + (1 - alpha) * previous[i]
                : currentMask[i];
            smoothed[i] = Math.max(0.03, Math.min(1.0, blended));
        }
        this.prevMask = smoothed;
        return smoothed;
    }
    /**
     * Collapse the mask into one gain (lower quarter of the bins weighted 0.7,
     * the rest 0.3) and apply it to the audio.
     *
     * The first samples ramp linearly from the gain used at the end of the
     * previous buffer to the new gain, which avoids clicks without attenuating
     * the buffer edges.
     * @param audio Samples to scale.
     * @param mask Smoothed mask, one value per mel bin.
     * @param numFrames Unused; kept for interface compatibility.
     * @returns New Float32Array with the scaled samples.
     */
    applyMaskWithVoicePreservation(audio, mask, numFrames) {
        const output = new Float32Array(audio.length);
        const nMels = Math.min(this.config.n_mels, mask.length);
        let targetGain = NaN;
        if (nMels >= 1) {
            // At least one "voice" bin so tiny filterbanks do not divide by zero.
            const voiceBins = Math.max(1, Math.floor(nMels / 4));
            let voiceGain = 0;
            for (let i = 0; i < voiceBins; i++) {
                voiceGain += mask[i];
            }
            voiceGain /= voiceBins;
            const upperBins = nMels - voiceBins;
            let noiseGain = voiceGain;
            if (upperBins > 0) {
                noiseGain = 0;
                for (let i = voiceBins; i < nMels; i++) {
                    noiseGain += mask[i];
                }
                noiseGain /= upperBins;
            }
            // Blend gains (favor voice preservation)
            const avgGain = voiceGain * 0.7 + noiseGain * 0.3;
            // Slight boost when the mask indicates strong voice (> 0.5)
            targetGain = avgGain > 0.5 ? Math.min(1.0, avgGain * 1.05) : avgGain;
        }
        if (!Number.isFinite(targetGain)) {
            // Unusable mask: never emit NaN audio, pass the samples through.
            output.set(audio);
            this.lastGain = 1;
            return output;
        }
        const rampLen = Math.min(64, Math.floor(output.length / 10));
        const startGain = this.lastGain;
        for (let i = 0; i < output.length; i++) {
            const gain = i < rampLen
                ? startGain + (targetGain - startGain) * (i / rampLen)
                : targetGain;
            output[i] = audio[i] * gain;
        }
        this.lastGain = targetGain;
        return output;
    }
    /**
     * Disconnect and forget the Web Audio nodes of the current pipeline.
     */
    disconnectPipeline() {
        if (this.processingNode) {
            this.processingNode.onaudioprocess = null;
            this.processingNode.disconnect();
            this.processingNode = null;
        }
        if (this.highPassFilter) {
            this.highPassFilter.disconnect();
            this.highPassFilter = null;
        }
        if (this.sourceNode) {
            this.sourceNode.disconnect();
            this.sourceNode = null;
        }
    }
    /**
     * Route a MediaStream through: source -> 80 Hz high-pass -> ScriptProcessor
     * (ML processing) -> MediaStreamDestination.
     * Any pipeline built by an earlier call is disconnected first.
     * @param inputStream MediaStream to process
     * @returns The destination's stream, or `inputStream` itself when the
     *   suppressor is not initialized or the pipeline cannot be built.
     */
    async processMediaStream(inputStream) {
        if (!this.audioContext || !this.isInitialized) {
            console.warn("⚠️ ML Noise Suppressor not initialized, returning original stream");
            return inputStream;
        }
        try {
            console.log("🎤 [ML] Setting up BiLSTM noise suppression pipeline...");
            this.disconnectPipeline();
            const context = this.audioContext;
            // The mel filterbank was built for config.sample_rate; frames taken
            // at another rate map to different frequencies.
            if (this.config && this.config.sample_rate && this.config.sample_rate !== context.sampleRate) {
                console.warn(`⚠️ [ML] AudioContext sample rate ${context.sampleRate} differs from model sample rate ${this.config.sample_rate}`);
            }
            this.sourceNode = context.createMediaStreamSource(inputStream);
            this.highPassFilter = context.createBiquadFilter();
            this.highPassFilter.type = "highpass";
            this.highPassFilter.frequency.value = 80;
            this.highPassFilter.Q.value = 0.7;
            const destination = context.createMediaStreamDestination();
            // 2048 samples per callback (~42ms at 48kHz).
            const bufferSize = 2048;
            this.processingNode = context.createScriptProcessor(bufferSize, 1, 1);
            let frameCount = 0;
            const startTime = performance.now();
            // onaudioprocess is synchronous but inference is async, so each
            // callback emits the most recently finished result and kicks off
            // processing of the current input for a later callback.
            // NOTE(review): when inference takes longer than one buffer period
            // the same processed buffer is emitted again - confirm this is the
            // intended trade-off.
            let previousProcessedBuffer = null;
            let processingInFlight = false;
            this.processingNode.onaudioprocess = (event) => {
                const inputData = event.inputBuffer.getChannelData(0);
                const outputData = event.outputBuffer.getChannelData(0);
                frameCount++;
                // Nothing processed yet -> pass the live input through.
                outputData.set(previousProcessedBuffer ? previousProcessedBuffer : inputData);
                // Only one inference at a time.
                if (!processingInFlight) {
                    processingInFlight = true;
                    const inputCopy = new Float32Array(inputData);
                    this.processAudio(inputCopy)
                        .then((processed) => {
                        previousProcessedBuffer = processed;
                        processingInFlight = false;
                    })
                        .catch(() => {
                        // On error, store the original audio for passthrough
                        previousProcessedBuffer = inputCopy;
                        processingInFlight = false;
                    });
                }
                // Log performance every 100 callbacks
                if (frameCount % 100 === 0) {
                    const elapsed = (performance.now() - startTime) / 1000;
                    const fps = frameCount / elapsed;
                    console.log(`🎤 [ML] BiLSTM: ${frameCount} frames @ ${fps.toFixed(1)} fps`);
                }
            };
            this.sourceNode.connect(this.highPassFilter);
            this.highPassFilter.connect(this.processingNode);
            this.processingNode.connect(destination);
            console.log("✅ [ML] Pipeline: mic → highpass(80Hz) → BiLSTM(256x2) → output");
            // Report the real figures instead of assuming a 48 kHz context.
            const latencyMs = Math.floor((bufferSize / context.sampleRate) * 1000);
            console.log(`✅ [ML] Latency: ~${latencyMs}ms, Sample rate: ${context.sampleRate / 1000}kHz`);
            return destination.stream;
        }
        catch (error) {
            console.error("❌ [ML] Failed to process MediaStream:", error);
            return inputStream;
        }
    }
    /**
     * Disconnect the audio pipeline, dispose the model and clear cached state.
     * `isReady()` returns false afterwards.
     */
    dispose() {
        this.disconnectPipeline();
        if (this.model) {
            this.model.dispose();
            this.model = null;
        }
        this.frameBuffer = [];
        this.prevMask = null;
        this.lastGain = 1;
        this.melFilterbank = null;
        this.hannWindow = null;
        this.twiddleCos = null;
        this.twiddleSin = null;
        this.isInitialized = false;
        console.log("🗑️ ML Noise Suppressor disposed");
    }
    /**
     * Check if initialized
     */
    isReady() {
        return this.isInitialized;
    }
}
439
- exports.MLNoiseSuppressor = MLNoiseSuppressor;
@@ -1,74 +0,0 @@
1
/**
 * Type declaration for the "Ultimate" ML noise suppressor.
 *
 * Only the public surface is described here; the implementation is not part
 * of this declaration file. According to the original package comments the
 * class features:
 *  1. temporal smoothing (exponential moving average),
 *  2. voice frequency preservation (80-500 Hz),
 *  3. sub-bass filtering (removal of content below 80 Hz),
 *  4. adaptive processing,
 *  5. WebAssembly acceleration.
 */
export declare class UltimateMLNoiseSuppressor {
    // --- Loaded assets and lifecycle state ---
    private model;
    private config;
    private normStats;
    private audioContext;
    private isInitialized;
    // --- Mask smoothing state ---
    private prevMask;
    private readonly SMOOTHING_ALPHA;
    // --- Filters ---
    private highPassFilter;
    private voiceBandFilter;
    // --- Processing queue state ---
    private processingQueue;
    private isProcessing;
    /**
     * Prepare the suppressor for use.
     * @param modelUrl Location of the model to load.
     * @param audioContext Web Audio context the suppressor works with.
     * @returns A promise that settles once initialization has finished.
     */
    initialize(modelUrl: string, audioContext: AudioContext): Promise<void>;
    /** Set up the filters used for voice frequency preservation. */
    private setupVoiceFilters;
    /**
     * Synchronously process one buffer of audio.
     *
     * The original comments state that this runs in the AudioWorklet thread and
     * therefore has to be synchronous and fast, that heavy ML inference should
     * ideally live in a Worker (communicating via SharedArrayBuffer), and that
     * this implementation uses a simplified frame-based approach.
     * @param inputBuffer Samples to process.
     * @returns The processed samples.
     */
    processAudio(inputBuffer: Float32Array): Float32Array;
    /**
     * Asynchronous processing entry point; described by the original comments
     * as a placeholder to be moved to a Web Worker.
     * @param inputBuffer Samples to process.
     */
    processFrameAsync(inputBuffer: Float32Array): Promise<void>;
    /** Temporal smoothing of the mask. */
    private applyTemporalSmoothing;
    /** High-pass filtering to remove rumble. */
    private applyHighPassFilter;
    /** Mask application with voice frequency preservation. */
    private applyMaskWithVoicePreservation;
    /** Mel-spectrogram feature extraction. */
    private extractMelFeatures;
    /** Mel bin computation (simplified, per the original comment). */
    private computeMelBin;
    /** Sequence assembly for the LSTM input. */
    private createSequences;
    /** Reset processing state; call this when switching audio streams. */
    reset(): void;
    /**
     * Report the processing latency.
     * NOTE(review): the unit is not visible from the declaration - confirm
     * against the implementation.
     */
    getLatency(): number;
}
74
- export default UltimateMLNoiseSuppressor;