@omote/core 0.3.1 → 0.3.25
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-T465MTDX.mjs → chunk-B6TIE56N.mjs} +63 -1153
- package/dist/chunk-B6TIE56N.mjs.map +1 -0
- package/dist/events/index.mjs +1 -1
- package/dist/index.d.mts +86 -45
- package/dist/index.d.ts +86 -45
- package/dist/index.js +313 -1428
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +241 -124
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/dist/{transformers.web-MHLR33H6.mjs → transformers.web-T5LWC34T.mjs} +3 -3
- package/package.json +2 -3
- package/dist/chunk-6W7G6WE7.mjs +0 -13
- package/dist/chunk-C3Y37HKD.mjs +0 -26378
- package/dist/chunk-C3Y37HKD.mjs.map +0 -1
- package/dist/chunk-RI6UQ7WF.mjs +0 -26378
- package/dist/chunk-RI6UQ7WF.mjs.map +0 -1
- package/dist/chunk-T465MTDX.mjs.map +0 -1
- package/dist/transformers.web-4C62MDO6.mjs +0 -1724
- package/dist/transformers.web-4C62MDO6.mjs.map +0 -1
- package/dist/transformers.web-ALDLCPHT.mjs +0 -1725
- package/dist/transformers.web-ALDLCPHT.mjs.map +0 -1
- package/dist/transformers.web-MHLR33H6.mjs.map +0 -1
- /package/dist/{chunk-6W7G6WE7.mjs.map → transformers.web-T5LWC34T.mjs.map} +0 -0
package/dist/index.mjs
CHANGED
|
@@ -15,8 +15,8 @@ import {
|
|
|
15
15
|
import {
|
|
16
16
|
__webpack_exports__env,
|
|
17
17
|
__webpack_exports__pipeline
|
|
18
|
-
} from "./chunk-
|
|
19
|
-
import "./chunk-
|
|
18
|
+
} from "./chunk-B6TIE56N.mjs";
|
|
19
|
+
import "./chunk-NSSMTXJJ.mjs";
|
|
20
20
|
|
|
21
21
|
// src/audio/MicrophoneCapture.ts
|
|
22
22
|
var MicrophoneCapture = class {
|
|
@@ -28,6 +28,8 @@ var MicrophoneCapture = class {
|
|
|
28
28
|
this.buffer = new Float32Array(0);
|
|
29
29
|
this._isRecording = false;
|
|
30
30
|
this._loggedFirstChunk = false;
|
|
31
|
+
/** Actual AudioContext sample rate (may differ from target on Firefox) */
|
|
32
|
+
this._nativeSampleRate = 0;
|
|
31
33
|
this.config = {
|
|
32
34
|
sampleRate: config.sampleRate ?? 16e3,
|
|
33
35
|
chunkSize: config.chunkSize ?? 1600
|
|
@@ -62,10 +64,29 @@ var MicrophoneCapture = class {
|
|
|
62
64
|
if (this.context.state === "suspended") {
|
|
63
65
|
await this.context.resume();
|
|
64
66
|
}
|
|
65
|
-
|
|
67
|
+
let source;
|
|
68
|
+
try {
|
|
69
|
+
source = this.context.createMediaStreamSource(this.stream);
|
|
70
|
+
this._nativeSampleRate = this.context.sampleRate;
|
|
71
|
+
} catch (sourceErr) {
|
|
72
|
+
console.warn(
|
|
73
|
+
"[MicrophoneCapture] Cannot connect stream at",
|
|
74
|
+
this.config.sampleRate + "Hz, falling back to native rate:",
|
|
75
|
+
sourceErr.message
|
|
76
|
+
);
|
|
77
|
+
await this.context.close();
|
|
78
|
+
this.context = new AudioContext();
|
|
79
|
+
if (this.context.state === "suspended") {
|
|
80
|
+
await this.context.resume();
|
|
81
|
+
}
|
|
82
|
+
source = this.context.createMediaStreamSource(this.stream);
|
|
83
|
+
this._nativeSampleRate = this.context.sampleRate;
|
|
84
|
+
console.log("[MicrophoneCapture] Using native rate:", this._nativeSampleRate, "Hz \u2192 resampling to", this.config.sampleRate, "Hz");
|
|
85
|
+
}
|
|
66
86
|
this.processor = this.context.createScriptProcessor(4096, 1, 1);
|
|
67
87
|
this.processor.onaudioprocess = (e) => {
|
|
68
|
-
const
|
|
88
|
+
const raw = e.inputBuffer.getChannelData(0);
|
|
89
|
+
const input = this._nativeSampleRate !== this.config.sampleRate ? this.resample(raw, this._nativeSampleRate, this.config.sampleRate) : raw;
|
|
69
90
|
let rms = 0;
|
|
70
91
|
let peak = 0;
|
|
71
92
|
for (let i = 0; i < input.length; i++) {
|
|
@@ -123,6 +144,25 @@ var MicrophoneCapture = class {
|
|
|
123
144
|
this.buffer = new Float32Array(0);
|
|
124
145
|
this._isRecording = false;
|
|
125
146
|
}
|
|
147
|
+
/**
|
|
148
|
+
* Resample audio using linear interpolation.
|
|
149
|
+
* Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
|
|
150
|
+
* and we need to downsample to the target rate (e.g. 16kHz).
|
|
151
|
+
*/
|
|
152
|
+
resample(input, fromRate, toRate) {
|
|
153
|
+
if (fromRate === toRate) return input;
|
|
154
|
+
const ratio = fromRate / toRate;
|
|
155
|
+
const outputLength = Math.floor(input.length / ratio);
|
|
156
|
+
const output = new Float32Array(outputLength);
|
|
157
|
+
for (let i = 0; i < outputLength; i++) {
|
|
158
|
+
const srcIdx = i * ratio;
|
|
159
|
+
const lo = Math.floor(srcIdx);
|
|
160
|
+
const hi = Math.min(lo + 1, input.length - 1);
|
|
161
|
+
const frac = srcIdx - lo;
|
|
162
|
+
output[i] = input[lo] * (1 - frac) + input[hi] * frac;
|
|
163
|
+
}
|
|
164
|
+
return output;
|
|
165
|
+
}
|
|
126
166
|
floatToPCM16(float32) {
|
|
127
167
|
const pcm = new Int16Array(float32.length);
|
|
128
168
|
for (let i = 0; i < float32.length; i++) {
|
|
@@ -263,7 +303,8 @@ var AudioScheduler = class {
|
|
|
263
303
|
const ctx = await this.ensureContext();
|
|
264
304
|
const channels = this.options.channels ?? 1;
|
|
265
305
|
if (!this.isPlaying) {
|
|
266
|
-
|
|
306
|
+
const lookahead = this.options.initialLookaheadSec ?? 0.05;
|
|
307
|
+
this.nextPlayTime = ctx.currentTime + lookahead;
|
|
267
308
|
this.isPlaying = true;
|
|
268
309
|
}
|
|
269
310
|
const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
|
|
@@ -446,8 +487,8 @@ var AudioChunkCoalescer = class {
|
|
|
446
487
|
var LAMPipeline = class {
|
|
447
488
|
constructor(options = {}) {
|
|
448
489
|
this.options = options;
|
|
449
|
-
this.
|
|
450
|
-
// 1.0s at 16kHz (
|
|
490
|
+
this.REQUIRED_SAMPLES = 16e3;
|
|
491
|
+
// 1.0s at 16kHz (LAM requirement)
|
|
451
492
|
this.FRAME_RATE = 30;
|
|
452
493
|
// LAM outputs 30fps
|
|
453
494
|
this.buffer = new Float32Array(0);
|
|
@@ -477,20 +518,22 @@ var LAMPipeline = class {
|
|
|
477
518
|
newBuffer.set(this.buffer, 0);
|
|
478
519
|
newBuffer.set(samples, this.buffer.length);
|
|
479
520
|
this.buffer = newBuffer;
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
521
|
+
while (this.buffer.length >= this.REQUIRED_SAMPLES) {
|
|
522
|
+
await this.processBuffer(lam);
|
|
523
|
+
if (this.buffer.length >= this.REQUIRED_SAMPLES) {
|
|
524
|
+
await new Promise((r) => setTimeout(r, 0));
|
|
525
|
+
}
|
|
483
526
|
}
|
|
484
527
|
}
|
|
485
528
|
/**
|
|
486
529
|
* Process accumulated buffer through LAM inference
|
|
487
530
|
*/
|
|
488
|
-
async processBuffer(lam
|
|
531
|
+
async processBuffer(lam) {
|
|
489
532
|
try {
|
|
490
|
-
const toProcess = this.buffer.slice(0,
|
|
533
|
+
const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
|
|
491
534
|
const processedStartTime = this.bufferStartTime;
|
|
492
|
-
this.buffer = this.buffer.slice(
|
|
493
|
-
const processedDuration =
|
|
535
|
+
this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
|
|
536
|
+
const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
|
|
494
537
|
this.bufferStartTime = processedStartTime + processedDuration;
|
|
495
538
|
const result = await lam.infer(toProcess);
|
|
496
539
|
const frameDuration = 1 / this.FRAME_RATE;
|
|
@@ -509,22 +552,35 @@ var LAMPipeline = class {
|
|
|
509
552
|
/**
|
|
510
553
|
* Get the frame that should be displayed at the current time
|
|
511
554
|
*
|
|
512
|
-
*
|
|
513
|
-
*
|
|
514
|
-
*
|
|
555
|
+
* Automatically removes frames that have already been displayed.
|
|
556
|
+
* This prevents memory leaks from accumulating old frames.
|
|
557
|
+
*
|
|
558
|
+
* Discard Window (prevents premature frame discarding):
|
|
559
|
+
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
560
|
+
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
515
561
|
*
|
|
516
|
-
*
|
|
517
|
-
*
|
|
518
|
-
* to natural 30fps pacing via timestamp gating.
|
|
562
|
+
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
563
|
+
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
519
564
|
*
|
|
520
565
|
* @param currentTime - Current AudioContext time
|
|
521
566
|
* @param lam - LAM inference engine (optional, for backend detection)
|
|
522
567
|
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
523
568
|
*/
|
|
524
569
|
getFrameForTime(currentTime, lam) {
|
|
525
|
-
const discardWindow = lam?.backend === "wasm" ?
|
|
570
|
+
const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
|
|
571
|
+
let discardedCount = 0;
|
|
526
572
|
while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
|
|
527
|
-
this.frameQueue.shift();
|
|
573
|
+
const discarded = this.frameQueue.shift();
|
|
574
|
+
discardedCount++;
|
|
575
|
+
if (discardedCount === 1) {
|
|
576
|
+
const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
|
|
577
|
+
console.warn("[LAM] Frame(s) discarded as too old", {
|
|
578
|
+
ageMs,
|
|
579
|
+
discardWindowMs: discardWindow * 1e3,
|
|
580
|
+
queueLength: this.frameQueue.length,
|
|
581
|
+
backend: lam?.backend ?? "unknown"
|
|
582
|
+
});
|
|
583
|
+
}
|
|
528
584
|
}
|
|
529
585
|
if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
|
|
530
586
|
const { frame } = this.frameQueue.shift();
|
|
@@ -543,7 +599,7 @@ var LAMPipeline = class {
|
|
|
543
599
|
* Get current buffer fill level (0-1)
|
|
544
600
|
*/
|
|
545
601
|
get fillLevel() {
|
|
546
|
-
return Math.min(1, this.buffer.length / this.
|
|
602
|
+
return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
|
|
547
603
|
}
|
|
548
604
|
/**
|
|
549
605
|
* Get number of frames queued
|
|
@@ -560,7 +616,7 @@ var LAMPipeline = class {
|
|
|
560
616
|
/**
|
|
561
617
|
* Flush remaining buffered audio
|
|
562
618
|
*
|
|
563
|
-
* Processes any remaining audio in the buffer, even if less than
|
|
619
|
+
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
564
620
|
* This ensures the final audio chunk generates blendshape frames.
|
|
565
621
|
*
|
|
566
622
|
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
@@ -571,17 +627,12 @@ var LAMPipeline = class {
|
|
|
571
627
|
if (this.buffer.length === 0) {
|
|
572
628
|
return;
|
|
573
629
|
}
|
|
630
|
+
const padded = new Float32Array(this.REQUIRED_SAMPLES);
|
|
631
|
+
padded.set(this.buffer, 0);
|
|
574
632
|
const processedStartTime = this.bufferStartTime;
|
|
575
|
-
const sampleRate = this.options.sampleRate ?? 16e3;
|
|
576
|
-
const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
|
|
577
|
-
const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
|
|
578
|
-
const padded = new Float32Array(minSize);
|
|
579
|
-
padded.set(this.buffer, 0);
|
|
580
|
-
return padded;
|
|
581
|
-
})();
|
|
582
633
|
try {
|
|
583
|
-
const result = await lam.infer(
|
|
584
|
-
const actualDuration = this.buffer.length / sampleRate;
|
|
634
|
+
const result = await lam.infer(padded);
|
|
635
|
+
const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
585
636
|
const frameDuration = 1 / this.FRAME_RATE;
|
|
586
637
|
const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
|
|
587
638
|
for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
|
|
@@ -640,13 +691,12 @@ var SyncedAudioPipeline = class extends EventEmitter {
|
|
|
640
691
|
this.monitorInterval = null;
|
|
641
692
|
this.frameAnimationId = null;
|
|
642
693
|
const sampleRate = options.sampleRate ?? 16e3;
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
|
|
694
|
+
const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
|
|
695
|
+
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
696
|
+
this.scheduler = new AudioScheduler({
|
|
697
|
+
sampleRate,
|
|
698
|
+
initialLookaheadSec: audioDelayMs / 1e3
|
|
699
|
+
});
|
|
650
700
|
this.coalescer = new AudioChunkCoalescer({
|
|
651
701
|
sampleRate,
|
|
652
702
|
targetDurationMs: options.chunkTargetMs ?? 200
|
|
@@ -2014,9 +2064,7 @@ function formatBytes(bytes) {
|
|
|
2014
2064
|
function isIOSSafari() {
|
|
2015
2065
|
if (typeof navigator === "undefined") return false;
|
|
2016
2066
|
const ua = navigator.userAgent.toLowerCase();
|
|
2017
|
-
return /iphone|ipad|ipod/.test(ua)
|
|
2018
|
-
// Only force WASM on actual iOS devices
|
|
2019
|
-
/safari/.test(ua) && /mobile/.test(ua) && !/chrome|crios|fxios/.test(ua);
|
|
2067
|
+
return /iphone|ipad|ipod/.test(ua) && /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
|
|
2020
2068
|
}
|
|
2021
2069
|
function isIOS() {
|
|
2022
2070
|
if (typeof navigator === "undefined") return false;
|
|
@@ -2074,10 +2122,7 @@ function getOptimalWasmThreads() {
|
|
|
2074
2122
|
return 4;
|
|
2075
2123
|
}
|
|
2076
2124
|
function shouldEnableWasmProxy() {
|
|
2077
|
-
|
|
2078
|
-
return false;
|
|
2079
|
-
}
|
|
2080
|
-
return true;
|
|
2125
|
+
return false;
|
|
2081
2126
|
}
|
|
2082
2127
|
function isSafari() {
|
|
2083
2128
|
if (typeof navigator === "undefined") return false;
|
|
@@ -2092,7 +2137,7 @@ function isSpeechRecognitionAvailable() {
|
|
|
2092
2137
|
return "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
|
|
2093
2138
|
}
|
|
2094
2139
|
function shouldUseNativeASR() {
|
|
2095
|
-
return isIOS() && isSpeechRecognitionAvailable();
|
|
2140
|
+
return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
|
|
2096
2141
|
}
|
|
2097
2142
|
function shouldUseServerLipSync() {
|
|
2098
2143
|
return isIOS();
|
|
@@ -2105,11 +2150,13 @@ var loadedBackend = null;
|
|
|
2105
2150
|
var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
2106
2151
|
async function isWebGPUAvailable() {
|
|
2107
2152
|
if (isIOS()) {
|
|
2108
|
-
logger.debug("WebGPU check: iOS
|
|
2153
|
+
logger.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
|
|
2109
2154
|
return false;
|
|
2110
2155
|
}
|
|
2111
2156
|
if (!hasWebGPUApi()) {
|
|
2112
|
-
logger.debug("WebGPU check: navigator.gpu not available"
|
|
2157
|
+
logger.debug("WebGPU check: navigator.gpu not available", {
|
|
2158
|
+
isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
|
|
2159
|
+
});
|
|
2113
2160
|
return false;
|
|
2114
2161
|
}
|
|
2115
2162
|
try {
|
|
@@ -2133,14 +2180,20 @@ async function isWebGPUAvailable() {
|
|
|
2133
2180
|
}
|
|
2134
2181
|
var iosWasmPatched = false;
|
|
2135
2182
|
function applyIOSWasmMemoryPatch() {
|
|
2136
|
-
if (iosWasmPatched || !
|
|
2183
|
+
if (iosWasmPatched || !isIOSSafari()) return;
|
|
2137
2184
|
iosWasmPatched = true;
|
|
2138
2185
|
const OrigMemory = WebAssembly.Memory;
|
|
2139
|
-
const MAX_IOS_PAGES =
|
|
2140
|
-
logger.info("Applying iOS WASM memory patch (max
|
|
2186
|
+
const MAX_IOS_PAGES = 32768;
|
|
2187
|
+
logger.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
|
|
2141
2188
|
WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
|
|
2142
2189
|
const patched = { ...descriptor };
|
|
2143
2190
|
if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
|
|
2191
|
+
logger.info("iOS memory patch: capping maximum", {
|
|
2192
|
+
original: patched.maximum,
|
|
2193
|
+
capped: MAX_IOS_PAGES,
|
|
2194
|
+
shared: patched.shared,
|
|
2195
|
+
initial: patched.initial
|
|
2196
|
+
});
|
|
2144
2197
|
patched.maximum = MAX_IOS_PAGES;
|
|
2145
2198
|
}
|
|
2146
2199
|
return new OrigMemory(patched);
|
|
@@ -2174,7 +2227,10 @@ async function getOnnxRuntime(backend) {
|
|
|
2174
2227
|
logger.info(`Loading ONNX Runtime with ${backend} backend...`);
|
|
2175
2228
|
applyIOSWasmMemoryPatch();
|
|
2176
2229
|
try {
|
|
2177
|
-
if (backend === "wasm") {
|
|
2230
|
+
if (backend === "wasm" && (isIOS() || isSafari())) {
|
|
2231
|
+
const module = await import("onnxruntime-web/wasm");
|
|
2232
|
+
ortInstance = module.default || module;
|
|
2233
|
+
} else if (backend === "wasm") {
|
|
2178
2234
|
const module = await import("onnxruntime-web");
|
|
2179
2235
|
ortInstance = module.default || module;
|
|
2180
2236
|
} else {
|
|
@@ -2249,6 +2305,16 @@ function getLoadedBackend() {
|
|
|
2249
2305
|
function isOnnxRuntimeLoaded() {
|
|
2250
2306
|
return ortInstance !== null;
|
|
2251
2307
|
}
|
|
2308
|
+
async function preloadOnnxRuntime(preference = "auto") {
|
|
2309
|
+
if (ortInstance) {
|
|
2310
|
+
logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
|
|
2311
|
+
return loadedBackend;
|
|
2312
|
+
}
|
|
2313
|
+
logger.info("Preloading ONNX Runtime...", { preference });
|
|
2314
|
+
const { backend } = await getOnnxRuntimeForPreference(preference);
|
|
2315
|
+
logger.info("ONNX Runtime preloaded", { backend });
|
|
2316
|
+
return backend;
|
|
2317
|
+
}
|
|
2252
2318
|
|
|
2253
2319
|
// src/inference/blendshapeUtils.ts
|
|
2254
2320
|
var LAM_BLENDSHAPES = [
|
|
@@ -2444,6 +2510,7 @@ var CTC_VOCAB = [
|
|
|
2444
2510
|
];
|
|
2445
2511
|
var Wav2Vec2Inference = class {
|
|
2446
2512
|
constructor(config) {
|
|
2513
|
+
this.modelId = "wav2vec2";
|
|
2447
2514
|
this.session = null;
|
|
2448
2515
|
this.ort = null;
|
|
2449
2516
|
this._backend = "wasm";
|
|
@@ -2504,13 +2571,52 @@ var Wav2Vec2Inference = class {
|
|
|
2504
2571
|
logger2.error(errorMsg, { modelUrl, isCached });
|
|
2505
2572
|
throw new Error(errorMsg);
|
|
2506
2573
|
}
|
|
2574
|
+
let externalDataBuffer = null;
|
|
2575
|
+
if (this.config.externalDataUrl !== false) {
|
|
2576
|
+
const dataUrl = typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`;
|
|
2577
|
+
try {
|
|
2578
|
+
const isDataCached = await cache.has(dataUrl);
|
|
2579
|
+
if (isDataCached) {
|
|
2580
|
+
logger2.debug("Loading external data from cache", { dataUrl });
|
|
2581
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
2582
|
+
if (!externalDataBuffer) {
|
|
2583
|
+
logger2.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
2584
|
+
await cache.delete(dataUrl);
|
|
2585
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2586
|
+
}
|
|
2587
|
+
} else {
|
|
2588
|
+
logger2.info("Fetching external model data", {
|
|
2589
|
+
dataUrl,
|
|
2590
|
+
note: "This may be a large download (383MB+)"
|
|
2591
|
+
});
|
|
2592
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2593
|
+
}
|
|
2594
|
+
logger2.info("External data loaded", {
|
|
2595
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
2596
|
+
});
|
|
2597
|
+
} catch (err) {
|
|
2598
|
+
logger2.debug("No external data file found (single-file model)", {
|
|
2599
|
+
dataUrl,
|
|
2600
|
+
error: err.message
|
|
2601
|
+
});
|
|
2602
|
+
}
|
|
2603
|
+
}
|
|
2507
2604
|
logger2.debug("Creating ONNX session", {
|
|
2508
|
-
|
|
2605
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2606
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2509
2607
|
backend: this._backend
|
|
2510
2608
|
});
|
|
2511
2609
|
const sessionOptions = getSessionOptions(this._backend);
|
|
2610
|
+
if (externalDataBuffer) {
|
|
2611
|
+
const dataFilename = (typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`).split("/").pop();
|
|
2612
|
+
sessionOptions.externalData = [{
|
|
2613
|
+
path: dataFilename,
|
|
2614
|
+
data: new Uint8Array(externalDataBuffer)
|
|
2615
|
+
}];
|
|
2616
|
+
}
|
|
2512
2617
|
logger2.info("Creating session with execution provider", {
|
|
2513
|
-
executionProvider: this._backend
|
|
2618
|
+
executionProvider: this._backend,
|
|
2619
|
+
hasExternalData: !!externalDataBuffer
|
|
2514
2620
|
});
|
|
2515
2621
|
const modelData = new Uint8Array(modelBuffer);
|
|
2516
2622
|
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
@@ -2756,7 +2862,7 @@ var WhisperInference = class _WhisperInference {
|
|
|
2756
2862
|
* Check if WebGPU is available in this browser
|
|
2757
2863
|
*/
|
|
2758
2864
|
static async isWebGPUAvailable() {
|
|
2759
|
-
return
|
|
2865
|
+
return "gpu" in navigator;
|
|
2760
2866
|
}
|
|
2761
2867
|
/**
|
|
2762
2868
|
* Load the Whisper model pipeline
|
|
@@ -3051,18 +3157,13 @@ var WhisperInference = class _WhisperInference {
|
|
|
3051
3157
|
var logger5 = createLogger("Wav2ArkitCpu");
|
|
3052
3158
|
var Wav2ArkitCpuInference = class {
|
|
3053
3159
|
constructor(config) {
|
|
3160
|
+
this.modelId = "wav2arkit_cpu";
|
|
3054
3161
|
this.session = null;
|
|
3055
3162
|
this.ort = null;
|
|
3056
3163
|
this._backend = "wasm";
|
|
3057
3164
|
this.isLoading = false;
|
|
3058
3165
|
// Inference queue for handling concurrent calls
|
|
3059
3166
|
this.inferenceQueue = Promise.resolve();
|
|
3060
|
-
/**
|
|
3061
|
-
* Preferred chunk size: 4000 samples (250ms at 16kHz).
|
|
3062
|
-
* wav2arkit_cpu accepts variable-length input, so we use smaller chunks
|
|
3063
|
-
* for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
|
|
3064
|
-
*/
|
|
3065
|
-
this.chunkSamples = 4e3;
|
|
3066
3167
|
this.config = config;
|
|
3067
3168
|
}
|
|
3068
3169
|
get backend() {
|
|
@@ -3096,23 +3197,25 @@ var Wav2ArkitCpuInference = class {
|
|
|
3096
3197
|
this._backend = backend;
|
|
3097
3198
|
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3098
3199
|
const modelUrl = this.config.modelUrl;
|
|
3099
|
-
const
|
|
3100
|
-
|
|
3101
|
-
if (isIOS()
|
|
3102
|
-
|
|
3103
|
-
sessionOptions.externalData = [{
|
|
3104
|
-
path: dataFilename,
|
|
3105
|
-
data: this.config.modelDataUrl
|
|
3106
|
-
}];
|
|
3107
|
-
logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
|
|
3200
|
+
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
3201
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
3202
|
+
if (isIOS()) {
|
|
3203
|
+
logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
3108
3204
|
modelUrl,
|
|
3109
|
-
|
|
3110
|
-
dataUrl: this.config.modelDataUrl
|
|
3205
|
+
dataUrl
|
|
3111
3206
|
});
|
|
3207
|
+
if (dataUrl) {
|
|
3208
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
3209
|
+
sessionOptions.externalData = [{
|
|
3210
|
+
path: dataFilename,
|
|
3211
|
+
data: dataUrl
|
|
3212
|
+
// URL string — ORT fetches directly into WASM
|
|
3213
|
+
}];
|
|
3214
|
+
}
|
|
3112
3215
|
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
3113
3216
|
} else {
|
|
3114
3217
|
const cache = getModelCache();
|
|
3115
|
-
isCached = await cache.has(modelUrl);
|
|
3218
|
+
const isCached = await cache.has(modelUrl);
|
|
3116
3219
|
let modelBuffer;
|
|
3117
3220
|
if (isCached) {
|
|
3118
3221
|
logger5.debug("Loading model from cache", { modelUrl });
|
|
@@ -3123,42 +3226,48 @@ var Wav2ArkitCpuInference = class {
|
|
|
3123
3226
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3124
3227
|
}
|
|
3125
3228
|
} else {
|
|
3126
|
-
logger5.debug("Fetching and caching model", { modelUrl });
|
|
3229
|
+
logger5.debug("Fetching and caching model graph", { modelUrl });
|
|
3127
3230
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3128
3231
|
}
|
|
3129
3232
|
if (!modelBuffer) {
|
|
3130
3233
|
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
3131
3234
|
}
|
|
3132
|
-
let externalDataBuffer;
|
|
3133
|
-
if (
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
3137
|
-
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
|
|
3235
|
+
let externalDataBuffer = null;
|
|
3236
|
+
if (dataUrl) {
|
|
3237
|
+
try {
|
|
3238
|
+
const isDataCached = await cache.has(dataUrl);
|
|
3239
|
+
if (isDataCached) {
|
|
3240
|
+
logger5.debug("Loading external data from cache", { dataUrl });
|
|
3241
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
3242
|
+
if (!externalDataBuffer) {
|
|
3243
|
+
logger5.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
3244
|
+
await cache.delete(dataUrl);
|
|
3245
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3246
|
+
}
|
|
3247
|
+
} else {
|
|
3248
|
+
logger5.info("Fetching external model data", {
|
|
3249
|
+
dataUrl,
|
|
3250
|
+
note: "This may be a large download (400MB+)"
|
|
3251
|
+
});
|
|
3142
3252
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3143
3253
|
}
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
|
|
3254
|
+
logger5.info("External data loaded", {
|
|
3255
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
3256
|
+
});
|
|
3257
|
+
} catch (err) {
|
|
3258
|
+
logger5.debug("No external data file found (single-file model)", {
|
|
3259
|
+
dataUrl,
|
|
3260
|
+
error: err.message
|
|
3147
3261
|
});
|
|
3148
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3149
3262
|
}
|
|
3150
|
-
logger5.debug("External data loaded", {
|
|
3151
|
-
size: formatBytes(externalDataBuffer.byteLength)
|
|
3152
|
-
});
|
|
3153
3263
|
}
|
|
3154
3264
|
logger5.debug("Creating ONNX session", {
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
|
|
3265
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
3266
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
3158
3267
|
backend: this._backend
|
|
3159
3268
|
});
|
|
3160
3269
|
if (externalDataBuffer) {
|
|
3161
|
-
const dataFilename =
|
|
3270
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
3162
3271
|
sessionOptions.externalData = [{
|
|
3163
3272
|
path: dataFilename,
|
|
3164
3273
|
data: new Uint8Array(externalDataBuffer)
|
|
@@ -3177,7 +3286,7 @@ var Wav2ArkitCpuInference = class {
|
|
|
3177
3286
|
span?.setAttributes({
|
|
3178
3287
|
"model.backend": this._backend,
|
|
3179
3288
|
"model.load_time_ms": loadTimeMs,
|
|
3180
|
-
"model.cached":
|
|
3289
|
+
"model.cached": !isIOS()
|
|
3181
3290
|
});
|
|
3182
3291
|
span?.end();
|
|
3183
3292
|
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
@@ -3258,11 +3367,11 @@ var Wav2ArkitCpuInference = class {
|
|
|
3258
3367
|
const blendshapes = [];
|
|
3259
3368
|
for (let f = 0; f < numFrames; f++) {
|
|
3260
3369
|
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
3261
|
-
const
|
|
3262
|
-
blendshapes.push(
|
|
3370
|
+
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
3371
|
+
blendshapes.push(symmetrized);
|
|
3263
3372
|
}
|
|
3264
3373
|
logger5.trace("Inference completed", {
|
|
3265
|
-
inferenceTimeMs: Math.round(inferenceTimeMs),
|
|
3374
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3266
3375
|
numFrames,
|
|
3267
3376
|
inputSamples
|
|
3268
3377
|
});
|
|
@@ -3328,14 +3437,14 @@ function createLipSync(config) {
|
|
|
3328
3437
|
});
|
|
3329
3438
|
}
|
|
3330
3439
|
if (useCpu) {
|
|
3331
|
-
logger6.info("Creating Wav2ArkitCpuInference (WASM)");
|
|
3440
|
+
logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
3332
3441
|
return new Wav2ArkitCpuInference({
|
|
3333
|
-
modelUrl: config.cpuModelUrl
|
|
3334
|
-
modelDataUrl: config.cpuModelDataUrl
|
|
3442
|
+
modelUrl: config.cpuModelUrl
|
|
3335
3443
|
});
|
|
3336
3444
|
}
|
|
3337
3445
|
const gpuInstance = new Wav2Vec2Inference({
|
|
3338
3446
|
modelUrl: config.gpuModelUrl,
|
|
3447
|
+
externalDataUrl: config.gpuExternalDataUrl,
|
|
3339
3448
|
backend: config.gpuBackend ?? "auto",
|
|
3340
3449
|
numIdentityClasses: config.numIdentityClasses
|
|
3341
3450
|
});
|
|
@@ -3352,15 +3461,15 @@ var LipSyncWithFallback = class {
|
|
|
3352
3461
|
this.implementation = gpuInstance;
|
|
3353
3462
|
this.config = config;
|
|
3354
3463
|
}
|
|
3464
|
+
get modelId() {
|
|
3465
|
+
return this.implementation.modelId;
|
|
3466
|
+
}
|
|
3355
3467
|
get backend() {
|
|
3356
3468
|
return this.implementation.backend;
|
|
3357
3469
|
}
|
|
3358
3470
|
get isLoaded() {
|
|
3359
3471
|
return this.implementation.isLoaded;
|
|
3360
3472
|
}
|
|
3361
|
-
get chunkSamples() {
|
|
3362
|
-
return this.implementation.chunkSamples;
|
|
3363
|
-
}
|
|
3364
3473
|
async load() {
|
|
3365
3474
|
try {
|
|
3366
3475
|
return await this.implementation.load();
|
|
@@ -3373,8 +3482,7 @@ var LipSyncWithFallback = class {
|
|
|
3373
3482
|
} catch {
|
|
3374
3483
|
}
|
|
3375
3484
|
this.implementation = new Wav2ArkitCpuInference({
|
|
3376
|
-
modelUrl: this.config.cpuModelUrl
|
|
3377
|
-
modelDataUrl: this.config.cpuModelDataUrl
|
|
3485
|
+
modelUrl: this.config.cpuModelUrl
|
|
3378
3486
|
});
|
|
3379
3487
|
this.hasFallenBack = true;
|
|
3380
3488
|
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
@@ -3404,6 +3512,8 @@ var SileroVADInference = class {
|
|
|
3404
3512
|
// Pre-speech buffer for capturing beginning of speech
|
|
3405
3513
|
this.preSpeechBuffer = [];
|
|
3406
3514
|
this.wasSpeaking = false;
|
|
3515
|
+
// Cached sample rate tensor (int64 scalar, never changes per instance)
|
|
3516
|
+
this.srTensor = null;
|
|
3407
3517
|
const sampleRate = config.sampleRate ?? 16e3;
|
|
3408
3518
|
if (sampleRate !== 8e3 && sampleRate !== 16e3) {
|
|
3409
3519
|
throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
|
|
@@ -3534,6 +3644,24 @@ var SileroVADInference = class {
|
|
|
3534
3644
|
this.context = new Float32Array(this.contextSize);
|
|
3535
3645
|
this.preSpeechBuffer = [];
|
|
3536
3646
|
this.wasSpeaking = false;
|
|
3647
|
+
if (!this.srTensor) {
|
|
3648
|
+
try {
|
|
3649
|
+
this.srTensor = new this.ort.Tensor(
|
|
3650
|
+
"int64",
|
|
3651
|
+
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
3652
|
+
[]
|
|
3653
|
+
);
|
|
3654
|
+
} catch (e) {
|
|
3655
|
+
logger7.warn("BigInt64Array not available, using bigint array fallback", {
|
|
3656
|
+
error: e instanceof Error ? e.message : String(e)
|
|
3657
|
+
});
|
|
3658
|
+
this.srTensor = new this.ort.Tensor(
|
|
3659
|
+
"int64",
|
|
3660
|
+
[BigInt(this.config.sampleRate)],
|
|
3661
|
+
[]
|
|
3662
|
+
);
|
|
3663
|
+
}
|
|
3664
|
+
}
|
|
3537
3665
|
}
|
|
3538
3666
|
/**
|
|
3539
3667
|
* Process a single audio chunk
|
|
@@ -3665,20 +3793,7 @@ var SileroVADInference = class {
|
|
|
3665
3793
|
inputBuffer.set(audioChunkCopy, this.contextSize);
|
|
3666
3794
|
const inputBufferCopy = new Float32Array(inputBuffer);
|
|
3667
3795
|
const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
|
|
3668
|
-
|
|
3669
|
-
try {
|
|
3670
|
-
srTensor = new this.ort.Tensor(
|
|
3671
|
-
"int64",
|
|
3672
|
-
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
3673
|
-
[]
|
|
3674
|
-
);
|
|
3675
|
-
} catch {
|
|
3676
|
-
srTensor = new this.ort.Tensor(
|
|
3677
|
-
"int64",
|
|
3678
|
-
[BigInt(this.config.sampleRate)],
|
|
3679
|
-
[]
|
|
3680
|
-
);
|
|
3681
|
-
}
|
|
3796
|
+
const srTensor = this.srTensor;
|
|
3682
3797
|
const stateCopy = new Float32Array(this.state.data);
|
|
3683
3798
|
const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
|
|
3684
3799
|
const feeds = {
|
|
@@ -3767,6 +3882,7 @@ var SileroVADInference = class {
|
|
|
3767
3882
|
this.session = null;
|
|
3768
3883
|
}
|
|
3769
3884
|
this.state = null;
|
|
3885
|
+
this.srTensor = null;
|
|
3770
3886
|
}
|
|
3771
3887
|
};
|
|
3772
3888
|
/**
|
|
@@ -6579,7 +6695,7 @@ async function nukeBrowserCaches(preventRecreation = false) {
|
|
|
6579
6695
|
totalDeleted: deletedCount
|
|
6580
6696
|
});
|
|
6581
6697
|
if (preventRecreation) {
|
|
6582
|
-
const { env } = await import("./transformers.web-
|
|
6698
|
+
const { env } = await import("./transformers.web-T5LWC34T.mjs");
|
|
6583
6699
|
env.useBrowserCache = false;
|
|
6584
6700
|
logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
|
|
6585
6701
|
}
|
|
@@ -7194,6 +7310,7 @@ export {
|
|
|
7194
7310
|
nukeBrowserCaches,
|
|
7195
7311
|
parseHuggingFaceUrl,
|
|
7196
7312
|
preloadModels,
|
|
7313
|
+
preloadOnnxRuntime,
|
|
7197
7314
|
remapWav2ArkitToLam,
|
|
7198
7315
|
resetLoggingConfig,
|
|
7199
7316
|
resolveBackend,
|