@omote/core 0.3.1 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/events/index.mjs +0 -1
- package/dist/index.d.mts +287 -304
- package/dist/index.d.ts +287 -304
- package/dist/index.js +883 -40000
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +847 -949
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +0 -1
- package/package.json +1 -3
- package/dist/chunk-6W7G6WE7.mjs +0 -13
- package/dist/chunk-6W7G6WE7.mjs.map +0 -1
- package/dist/chunk-C3Y37HKD.mjs +0 -26378
- package/dist/chunk-C3Y37HKD.mjs.map +0 -1
- package/dist/chunk-NSSMTXJJ.mjs +0 -8
- package/dist/chunk-NSSMTXJJ.mjs.map +0 -1
- package/dist/chunk-RI6UQ7WF.mjs +0 -26378
- package/dist/chunk-RI6UQ7WF.mjs.map +0 -1
- package/dist/chunk-T465MTDX.mjs +0 -38869
- package/dist/chunk-T465MTDX.mjs.map +0 -1
- package/dist/transformers.web-4C62MDO6.mjs +0 -1724
- package/dist/transformers.web-4C62MDO6.mjs.map +0 -1
- package/dist/transformers.web-ALDLCPHT.mjs +0 -1725
- package/dist/transformers.web-ALDLCPHT.mjs.map +0 -1
- package/dist/transformers.web-MHLR33H6.mjs +0 -1718
- package/dist/transformers.web-MHLR33H6.mjs.map +0 -1
package/dist/index.mjs
CHANGED
|
@@ -12,11 +12,6 @@ import {
|
|
|
12
12
|
setLogLevel,
|
|
13
13
|
setLoggingEnabled
|
|
14
14
|
} from "./chunk-ESU52TDS.mjs";
|
|
15
|
-
import {
|
|
16
|
-
__webpack_exports__env,
|
|
17
|
-
__webpack_exports__pipeline
|
|
18
|
-
} from "./chunk-T465MTDX.mjs";
|
|
19
|
-
import "./chunk-6W7G6WE7.mjs";
|
|
20
15
|
|
|
21
16
|
// src/audio/MicrophoneCapture.ts
|
|
22
17
|
var MicrophoneCapture = class {
|
|
@@ -28,6 +23,8 @@ var MicrophoneCapture = class {
|
|
|
28
23
|
this.buffer = new Float32Array(0);
|
|
29
24
|
this._isRecording = false;
|
|
30
25
|
this._loggedFirstChunk = false;
|
|
26
|
+
/** Actual AudioContext sample rate (may differ from target on Firefox) */
|
|
27
|
+
this._nativeSampleRate = 0;
|
|
31
28
|
this.config = {
|
|
32
29
|
sampleRate: config.sampleRate ?? 16e3,
|
|
33
30
|
chunkSize: config.chunkSize ?? 1600
|
|
@@ -62,10 +59,29 @@ var MicrophoneCapture = class {
|
|
|
62
59
|
if (this.context.state === "suspended") {
|
|
63
60
|
await this.context.resume();
|
|
64
61
|
}
|
|
65
|
-
|
|
62
|
+
let source;
|
|
63
|
+
try {
|
|
64
|
+
source = this.context.createMediaStreamSource(this.stream);
|
|
65
|
+
this._nativeSampleRate = this.context.sampleRate;
|
|
66
|
+
} catch (sourceErr) {
|
|
67
|
+
console.warn(
|
|
68
|
+
"[MicrophoneCapture] Cannot connect stream at",
|
|
69
|
+
this.config.sampleRate + "Hz, falling back to native rate:",
|
|
70
|
+
sourceErr.message
|
|
71
|
+
);
|
|
72
|
+
await this.context.close();
|
|
73
|
+
this.context = new AudioContext();
|
|
74
|
+
if (this.context.state === "suspended") {
|
|
75
|
+
await this.context.resume();
|
|
76
|
+
}
|
|
77
|
+
source = this.context.createMediaStreamSource(this.stream);
|
|
78
|
+
this._nativeSampleRate = this.context.sampleRate;
|
|
79
|
+
console.log("[MicrophoneCapture] Using native rate:", this._nativeSampleRate, "Hz \u2192 resampling to", this.config.sampleRate, "Hz");
|
|
80
|
+
}
|
|
66
81
|
this.processor = this.context.createScriptProcessor(4096, 1, 1);
|
|
67
82
|
this.processor.onaudioprocess = (e) => {
|
|
68
|
-
const
|
|
83
|
+
const raw = e.inputBuffer.getChannelData(0);
|
|
84
|
+
const input = this._nativeSampleRate !== this.config.sampleRate ? this.resample(raw, this._nativeSampleRate, this.config.sampleRate) : raw;
|
|
69
85
|
let rms = 0;
|
|
70
86
|
let peak = 0;
|
|
71
87
|
for (let i = 0; i < input.length; i++) {
|
|
@@ -123,6 +139,25 @@ var MicrophoneCapture = class {
|
|
|
123
139
|
this.buffer = new Float32Array(0);
|
|
124
140
|
this._isRecording = false;
|
|
125
141
|
}
|
|
142
|
+
/**
|
|
143
|
+
* Resample audio using linear interpolation.
|
|
144
|
+
* Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
|
|
145
|
+
* and we need to downsample to the target rate (e.g. 16kHz).
|
|
146
|
+
*/
|
|
147
|
+
resample(input, fromRate, toRate) {
|
|
148
|
+
if (fromRate === toRate) return input;
|
|
149
|
+
const ratio = fromRate / toRate;
|
|
150
|
+
const outputLength = Math.floor(input.length / ratio);
|
|
151
|
+
const output = new Float32Array(outputLength);
|
|
152
|
+
for (let i = 0; i < outputLength; i++) {
|
|
153
|
+
const srcIdx = i * ratio;
|
|
154
|
+
const lo = Math.floor(srcIdx);
|
|
155
|
+
const hi = Math.min(lo + 1, input.length - 1);
|
|
156
|
+
const frac = srcIdx - lo;
|
|
157
|
+
output[i] = input[lo] * (1 - frac) + input[hi] * frac;
|
|
158
|
+
}
|
|
159
|
+
return output;
|
|
160
|
+
}
|
|
126
161
|
floatToPCM16(float32) {
|
|
127
162
|
const pcm = new Int16Array(float32.length);
|
|
128
163
|
for (let i = 0; i < float32.length; i++) {
|
|
@@ -263,7 +298,8 @@ var AudioScheduler = class {
|
|
|
263
298
|
const ctx = await this.ensureContext();
|
|
264
299
|
const channels = this.options.channels ?? 1;
|
|
265
300
|
if (!this.isPlaying) {
|
|
266
|
-
|
|
301
|
+
const lookahead = this.options.initialLookaheadSec ?? 0.05;
|
|
302
|
+
this.nextPlayTime = ctx.currentTime + lookahead;
|
|
267
303
|
this.isPlaying = true;
|
|
268
304
|
}
|
|
269
305
|
const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
|
|
@@ -446,8 +482,8 @@ var AudioChunkCoalescer = class {
|
|
|
446
482
|
var LAMPipeline = class {
|
|
447
483
|
constructor(options = {}) {
|
|
448
484
|
this.options = options;
|
|
449
|
-
this.
|
|
450
|
-
// 1.0s at 16kHz (
|
|
485
|
+
this.REQUIRED_SAMPLES = 16e3;
|
|
486
|
+
// 1.0s at 16kHz (LAM requirement)
|
|
451
487
|
this.FRAME_RATE = 30;
|
|
452
488
|
// LAM outputs 30fps
|
|
453
489
|
this.buffer = new Float32Array(0);
|
|
@@ -477,20 +513,22 @@ var LAMPipeline = class {
|
|
|
477
513
|
newBuffer.set(this.buffer, 0);
|
|
478
514
|
newBuffer.set(samples, this.buffer.length);
|
|
479
515
|
this.buffer = newBuffer;
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
516
|
+
while (this.buffer.length >= this.REQUIRED_SAMPLES) {
|
|
517
|
+
await this.processBuffer(lam);
|
|
518
|
+
if (this.buffer.length >= this.REQUIRED_SAMPLES) {
|
|
519
|
+
await new Promise((r) => setTimeout(r, 0));
|
|
520
|
+
}
|
|
483
521
|
}
|
|
484
522
|
}
|
|
485
523
|
/**
|
|
486
524
|
* Process accumulated buffer through LAM inference
|
|
487
525
|
*/
|
|
488
|
-
async processBuffer(lam
|
|
526
|
+
async processBuffer(lam) {
|
|
489
527
|
try {
|
|
490
|
-
const toProcess = this.buffer.slice(0,
|
|
528
|
+
const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
|
|
491
529
|
const processedStartTime = this.bufferStartTime;
|
|
492
|
-
this.buffer = this.buffer.slice(
|
|
493
|
-
const processedDuration =
|
|
530
|
+
this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
|
|
531
|
+
const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
|
|
494
532
|
this.bufferStartTime = processedStartTime + processedDuration;
|
|
495
533
|
const result = await lam.infer(toProcess);
|
|
496
534
|
const frameDuration = 1 / this.FRAME_RATE;
|
|
@@ -509,22 +547,35 @@ var LAMPipeline = class {
|
|
|
509
547
|
/**
|
|
510
548
|
* Get the frame that should be displayed at the current time
|
|
511
549
|
*
|
|
512
|
-
*
|
|
513
|
-
*
|
|
514
|
-
* frames are ready by the time their corresponding audio plays.
|
|
550
|
+
* Automatically removes frames that have already been displayed.
|
|
551
|
+
* This prevents memory leaks from accumulating old frames.
|
|
515
552
|
*
|
|
516
|
-
* Discard
|
|
517
|
-
*
|
|
518
|
-
*
|
|
553
|
+
* Discard Window (prevents premature frame discarding):
|
|
554
|
+
* - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
|
|
555
|
+
* - WASM: 1.0s (LAM inference 50-500ms + higher variability)
|
|
556
|
+
*
|
|
557
|
+
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
558
|
+
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
519
559
|
*
|
|
520
560
|
* @param currentTime - Current AudioContext time
|
|
521
561
|
* @param lam - LAM inference engine (optional, for backend detection)
|
|
522
562
|
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
523
563
|
*/
|
|
524
564
|
getFrameForTime(currentTime, lam) {
|
|
525
|
-
const discardWindow = lam?.backend === "wasm" ?
|
|
565
|
+
const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
|
|
566
|
+
let discardedCount = 0;
|
|
526
567
|
while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
|
|
527
|
-
this.frameQueue.shift();
|
|
568
|
+
const discarded = this.frameQueue.shift();
|
|
569
|
+
discardedCount++;
|
|
570
|
+
if (discardedCount === 1) {
|
|
571
|
+
const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
|
|
572
|
+
console.warn("[LAM] Frame(s) discarded as too old", {
|
|
573
|
+
ageMs,
|
|
574
|
+
discardWindowMs: discardWindow * 1e3,
|
|
575
|
+
queueLength: this.frameQueue.length,
|
|
576
|
+
backend: lam?.backend ?? "unknown"
|
|
577
|
+
});
|
|
578
|
+
}
|
|
528
579
|
}
|
|
529
580
|
if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
|
|
530
581
|
const { frame } = this.frameQueue.shift();
|
|
@@ -543,7 +594,7 @@ var LAMPipeline = class {
|
|
|
543
594
|
* Get current buffer fill level (0-1)
|
|
544
595
|
*/
|
|
545
596
|
get fillLevel() {
|
|
546
|
-
return Math.min(1, this.buffer.length / this.
|
|
597
|
+
return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
|
|
547
598
|
}
|
|
548
599
|
/**
|
|
549
600
|
* Get number of frames queued
|
|
@@ -560,7 +611,7 @@ var LAMPipeline = class {
|
|
|
560
611
|
/**
|
|
561
612
|
* Flush remaining buffered audio
|
|
562
613
|
*
|
|
563
|
-
* Processes any remaining audio in the buffer, even if less than
|
|
614
|
+
* Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
|
|
564
615
|
* This ensures the final audio chunk generates blendshape frames.
|
|
565
616
|
*
|
|
566
617
|
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
@@ -571,17 +622,12 @@ var LAMPipeline = class {
|
|
|
571
622
|
if (this.buffer.length === 0) {
|
|
572
623
|
return;
|
|
573
624
|
}
|
|
625
|
+
const padded = new Float32Array(this.REQUIRED_SAMPLES);
|
|
626
|
+
padded.set(this.buffer, 0);
|
|
574
627
|
const processedStartTime = this.bufferStartTime;
|
|
575
|
-
const sampleRate = this.options.sampleRate ?? 16e3;
|
|
576
|
-
const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
|
|
577
|
-
const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
|
|
578
|
-
const padded = new Float32Array(minSize);
|
|
579
|
-
padded.set(this.buffer, 0);
|
|
580
|
-
return padded;
|
|
581
|
-
})();
|
|
582
628
|
try {
|
|
583
|
-
const result = await lam.infer(
|
|
584
|
-
const actualDuration = this.buffer.length / sampleRate;
|
|
629
|
+
const result = await lam.infer(padded);
|
|
630
|
+
const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
|
|
585
631
|
const frameDuration = 1 / this.FRAME_RATE;
|
|
586
632
|
const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
|
|
587
633
|
for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
|
|
@@ -640,13 +686,12 @@ var SyncedAudioPipeline = class extends EventEmitter {
|
|
|
640
686
|
this.monitorInterval = null;
|
|
641
687
|
this.frameAnimationId = null;
|
|
642
688
|
const sampleRate = options.sampleRate ?? 16e3;
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
|
|
689
|
+
const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
|
|
690
|
+
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
691
|
+
this.scheduler = new AudioScheduler({
|
|
692
|
+
sampleRate,
|
|
693
|
+
initialLookaheadSec: audioDelayMs / 1e3
|
|
694
|
+
});
|
|
650
695
|
this.coalescer = new AudioChunkCoalescer({
|
|
651
696
|
sampleRate,
|
|
652
697
|
targetDurationMs: options.chunkTargetMs ?? 200
|
|
@@ -2014,9 +2059,7 @@ function formatBytes(bytes) {
|
|
|
2014
2059
|
function isIOSSafari() {
|
|
2015
2060
|
if (typeof navigator === "undefined") return false;
|
|
2016
2061
|
const ua = navigator.userAgent.toLowerCase();
|
|
2017
|
-
return /iphone|ipad|ipod/.test(ua)
|
|
2018
|
-
// Only force WASM on actual iOS devices
|
|
2019
|
-
/safari/.test(ua) && /mobile/.test(ua) && !/chrome|crios|fxios/.test(ua);
|
|
2062
|
+
return /iphone|ipad|ipod/.test(ua) && /safari/.test(ua) && !/chrome|crios|fxios|chromium|edg/.test(ua);
|
|
2020
2063
|
}
|
|
2021
2064
|
function isIOS() {
|
|
2022
2065
|
if (typeof navigator === "undefined") return false;
|
|
@@ -2074,10 +2117,7 @@ function getOptimalWasmThreads() {
|
|
|
2074
2117
|
return 4;
|
|
2075
2118
|
}
|
|
2076
2119
|
function shouldEnableWasmProxy() {
|
|
2077
|
-
|
|
2078
|
-
return false;
|
|
2079
|
-
}
|
|
2080
|
-
return true;
|
|
2120
|
+
return false;
|
|
2081
2121
|
}
|
|
2082
2122
|
function isSafari() {
|
|
2083
2123
|
if (typeof navigator === "undefined") return false;
|
|
@@ -2092,7 +2132,7 @@ function isSpeechRecognitionAvailable() {
|
|
|
2092
2132
|
return "SpeechRecognition" in window || "webkitSpeechRecognition" in window;
|
|
2093
2133
|
}
|
|
2094
2134
|
function shouldUseNativeASR() {
|
|
2095
|
-
return isIOS() && isSpeechRecognitionAvailable();
|
|
2135
|
+
return (isIOS() || isSafari()) && isSpeechRecognitionAvailable();
|
|
2096
2136
|
}
|
|
2097
2137
|
function shouldUseServerLipSync() {
|
|
2098
2138
|
return isIOS();
|
|
@@ -2105,11 +2145,13 @@ var loadedBackend = null;
|
|
|
2105
2145
|
var WASM_CDN_PATH = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.23.2/dist/";
|
|
2106
2146
|
async function isWebGPUAvailable() {
|
|
2107
2147
|
if (isIOS()) {
|
|
2108
|
-
logger.debug("WebGPU check: iOS
|
|
2148
|
+
logger.debug("WebGPU check: disabled on iOS (asyncify bundle crashes WebKit)");
|
|
2109
2149
|
return false;
|
|
2110
2150
|
}
|
|
2111
2151
|
if (!hasWebGPUApi()) {
|
|
2112
|
-
logger.debug("WebGPU check: navigator.gpu not available"
|
|
2152
|
+
logger.debug("WebGPU check: navigator.gpu not available", {
|
|
2153
|
+
isSecureContext: typeof window !== "undefined" ? window.isSecureContext : "N/A"
|
|
2154
|
+
});
|
|
2113
2155
|
return false;
|
|
2114
2156
|
}
|
|
2115
2157
|
try {
|
|
@@ -2133,14 +2175,20 @@ async function isWebGPUAvailable() {
|
|
|
2133
2175
|
}
|
|
2134
2176
|
var iosWasmPatched = false;
|
|
2135
2177
|
function applyIOSWasmMemoryPatch() {
|
|
2136
|
-
if (iosWasmPatched || !
|
|
2178
|
+
if (iosWasmPatched || !isIOSSafari()) return;
|
|
2137
2179
|
iosWasmPatched = true;
|
|
2138
2180
|
const OrigMemory = WebAssembly.Memory;
|
|
2139
|
-
const MAX_IOS_PAGES =
|
|
2140
|
-
logger.info("Applying iOS WASM memory patch (max
|
|
2181
|
+
const MAX_IOS_PAGES = 32768;
|
|
2182
|
+
logger.info("Applying iOS WASM memory patch (max\u21922GB, shared preserved)");
|
|
2141
2183
|
WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
|
|
2142
2184
|
const patched = { ...descriptor };
|
|
2143
2185
|
if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
|
|
2186
|
+
logger.info("iOS memory patch: capping maximum", {
|
|
2187
|
+
original: patched.maximum,
|
|
2188
|
+
capped: MAX_IOS_PAGES,
|
|
2189
|
+
shared: patched.shared,
|
|
2190
|
+
initial: patched.initial
|
|
2191
|
+
});
|
|
2144
2192
|
patched.maximum = MAX_IOS_PAGES;
|
|
2145
2193
|
}
|
|
2146
2194
|
return new OrigMemory(patched);
|
|
@@ -2174,7 +2222,10 @@ async function getOnnxRuntime(backend) {
|
|
|
2174
2222
|
logger.info(`Loading ONNX Runtime with ${backend} backend...`);
|
|
2175
2223
|
applyIOSWasmMemoryPatch();
|
|
2176
2224
|
try {
|
|
2177
|
-
if (backend === "wasm") {
|
|
2225
|
+
if (backend === "wasm" && (isIOS() || isSafari())) {
|
|
2226
|
+
const module = await import("onnxruntime-web/wasm");
|
|
2227
|
+
ortInstance = module.default || module;
|
|
2228
|
+
} else if (backend === "wasm") {
|
|
2178
2229
|
const module = await import("onnxruntime-web");
|
|
2179
2230
|
ortInstance = module.default || module;
|
|
2180
2231
|
} else {
|
|
@@ -2218,6 +2269,14 @@ function getSessionOptions(backend) {
|
|
|
2218
2269
|
graphOptimizationLevel: "all"
|
|
2219
2270
|
};
|
|
2220
2271
|
}
|
|
2272
|
+
if (isIOS()) {
|
|
2273
|
+
return {
|
|
2274
|
+
executionProviders: ["wasm"],
|
|
2275
|
+
graphOptimizationLevel: "basic",
|
|
2276
|
+
enableCpuMemArena: false,
|
|
2277
|
+
enableMemPattern: false
|
|
2278
|
+
};
|
|
2279
|
+
}
|
|
2221
2280
|
return {
|
|
2222
2281
|
executionProviders: ["wasm"],
|
|
2223
2282
|
graphOptimizationLevel: "all"
|
|
@@ -2249,6 +2308,16 @@ function getLoadedBackend() {
|
|
|
2249
2308
|
function isOnnxRuntimeLoaded() {
|
|
2250
2309
|
return ortInstance !== null;
|
|
2251
2310
|
}
|
|
2311
|
+
async function preloadOnnxRuntime(preference = "auto") {
|
|
2312
|
+
if (ortInstance) {
|
|
2313
|
+
logger.info("ONNX Runtime already preloaded", { backend: loadedBackend });
|
|
2314
|
+
return loadedBackend;
|
|
2315
|
+
}
|
|
2316
|
+
logger.info("Preloading ONNX Runtime...", { preference });
|
|
2317
|
+
const { backend } = await getOnnxRuntimeForPreference(preference);
|
|
2318
|
+
logger.info("ONNX Runtime preloaded", { backend });
|
|
2319
|
+
return backend;
|
|
2320
|
+
}
|
|
2252
2321
|
|
|
2253
2322
|
// src/inference/blendshapeUtils.ts
|
|
2254
2323
|
var LAM_BLENDSHAPES = [
|
|
@@ -2444,6 +2513,7 @@ var CTC_VOCAB = [
|
|
|
2444
2513
|
];
|
|
2445
2514
|
var Wav2Vec2Inference = class {
|
|
2446
2515
|
constructor(config) {
|
|
2516
|
+
this.modelId = "wav2vec2";
|
|
2447
2517
|
this.session = null;
|
|
2448
2518
|
this.ort = null;
|
|
2449
2519
|
this._backend = "wasm";
|
|
@@ -2482,38 +2552,108 @@ var Wav2Vec2Inference = class {
|
|
|
2482
2552
|
this.ort = ort;
|
|
2483
2553
|
this._backend = backend;
|
|
2484
2554
|
logger2.info("ONNX Runtime loaded", { backend: this._backend });
|
|
2485
|
-
const cache = getModelCache();
|
|
2486
2555
|
const modelUrl = this.config.modelUrl;
|
|
2487
|
-
const
|
|
2488
|
-
|
|
2489
|
-
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2493
|
-
|
|
2494
|
-
|
|
2495
|
-
|
|
2496
|
-
|
|
2556
|
+
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
2557
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
2558
|
+
let isCached = false;
|
|
2559
|
+
if (isIOS()) {
|
|
2560
|
+
logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
2561
|
+
modelUrl,
|
|
2562
|
+
dataUrl
|
|
2563
|
+
});
|
|
2564
|
+
if (dataUrl) {
|
|
2565
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
2566
|
+
logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
|
|
2567
|
+
sessionOptions.externalData = [{
|
|
2568
|
+
path: dataFilename,
|
|
2569
|
+
data: dataUrl
|
|
2570
|
+
// URL string — ORT fetches directly into WASM
|
|
2571
|
+
}];
|
|
2497
2572
|
}
|
|
2573
|
+
logger2.info("iOS: calling InferenceSession.create() with URL string", {
|
|
2574
|
+
modelUrl,
|
|
2575
|
+
sessionOptions: JSON.stringify(
|
|
2576
|
+
sessionOptions,
|
|
2577
|
+
(_, v) => typeof v === "string" && v.length > 100 ? v.slice(0, 100) + "..." : v
|
|
2578
|
+
)
|
|
2579
|
+
});
|
|
2580
|
+
try {
|
|
2581
|
+
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
2582
|
+
} catch (sessionErr) {
|
|
2583
|
+
logger2.error("iOS: InferenceSession.create() failed", {
|
|
2584
|
+
error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
|
|
2585
|
+
errorType: sessionErr?.constructor?.name,
|
|
2586
|
+
stack: sessionErr instanceof Error ? sessionErr.stack : void 0
|
|
2587
|
+
});
|
|
2588
|
+
throw sessionErr;
|
|
2589
|
+
}
|
|
2590
|
+
logger2.info("iOS: session created successfully", {
|
|
2591
|
+
inputNames: this.session.inputNames,
|
|
2592
|
+
outputNames: this.session.outputNames
|
|
2593
|
+
});
|
|
2498
2594
|
} else {
|
|
2499
|
-
|
|
2500
|
-
|
|
2501
|
-
|
|
2502
|
-
|
|
2503
|
-
|
|
2504
|
-
|
|
2505
|
-
|
|
2595
|
+
const cache = getModelCache();
|
|
2596
|
+
isCached = await cache.has(modelUrl);
|
|
2597
|
+
let modelBuffer;
|
|
2598
|
+
if (isCached) {
|
|
2599
|
+
logger2.debug("Loading model from cache", { modelUrl });
|
|
2600
|
+
modelBuffer = await cache.get(modelUrl);
|
|
2601
|
+
if (!modelBuffer) {
|
|
2602
|
+
logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
2603
|
+
await cache.delete(modelUrl);
|
|
2604
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
2605
|
+
}
|
|
2606
|
+
} else {
|
|
2607
|
+
logger2.debug("Fetching and caching model", { modelUrl });
|
|
2608
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
2609
|
+
}
|
|
2610
|
+
if (!modelBuffer) {
|
|
2611
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
2612
|
+
}
|
|
2613
|
+
let externalDataBuffer = null;
|
|
2614
|
+
if (dataUrl) {
|
|
2615
|
+
try {
|
|
2616
|
+
const isDataCached = await cache.has(dataUrl);
|
|
2617
|
+
if (isDataCached) {
|
|
2618
|
+
logger2.debug("Loading external data from cache", { dataUrl });
|
|
2619
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
2620
|
+
if (!externalDataBuffer) {
|
|
2621
|
+
logger2.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
2622
|
+
await cache.delete(dataUrl);
|
|
2623
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2624
|
+
}
|
|
2625
|
+
} else {
|
|
2626
|
+
logger2.info("Fetching external model data", {
|
|
2627
|
+
dataUrl,
|
|
2628
|
+
note: "This may be a large download (383MB+)"
|
|
2629
|
+
});
|
|
2630
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2631
|
+
}
|
|
2632
|
+
logger2.info("External data loaded", {
|
|
2633
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
2634
|
+
});
|
|
2635
|
+
} catch (err) {
|
|
2636
|
+
logger2.debug("No external data file found (single-file model)", {
|
|
2637
|
+
dataUrl,
|
|
2638
|
+
error: err.message
|
|
2639
|
+
});
|
|
2640
|
+
}
|
|
2641
|
+
}
|
|
2642
|
+
logger2.debug("Creating ONNX session", {
|
|
2643
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2644
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2645
|
+
backend: this._backend
|
|
2646
|
+
});
|
|
2647
|
+
if (externalDataBuffer) {
|
|
2648
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
2649
|
+
sessionOptions.externalData = [{
|
|
2650
|
+
path: dataFilename,
|
|
2651
|
+
data: new Uint8Array(externalDataBuffer)
|
|
2652
|
+
}];
|
|
2653
|
+
}
|
|
2654
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
2655
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2506
2656
|
}
|
|
2507
|
-
logger2.debug("Creating ONNX session", {
|
|
2508
|
-
size: formatBytes(modelBuffer.byteLength),
|
|
2509
|
-
backend: this._backend
|
|
2510
|
-
});
|
|
2511
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
2512
|
-
logger2.info("Creating session with execution provider", {
|
|
2513
|
-
executionProvider: this._backend
|
|
2514
|
-
});
|
|
2515
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
2516
|
-
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2517
2657
|
logger2.info("ONNX session created successfully", {
|
|
2518
2658
|
executionProvider: this._backend,
|
|
2519
2659
|
backend: this._backend
|
|
@@ -2528,7 +2668,7 @@ var Wav2Vec2Inference = class {
|
|
|
2528
2668
|
span?.setAttributes({
|
|
2529
2669
|
"model.backend": this._backend,
|
|
2530
2670
|
"model.load_time_ms": loadTimeMs,
|
|
2531
|
-
"model.cached": isCached
|
|
2671
|
+
"model.cached": !isIOS() && isCached
|
|
2532
2672
|
});
|
|
2533
2673
|
span?.end();
|
|
2534
2674
|
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
@@ -2731,319 +2871,550 @@ LAM_BLENDSHAPES.forEach((name, index) => {
|
|
|
2731
2871
|
});
|
|
2732
2872
|
var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
|
|
2733
2873
|
|
|
2734
|
-
// src/inference/
|
|
2735
|
-
|
|
2736
|
-
|
|
2737
|
-
|
|
2738
|
-
|
|
2739
|
-
|
|
2874
|
+
// src/inference/kaldiFbank.ts
|
|
2875
|
+
function fft(re, im) {
|
|
2876
|
+
const n = re.length;
|
|
2877
|
+
for (let i = 1, j = 0; i < n; i++) {
|
|
2878
|
+
let bit = n >> 1;
|
|
2879
|
+
while (j & bit) {
|
|
2880
|
+
j ^= bit;
|
|
2881
|
+
bit >>= 1;
|
|
2882
|
+
}
|
|
2883
|
+
j ^= bit;
|
|
2884
|
+
if (i < j) {
|
|
2885
|
+
let tmp = re[i];
|
|
2886
|
+
re[i] = re[j];
|
|
2887
|
+
re[j] = tmp;
|
|
2888
|
+
tmp = im[i];
|
|
2889
|
+
im[i] = im[j];
|
|
2890
|
+
im[j] = tmp;
|
|
2891
|
+
}
|
|
2892
|
+
}
|
|
2893
|
+
for (let len = 2; len <= n; len *= 2) {
|
|
2894
|
+
const halfLen = len / 2;
|
|
2895
|
+
const angle = -2 * Math.PI / len;
|
|
2896
|
+
const wRe = Math.cos(angle);
|
|
2897
|
+
const wIm = Math.sin(angle);
|
|
2898
|
+
for (let i = 0; i < n; i += len) {
|
|
2899
|
+
let curRe = 1;
|
|
2900
|
+
let curIm = 0;
|
|
2901
|
+
for (let j = 0; j < halfLen; j++) {
|
|
2902
|
+
const a = i + j;
|
|
2903
|
+
const b = a + halfLen;
|
|
2904
|
+
const tRe = curRe * re[b] - curIm * im[b];
|
|
2905
|
+
const tIm = curRe * im[b] + curIm * re[b];
|
|
2906
|
+
re[b] = re[a] - tRe;
|
|
2907
|
+
im[b] = im[a] - tIm;
|
|
2908
|
+
re[a] += tRe;
|
|
2909
|
+
im[a] += tIm;
|
|
2910
|
+
const nextRe = curRe * wRe - curIm * wIm;
|
|
2911
|
+
curIm = curRe * wIm + curIm * wRe;
|
|
2912
|
+
curRe = nextRe;
|
|
2913
|
+
}
|
|
2914
|
+
}
|
|
2915
|
+
}
|
|
2916
|
+
}
|
|
2917
|
+
function htkMel(freq) {
|
|
2918
|
+
return 1127 * Math.log(1 + freq / 700);
|
|
2919
|
+
}
|
|
2920
|
+
function htkMelInverse(mel) {
|
|
2921
|
+
return 700 * (Math.exp(mel / 1127) - 1);
|
|
2922
|
+
}
|
|
2923
|
+
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
|
|
2924
|
+
const numFftBins = fftSize / 2 + 1;
|
|
2925
|
+
const lowMel = htkMel(lowFreq);
|
|
2926
|
+
const highMel = htkMel(highFreq);
|
|
2927
|
+
const melPoints = new Float64Array(numBins + 2);
|
|
2928
|
+
for (let i = 0; i < numBins + 2; i++) {
|
|
2929
|
+
melPoints[i] = lowMel + (highMel - lowMel) * i / (numBins + 1);
|
|
2930
|
+
}
|
|
2931
|
+
const binFreqs = new Float64Array(numBins + 2);
|
|
2932
|
+
for (let i = 0; i < numBins + 2; i++) {
|
|
2933
|
+
binFreqs[i] = htkMelInverse(melPoints[i]) * fftSize / sampleRate;
|
|
2934
|
+
}
|
|
2935
|
+
const filters = [];
|
|
2936
|
+
for (let m = 0; m < numBins; m++) {
|
|
2937
|
+
const left = binFreqs[m];
|
|
2938
|
+
const center = binFreqs[m + 1];
|
|
2939
|
+
const right = binFreqs[m + 2];
|
|
2940
|
+
const startBin = Math.max(0, Math.ceil(left));
|
|
2941
|
+
const endBin = Math.min(numFftBins - 1, Math.floor(right));
|
|
2942
|
+
const weights = new Float32Array(endBin - startBin + 1);
|
|
2943
|
+
for (let k = startBin; k <= endBin; k++) {
|
|
2944
|
+
if (k <= center) {
|
|
2945
|
+
weights[k - startBin] = center - left > 0 ? (k - left) / (center - left) : 0;
|
|
2946
|
+
} else {
|
|
2947
|
+
weights[k - startBin] = right - center > 0 ? (right - k) / (right - center) : 0;
|
|
2948
|
+
}
|
|
2949
|
+
}
|
|
2950
|
+
filters.push({ startBin, weights });
|
|
2951
|
+
}
|
|
2952
|
+
return filters;
|
|
2953
|
+
}
|
|
2954
|
+
function createHammingWindow(length) {
|
|
2955
|
+
const window2 = new Float32Array(length);
|
|
2956
|
+
for (let i = 0; i < length; i++) {
|
|
2957
|
+
window2[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
|
|
2958
|
+
}
|
|
2959
|
+
return window2;
|
|
2960
|
+
}
|
|
2961
|
+
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
|
|
2962
|
+
const frameLengthMs = opts?.frameLengthMs ?? 25;
|
|
2963
|
+
const frameShiftMs = opts?.frameShiftMs ?? 10;
|
|
2964
|
+
const lowFreq = opts?.lowFreq ?? 20;
|
|
2965
|
+
const highFreq = opts?.highFreq ?? sampleRate / 2;
|
|
2966
|
+
const dither = opts?.dither ?? 0;
|
|
2967
|
+
const preemphasis = opts?.preemphasis ?? 0.97;
|
|
2968
|
+
const frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1e3);
|
|
2969
|
+
const frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1e3);
|
|
2970
|
+
const scaled = new Float32Array(audio.length);
|
|
2971
|
+
for (let i = 0; i < audio.length; i++) {
|
|
2972
|
+
scaled[i] = audio[i] * 32768;
|
|
2973
|
+
}
|
|
2974
|
+
if (dither > 0) {
|
|
2975
|
+
for (let i = 0; i < scaled.length; i++) {
|
|
2976
|
+
const u1 = Math.random();
|
|
2977
|
+
const u2 = Math.random();
|
|
2978
|
+
scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
|
|
2979
|
+
}
|
|
2980
|
+
}
|
|
2981
|
+
const numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
|
|
2982
|
+
if (numFrames === 0) {
|
|
2983
|
+
return new Float32Array(0);
|
|
2984
|
+
}
|
|
2985
|
+
let fftSize = 1;
|
|
2986
|
+
while (fftSize < frameLengthSamples) fftSize *= 2;
|
|
2987
|
+
const numFftBins = fftSize / 2 + 1;
|
|
2988
|
+
const window2 = createHammingWindow(frameLengthSamples);
|
|
2989
|
+
const filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
|
|
2990
|
+
const output = new Float32Array(numFrames * numMelBins);
|
|
2991
|
+
const fftRe = new Float64Array(fftSize);
|
|
2992
|
+
const fftIm = new Float64Array(fftSize);
|
|
2993
|
+
for (let f = 0; f < numFrames; f++) {
|
|
2994
|
+
const offset = f * frameShiftSamples;
|
|
2995
|
+
fftRe.fill(0);
|
|
2996
|
+
fftIm.fill(0);
|
|
2997
|
+
for (let i = 0; i < frameLengthSamples; i++) {
|
|
2998
|
+
let sample = scaled[offset + i];
|
|
2999
|
+
if (preemphasis > 0 && i > 0) {
|
|
3000
|
+
sample -= preemphasis * scaled[offset + i - 1];
|
|
3001
|
+
} else if (preemphasis > 0 && i === 0 && offset > 0) {
|
|
3002
|
+
sample -= preemphasis * scaled[offset - 1];
|
|
3003
|
+
}
|
|
3004
|
+
fftRe[i] = sample * window2[i];
|
|
3005
|
+
}
|
|
3006
|
+
fft(fftRe, fftIm);
|
|
3007
|
+
const outOffset = f * numMelBins;
|
|
3008
|
+
for (let m = 0; m < numMelBins; m++) {
|
|
3009
|
+
const filter = filters[m];
|
|
3010
|
+
let energy = 0;
|
|
3011
|
+
for (let k = 0; k < filter.weights.length; k++) {
|
|
3012
|
+
const bin = filter.startBin + k;
|
|
3013
|
+
if (bin < numFftBins) {
|
|
3014
|
+
const powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
|
|
3015
|
+
energy += filter.weights[k] * powerSpec;
|
|
3016
|
+
}
|
|
3017
|
+
}
|
|
3018
|
+
output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
|
|
3019
|
+
}
|
|
3020
|
+
}
|
|
3021
|
+
return output;
|
|
3022
|
+
}
|
|
3023
|
+
function applyLFR(features, featureDim, lfrM = 7, lfrN = 6) {
|
|
3024
|
+
const numFrames = features.length / featureDim;
|
|
3025
|
+
if (numFrames === 0) return new Float32Array(0);
|
|
3026
|
+
const leftPad = Math.floor((lfrM - 1) / 2);
|
|
3027
|
+
const paddedLen = numFrames + leftPad;
|
|
3028
|
+
const numOutputFrames = Math.ceil(paddedLen / lfrN);
|
|
3029
|
+
const outputDim = featureDim * lfrM;
|
|
3030
|
+
const output = new Float32Array(numOutputFrames * outputDim);
|
|
3031
|
+
for (let i = 0; i < numOutputFrames; i++) {
|
|
3032
|
+
const startFrame = i * lfrN - leftPad;
|
|
3033
|
+
for (let j = 0; j < lfrM; j++) {
|
|
3034
|
+
let srcFrame = startFrame + j;
|
|
3035
|
+
if (srcFrame < 0) srcFrame = 0;
|
|
3036
|
+
if (srcFrame >= numFrames) srcFrame = numFrames - 1;
|
|
3037
|
+
const srcOffset = srcFrame * featureDim;
|
|
3038
|
+
const dstOffset = i * outputDim + j * featureDim;
|
|
3039
|
+
for (let k = 0; k < featureDim; k++) {
|
|
3040
|
+
output[dstOffset + k] = features[srcOffset + k];
|
|
3041
|
+
}
|
|
3042
|
+
}
|
|
3043
|
+
}
|
|
3044
|
+
return output;
|
|
3045
|
+
}
|
|
3046
|
+
/**
 * Apply cepstral mean and variance normalization in place:
 * features[i] = (features[i] + negMean[d]) * invStddev[d], where d is the
 * position of element i within its frame of width `dim`.
 *
 * @param features  Flat frame-major Float32Array; mutated in place
 * @param dim       Frame width (elements per frame)
 * @param negMean   Negated per-dimension mean, length `dim`
 * @param invStddev Per-dimension inverse standard deviation, length `dim`
 * @returns The same `features` array, for chaining
 */
function applyCMVN(features, dim, negMean, invStddev) {
  for (let offset = 0; offset < features.length; offset += dim) {
    for (let d = 0; d < dim; d++) {
      features[offset + d] = (features[offset + d] + negMean[d]) * invStddev[d];
    }
  }
  return features;
}
|
|
3053
|
+
/**
 * Parse CMVN statistics from two comma-separated strings (as stored in
 * model metadata) into Float32Arrays.
 *
 * @param negMeanStr   Comma-separated negated means, e.g. "-8.3, -8.1, ..."
 * @param invStddevStr Comma-separated inverse stddevs
 * @returns { negMean, invStddev } as Float32Arrays
 */
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
  const toFloat32 = (csv) => Float32Array.from(csv.split(","), (piece) => parseFloat(piece.trim()));
  return {
    negMean: toFloat32(negMeanStr),
    invStddev: toFloat32(invStddevStr)
  };
}
|
|
3062
|
+
|
|
3063
|
+
// src/inference/ctcDecoder.ts
|
|
3064
|
+
/**
 * Map a language code to the SenseVoice model's numeric language id.
 *
 * Uses a Map rather than a plain-object lookup so inherited
 * Object.prototype members (e.g. "constructor", "toString") can never
 * leak through as lookup results; unknown codes fall back to 0 ("auto").
 *
 * @param language Language code: "auto" | "zh" | "en" | "yue" | "ja" | "ko"
 * @returns Numeric language id; 0 (auto) for any unrecognized input
 */
function resolveLanguageId(language) {
  const languageIds = new Map([
    ["auto", 0],
    ["zh", 3],
    ["en", 4],
    ["yue", 7],
    ["ja", 11],
    ["ko", 12]
  ]);
  return languageIds.get(language) ?? 0;
}
|
|
3075
|
+
/**
 * Map a text-normalization mode to the model's numeric id.
 *
 * @param textNorm "without_itn" disables inverse text normalization (id 15);
 *                 any other value selects ITN (id 14).
 * @returns 15 for "without_itn", otherwise 14
 */
function resolveTextNormId(textNorm) {
  if (textNorm === "without_itn") {
    return 15;
  }
  return 14;
}
|
|
3078
|
+
/**
 * Parse a tokens.txt vocabulary file ("<token> <id>" per line) into an
 * id -> token Map.
 *
 * Tokens may contain spaces; the numeric id is whatever follows the LAST
 * space on the line. Blank lines, lines with no space, and lines whose id
 * fails to parse are skipped silently.
 *
 * @param content Full text of tokens.txt
 * @returns Map from numeric token id to token string
 */
function parseTokensFile(content) {
  const idToToken = /* @__PURE__ */ new Map();
  for (const rawLine of content.split("\n")) {
    const entry = rawLine.trim();
    if (entry === "") continue;
    const sep = entry.lastIndexOf(" ");
    if (sep < 0) continue;
    const parsedId = Number.parseInt(entry.substring(sep + 1), 10);
    if (Number.isNaN(parsedId)) continue;
    idToToken.set(parsedId, entry.substring(0, sep));
  }
  return idToToken;
}
|
|
3094
|
+
/**
 * Classify a SenseVoice structured token of the form "<|value|>".
 *
 * Categories (checked in order): language, emotion, event, textnorm.
 * Returns null for plain text tokens and for unrecognized "<|...|>" values.
 *
 * @param token Raw token string from the vocabulary
 * @returns { type, value } descriptor, or null if not a structured token
 */
function parseStructuredToken(token) {
  const match = token.match(/^<\|(.+)\|>$/);
  if (match === null) return null;
  const inner = match[1];
  const languages = new Set(["zh", "en", "ja", "ko", "yue", "nospeech"]);
  if (languages.has(inner)) {
    return { type: "language", value: inner };
  }
  const emotions = new Set(["HAPPY", "SAD", "ANGRY", "NEUTRAL", "FEARFUL", "DISGUSTED", "SURPRISED", "EMO_UNKNOWN"]);
  if (emotions.has(inner)) {
    return { type: "emotion", value: inner };
  }
  const events = new Set(["Speech", "BGM", "Applause", "Laughter", "Crying", "Coughing", "Sneezing", "EVENT_UNKNOWN"]);
  if (events.has(inner)) {
    return { type: "event", value: inner };
  }
  const textNorms = new Set(["withitn", "woitn", "with_itn", "without_itn"]);
  if (textNorms.has(inner)) {
    return { type: "textnorm", value: inner };
  }
  return null;
}
|
|
3114
|
+
/**
 * Greedy CTC decode of SenseVoice output logits.
 *
 * Steps: per-frame argmax over the vocabulary; collapse consecutive repeats;
 * drop special ids 0-2 (blank/special); map surviving ids through the
 * vocabulary, routing structured "<|...|>" tokens into language/emotion/event
 * slots; join the rest, turning U+2581 word-boundary markers into spaces.
 *
 * @param logits    Flat [seqLen * vocabSize] scores (row-major by frame)
 * @param seqLen    Number of output frames
 * @param vocabSize Vocabulary size per frame
 * @param tokenMap  Map from token id to token string
 * @returns { text, language, emotion, event } (metadata fields may be undefined)
 */
function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
  // Argmax per frame with on-the-fly collapse of repeats and blank removal.
  const keptIds = [];
  let prevId = -1;
  for (let frame = 0; frame < seqLen; frame++) {
    const base = frame * vocabSize;
    let best = 0;
    let bestScore = logits[base];
    for (let v = 1; v < vocabSize; v++) {
      const score = logits[base + v];
      if (score > bestScore) {
        bestScore = score;
        best = v;
      }
    }
    if (best !== prevId) {
      prevId = best;
      if (best > 2) keptIds.push(best); // ids 0-2 are blank/special: drop
    }
  }
  let language;
  let emotion;
  let event;
  const pieces = [];
  for (const id of keptIds) {
    const tok = tokenMap.get(id);
    if (!tok) continue;
    const meta = parseStructuredToken(tok);
    if (meta === null) {
      pieces.push(tok);
    } else if (meta.type === "language") {
      language = meta.value;
    } else if (meta.type === "emotion") {
      emotion = meta.value;
    } else if (meta.type === "event") {
      event = meta.value;
    }
    // textnorm markers are consumed without affecting the output
  }
  const text = pieces.join("").replace(/\u2581/g, " ").trim();
  return { text, language, emotion, event };
}
|
|
3157
|
+
|
|
3158
|
+
// src/inference/SenseVoiceInference.ts
var logger4 = createLogger("SenseVoice");

/**
 * Browser-side SenseVoice ASR wrapper around an ONNX Runtime Web session.
 *
 * Per transcribe() call: 16 kHz Float32 audio -> 80-bin Kaldi fbank ->
 * LFR stacking (m=7, n=6, 560-dim) -> optional CMVN (stats read from model
 * metadata) -> session.run -> CTC greedy decode.
 *
 * Inference calls are serialized through an internal promise queue so a
 * single ORT session never runs concurrently.
 *
 * Fix vs. previous revision: load() now recovers from a corrupt cache entry
 * (cache.has() true but cache.get() empty) by evicting and refetching,
 * matching the recovery path used by Wav2ArkitCpuInference.
 */
var SenseVoiceInference = class {
  /**
   * @param config.modelUrl  URL of the .onnx model (required)
   * @param config.tokensUrl URL of tokens.txt; defaults to "<model dir>/tokens.txt"
   * @param config.language  "auto" | "zh" | "en" | "yue" | "ja" | "ko" (default "auto")
   * @param config.textNorm  "with_itn" | "without_itn" (default "with_itn")
   * @param config.backend   Backend preference passed to the ORT loader (default "auto")
   */
  constructor(config) {
    this.session = null;
    this.ort = null;
    this._backend = "wasm";
    this.isLoading = false;
    // Serializes inference jobs; see queueInference().
    this.inferenceQueue = Promise.resolve();
    // Preprocessing state (loaded once)
    this.tokenMap = null;
    this.negMean = null;
    this.invStddev = null;
    this.languageId = 0;
    this.textNormId = 14;
    // tokens.txt defaults to living next to the model file.
    const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
    const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
    this.config = {
      modelUrl: config.modelUrl,
      tokensUrl,
      language: config.language ?? "auto",
      textNorm: config.textNorm ?? "with_itn",
      backend: config.backend ?? "auto"
    };
    this.languageId = resolveLanguageId(this.config.language);
    this.textNormId = resolveTextNormId(this.config.textNorm);
  }
  /** Backend actually in use ("wasm"/"webgpu"), or null until load() succeeds. */
  get backend() {
    return this.session ? this._backend : null;
  }
  /** True once load() has completed and the session is live. */
  get isLoaded() {
    return this.session !== null;
  }
  // ─── Load ───────────────────────────────────────────────────────────────
  /**
   * Load the ONNX runtime, tokens vocabulary, and model weights.
   *
   * @param onProgress Optional (loadedBytes, totalBytes) callback for the model download.
   * @returns Summary: backend, load time, session input/output names, vocab size.
   * @throws If already loading, already loaded, or any fetch/session step fails.
   */
  async load(onProgress) {
    if (this.isLoading) {
      throw new Error("Model is already loading");
    }
    if (this.session) {
      throw new Error("Model already loaded. Call dispose() first.");
    }
    this.isLoading = true;
    const startTime = performance.now();
    const telemetry = getTelemetry();
    const span = telemetry?.startSpan("SenseVoice.load", {
      "model.url": this.config.modelUrl,
      "model.backend_requested": this.config.backend
    });
    try {
      logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
      const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
      this.ort = ort;
      this._backend = backend;
      logger4.info("ONNX Runtime loaded", { backend: this._backend });
      logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
      const tokensResponse = await fetch(this.config.tokensUrl);
      if (!tokensResponse.ok) {
        throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
      }
      const tokensText = await tokensResponse.text();
      this.tokenMap = parseTokensFile(tokensText);
      logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
      const sessionOptions = getSessionOptions(this._backend);
      if (this._backend === "webgpu") {
        // NOTE(review): "basic" presumably sidesteps a WebGPU issue with full
        // graph optimization — confirm before changing.
        sessionOptions.graphOptimizationLevel = "basic";
      }
      let isCached = false;
      if (isIOS()) {
        // iOS memory limits: let ORT stream the model from the URL instead of
        // buffering the whole file in JS first.
        logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
          modelUrl: this.config.modelUrl
        });
        this.session = await this.ort.InferenceSession.create(
          this.config.modelUrl,
          sessionOptions
        );
      } else {
        const cache = getModelCache();
        isCached = await cache.has(this.config.modelUrl);
        let modelBuffer;
        if (isCached) {
          logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
          modelBuffer = await cache.get(this.config.modelUrl);
          if (!modelBuffer) {
            // Cache reported a hit but returned nothing (corrupt entry):
            // evict and refetch, mirroring Wav2ArkitCpuInference's recovery.
            logger4.warn("Cache corruption for model, refetching", { modelUrl: this.config.modelUrl });
            await cache.delete(this.config.modelUrl);
            modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
          } else {
            onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
          }
        } else {
          logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
          modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
        }
        logger4.debug("Creating ONNX session", {
          size: formatBytes(modelBuffer.byteLength),
          backend: this._backend
        });
        const modelData = new Uint8Array(modelBuffer);
        this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
      }
      // CMVN stats may be embedded in model metadata. handler.metadata is not
      // a public ORT API, so any failure here is non-fatal (features then run
      // unnormalized, with a warning).
      try {
        const metadata = this.session.handler?.metadata;
        if (metadata?.neg_mean && metadata?.inv_stddev) {
          const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
          this.negMean = cmvn.negMean;
          this.invStddev = cmvn.invStddev;
          logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
        } else {
          logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
        }
      } catch (cmvnErr) {
        logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
      }
      const loadTimeMs = performance.now() - startTime;
      logger4.info("SenseVoice model loaded", {
        backend: this._backend,
        loadTimeMs: Math.round(loadTimeMs),
        vocabSize: this.tokenMap.size,
        inputs: this.session.inputNames,
        outputs: this.session.outputNames,
        hasCMVN: this.negMean !== null
      });
      span?.setAttributes({
        "model.backend": this._backend,
        "model.load_time_ms": loadTimeMs,
        "model.cached": !isIOS() && isCached,
        "model.vocab_size": this.tokenMap.size
      });
      span?.end();
      telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
        model: "sensevoice",
        backend: this._backend
      });
      return {
        backend: this._backend,
        loadTimeMs,
        inputNames: [...this.session.inputNames],
        outputNames: [...this.session.outputNames],
        vocabSize: this.tokenMap.size
      };
    } catch (error) {
      span?.endWithError(error instanceof Error ? error : new Error(String(error)));
      telemetry?.incrementCounter("omote.errors.total", 1, {
        model: "sensevoice",
        error_type: "load_failed"
      });
      throw error;
    } finally {
      this.isLoading = false;
    }
  }
  // ─── Transcribe ─────────────────────────────────────────────────────────
  /**
   * Transcribe audio samples to text
   *
   * @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
   * @returns Transcription result with text, emotion, language, and event
   */
  async transcribe(audioSamples) {
    if (!this.session || !this.ort || !this.tokenMap) {
      throw new Error("Model not loaded. Call load() first.");
    }
    // Copy so the caller may reuse/mutate their buffer while we are queued.
    const audio = new Float32Array(audioSamples);
    return this.queueInference(audio);
  }
  /**
   * Append an inference job to the serial queue and return its result.
   * Serialization prevents concurrent session.run() calls on one session;
   * a failed job rejects its own promise but leaves the queue usable.
   */
  queueInference(audio) {
    return new Promise((resolve, reject) => {
      this.inferenceQueue = this.inferenceQueue.then(async () => {
        const telemetry = getTelemetry();
        const span = telemetry?.startSpan("SenseVoice.transcribe", {
          "inference.backend": this._backend,
          "inference.input_samples": audio.length
        });
        try {
          const startTime = performance.now();
          const preprocessStart = performance.now();
          // 80-dim Kaldi-style log-mel filterbank features at 16 kHz.
          const fbank = computeKaldiFbank(audio, 16e3, 80);
          const numFrames = fbank.length / 80;
          if (numFrames === 0) {
            // Audio too short to produce a single frame: empty transcription.
            resolve({
              text: "",
              inferenceTimeMs: performance.now() - startTime,
              preprocessTimeMs: performance.now() - preprocessStart
            });
            return;
          }
          // LFR stacks 7 frames subsampled by 6 -> 560-dim model inputs.
          const lfrFeatures = applyLFR(fbank, 80, 7, 6);
          const numLfrFrames = lfrFeatures.length / 560;
          if (this.negMean && this.invStddev) {
            applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
          }
          const preprocessTimeMs = performance.now() - preprocessStart;
          const ort = this.ort;
          const feeds = {
            x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
            x_length: new ort.Tensor("int32", new Int32Array([numLfrFrames]), [1]),
            language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
            text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
          };
          const results = await this.session.run(feeds);
          const logitsOutput = results["logits"];
          if (!logitsOutput) {
            throw new Error('Model output missing "logits" tensor');
          }
          const logitsData = logitsOutput.data;
          const logitsDims = logitsOutput.dims;
          const seqLen = logitsDims[1];
          const vocabSize = logitsDims[2];
          const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
          const inferenceTimeMs = performance.now() - startTime;
          logger4.trace("Transcription complete", {
            text: decoded.text.substring(0, 50),
            language: decoded.language,
            emotion: decoded.emotion,
            event: decoded.event,
            preprocessTimeMs: Math.round(preprocessTimeMs * 100) / 100,
            inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
            numFrames,
            numLfrFrames
          });
          span?.setAttributes({
            "inference.duration_ms": inferenceTimeMs,
            "inference.preprocess_ms": preprocessTimeMs,
            "inference.num_frames": numFrames,
            "inference.text_length": decoded.text.length
          });
          span?.end();
          telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
            model: "sensevoice",
            backend: this._backend
          });
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice",
            backend: this._backend,
            status: "success"
          });
          resolve({
            text: decoded.text,
            language: decoded.language,
            emotion: decoded.emotion,
            event: decoded.event,
            inferenceTimeMs,
            preprocessTimeMs
          });
        } catch (err) {
          span?.endWithError(err instanceof Error ? err : new Error(String(err)));
          telemetry?.incrementCounter("omote.inference.total", 1, {
            model: "sensevoice",
            backend: this._backend,
            status: "error"
          });
          reject(err);
        }
      });
    });
  }
  // ─── Dispose ──────────────────────────────────────────────────────────
  /** Release the ORT session and drop preprocessing state. Safe to call twice. */
  async dispose() {
    if (this.session) {
      await this.session.release();
      this.session = null;
    }
    this.ort = null;
    this.tokenMap = null;
    this.negMean = null;
    this.invStddev = null;
  }
};
|
|
3049
3420
|
|
|
@@ -3051,18 +3422,13 @@ var WhisperInference = class _WhisperInference {
|
|
|
3051
3422
|
var logger5 = createLogger("Wav2ArkitCpu");
|
|
3052
3423
|
var Wav2ArkitCpuInference = class {
|
|
3053
3424
|
constructor(config) {
|
|
3425
|
+
this.modelId = "wav2arkit_cpu";
|
|
3054
3426
|
this.session = null;
|
|
3055
3427
|
this.ort = null;
|
|
3056
3428
|
this._backend = "wasm";
|
|
3057
3429
|
this.isLoading = false;
|
|
3058
3430
|
// Inference queue for handling concurrent calls
|
|
3059
3431
|
this.inferenceQueue = Promise.resolve();
|
|
3060
|
-
/**
|
|
3061
|
-
* Preferred chunk size: 4000 samples (250ms at 16kHz).
|
|
3062
|
-
* wav2arkit_cpu accepts variable-length input, so we use smaller chunks
|
|
3063
|
-
* for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
|
|
3064
|
-
*/
|
|
3065
|
-
this.chunkSamples = 4e3;
|
|
3066
3432
|
this.config = config;
|
|
3067
3433
|
}
|
|
3068
3434
|
get backend() {
|
|
@@ -3096,23 +3462,25 @@ var Wav2ArkitCpuInference = class {
|
|
|
3096
3462
|
this._backend = backend;
|
|
3097
3463
|
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3098
3464
|
const modelUrl = this.config.modelUrl;
|
|
3099
|
-
const
|
|
3100
|
-
|
|
3101
|
-
if (isIOS()
|
|
3102
|
-
|
|
3103
|
-
sessionOptions.externalData = [{
|
|
3104
|
-
path: dataFilename,
|
|
3105
|
-
data: this.config.modelDataUrl
|
|
3106
|
-
}];
|
|
3107
|
-
logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
|
|
3465
|
+
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
3466
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
3467
|
+
if (isIOS()) {
|
|
3468
|
+
logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
3108
3469
|
modelUrl,
|
|
3109
|
-
|
|
3110
|
-
dataUrl: this.config.modelDataUrl
|
|
3470
|
+
dataUrl
|
|
3111
3471
|
});
|
|
3472
|
+
if (dataUrl) {
|
|
3473
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
3474
|
+
sessionOptions.externalData = [{
|
|
3475
|
+
path: dataFilename,
|
|
3476
|
+
data: dataUrl
|
|
3477
|
+
// URL string — ORT fetches directly into WASM
|
|
3478
|
+
}];
|
|
3479
|
+
}
|
|
3112
3480
|
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
3113
3481
|
} else {
|
|
3114
3482
|
const cache = getModelCache();
|
|
3115
|
-
isCached = await cache.has(modelUrl);
|
|
3483
|
+
const isCached = await cache.has(modelUrl);
|
|
3116
3484
|
let modelBuffer;
|
|
3117
3485
|
if (isCached) {
|
|
3118
3486
|
logger5.debug("Loading model from cache", { modelUrl });
|
|
@@ -3123,42 +3491,48 @@ var Wav2ArkitCpuInference = class {
|
|
|
3123
3491
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3124
3492
|
}
|
|
3125
3493
|
} else {
|
|
3126
|
-
logger5.debug("Fetching and caching model", { modelUrl });
|
|
3494
|
+
logger5.debug("Fetching and caching model graph", { modelUrl });
|
|
3127
3495
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3128
3496
|
}
|
|
3129
3497
|
if (!modelBuffer) {
|
|
3130
3498
|
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
3131
3499
|
}
|
|
3132
|
-
let externalDataBuffer;
|
|
3133
|
-
if (
|
|
3134
|
-
|
|
3135
|
-
|
|
3136
|
-
|
|
3137
|
-
|
|
3138
|
-
|
|
3139
|
-
|
|
3140
|
-
|
|
3141
|
-
|
|
3500
|
+
let externalDataBuffer = null;
|
|
3501
|
+
if (dataUrl) {
|
|
3502
|
+
try {
|
|
3503
|
+
const isDataCached = await cache.has(dataUrl);
|
|
3504
|
+
if (isDataCached) {
|
|
3505
|
+
logger5.debug("Loading external data from cache", { dataUrl });
|
|
3506
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
3507
|
+
if (!externalDataBuffer) {
|
|
3508
|
+
logger5.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
3509
|
+
await cache.delete(dataUrl);
|
|
3510
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3511
|
+
}
|
|
3512
|
+
} else {
|
|
3513
|
+
logger5.info("Fetching external model data", {
|
|
3514
|
+
dataUrl,
|
|
3515
|
+
note: "This may be a large download (400MB+)"
|
|
3516
|
+
});
|
|
3142
3517
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3143
3518
|
}
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
|
|
3519
|
+
logger5.info("External data loaded", {
|
|
3520
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
3521
|
+
});
|
|
3522
|
+
} catch (err) {
|
|
3523
|
+
logger5.debug("No external data file found (single-file model)", {
|
|
3524
|
+
dataUrl,
|
|
3525
|
+
error: err.message
|
|
3147
3526
|
});
|
|
3148
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3149
3527
|
}
|
|
3150
|
-
logger5.debug("External data loaded", {
|
|
3151
|
-
size: formatBytes(externalDataBuffer.byteLength)
|
|
3152
|
-
});
|
|
3153
3528
|
}
|
|
3154
3529
|
logger5.debug("Creating ONNX session", {
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
|
|
3530
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
3531
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
3158
3532
|
backend: this._backend
|
|
3159
3533
|
});
|
|
3160
3534
|
if (externalDataBuffer) {
|
|
3161
|
-
const dataFilename =
|
|
3535
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
3162
3536
|
sessionOptions.externalData = [{
|
|
3163
3537
|
path: dataFilename,
|
|
3164
3538
|
data: new Uint8Array(externalDataBuffer)
|
|
@@ -3177,7 +3551,7 @@ var Wav2ArkitCpuInference = class {
|
|
|
3177
3551
|
span?.setAttributes({
|
|
3178
3552
|
"model.backend": this._backend,
|
|
3179
3553
|
"model.load_time_ms": loadTimeMs,
|
|
3180
|
-
"model.cached":
|
|
3554
|
+
"model.cached": !isIOS()
|
|
3181
3555
|
});
|
|
3182
3556
|
span?.end();
|
|
3183
3557
|
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
@@ -3258,11 +3632,11 @@ var Wav2ArkitCpuInference = class {
|
|
|
3258
3632
|
const blendshapes = [];
|
|
3259
3633
|
for (let f = 0; f < numFrames; f++) {
|
|
3260
3634
|
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
3261
|
-
const
|
|
3262
|
-
blendshapes.push(
|
|
3635
|
+
const symmetrized = symmetrizeBlendshapes(rawFrame);
|
|
3636
|
+
blendshapes.push(symmetrized);
|
|
3263
3637
|
}
|
|
3264
3638
|
logger5.trace("Inference completed", {
|
|
3265
|
-
inferenceTimeMs: Math.round(inferenceTimeMs),
|
|
3639
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3266
3640
|
numFrames,
|
|
3267
3641
|
inputSamples
|
|
3268
3642
|
});
|
|
@@ -3328,14 +3702,14 @@ function createLipSync(config) {
|
|
|
3328
3702
|
});
|
|
3329
3703
|
}
|
|
3330
3704
|
if (useCpu) {
|
|
3331
|
-
logger6.info("Creating Wav2ArkitCpuInference (WASM)");
|
|
3705
|
+
logger6.info("Creating Wav2ArkitCpuInference (404MB, WASM)");
|
|
3332
3706
|
return new Wav2ArkitCpuInference({
|
|
3333
|
-
modelUrl: config.cpuModelUrl
|
|
3334
|
-
modelDataUrl: config.cpuModelDataUrl
|
|
3707
|
+
modelUrl: config.cpuModelUrl
|
|
3335
3708
|
});
|
|
3336
3709
|
}
|
|
3337
3710
|
const gpuInstance = new Wav2Vec2Inference({
|
|
3338
3711
|
modelUrl: config.gpuModelUrl,
|
|
3712
|
+
externalDataUrl: config.gpuExternalDataUrl,
|
|
3339
3713
|
backend: config.gpuBackend ?? "auto",
|
|
3340
3714
|
numIdentityClasses: config.numIdentityClasses
|
|
3341
3715
|
});
|
|
@@ -3352,15 +3726,15 @@ var LipSyncWithFallback = class {
|
|
|
3352
3726
|
this.implementation = gpuInstance;
|
|
3353
3727
|
this.config = config;
|
|
3354
3728
|
}
|
|
3729
|
+
get modelId() {
|
|
3730
|
+
return this.implementation.modelId;
|
|
3731
|
+
}
|
|
3355
3732
|
get backend() {
|
|
3356
3733
|
return this.implementation.backend;
|
|
3357
3734
|
}
|
|
3358
3735
|
get isLoaded() {
|
|
3359
3736
|
return this.implementation.isLoaded;
|
|
3360
3737
|
}
|
|
3361
|
-
get chunkSamples() {
|
|
3362
|
-
return this.implementation.chunkSamples;
|
|
3363
|
-
}
|
|
3364
3738
|
async load() {
|
|
3365
3739
|
try {
|
|
3366
3740
|
return await this.implementation.load();
|
|
@@ -3373,8 +3747,7 @@ var LipSyncWithFallback = class {
|
|
|
3373
3747
|
} catch {
|
|
3374
3748
|
}
|
|
3375
3749
|
this.implementation = new Wav2ArkitCpuInference({
|
|
3376
|
-
modelUrl: this.config.cpuModelUrl
|
|
3377
|
-
modelDataUrl: this.config.cpuModelDataUrl
|
|
3750
|
+
modelUrl: this.config.cpuModelUrl
|
|
3378
3751
|
});
|
|
3379
3752
|
this.hasFallenBack = true;
|
|
3380
3753
|
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
@@ -3404,6 +3777,8 @@ var SileroVADInference = class {
|
|
|
3404
3777
|
// Pre-speech buffer for capturing beginning of speech
|
|
3405
3778
|
this.preSpeechBuffer = [];
|
|
3406
3779
|
this.wasSpeaking = false;
|
|
3780
|
+
// Cached sample rate tensor (int64 scalar, never changes per instance)
|
|
3781
|
+
this.srTensor = null;
|
|
3407
3782
|
const sampleRate = config.sampleRate ?? 16e3;
|
|
3408
3783
|
if (sampleRate !== 8e3 && sampleRate !== 16e3) {
|
|
3409
3784
|
throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
|
|
@@ -3534,6 +3909,24 @@ var SileroVADInference = class {
|
|
|
3534
3909
|
this.context = new Float32Array(this.contextSize);
|
|
3535
3910
|
this.preSpeechBuffer = [];
|
|
3536
3911
|
this.wasSpeaking = false;
|
|
3912
|
+
if (!this.srTensor) {
|
|
3913
|
+
try {
|
|
3914
|
+
this.srTensor = new this.ort.Tensor(
|
|
3915
|
+
"int64",
|
|
3916
|
+
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
3917
|
+
[]
|
|
3918
|
+
);
|
|
3919
|
+
} catch (e) {
|
|
3920
|
+
logger7.warn("BigInt64Array not available, using bigint array fallback", {
|
|
3921
|
+
error: e instanceof Error ? e.message : String(e)
|
|
3922
|
+
});
|
|
3923
|
+
this.srTensor = new this.ort.Tensor(
|
|
3924
|
+
"int64",
|
|
3925
|
+
[BigInt(this.config.sampleRate)],
|
|
3926
|
+
[]
|
|
3927
|
+
);
|
|
3928
|
+
}
|
|
3929
|
+
}
|
|
3537
3930
|
}
|
|
3538
3931
|
/**
|
|
3539
3932
|
* Process a single audio chunk
|
|
@@ -3665,20 +4058,7 @@ var SileroVADInference = class {
|
|
|
3665
4058
|
inputBuffer.set(audioChunkCopy, this.contextSize);
|
|
3666
4059
|
const inputBufferCopy = new Float32Array(inputBuffer);
|
|
3667
4060
|
const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
|
|
3668
|
-
|
|
3669
|
-
try {
|
|
3670
|
-
srTensor = new this.ort.Tensor(
|
|
3671
|
-
"int64",
|
|
3672
|
-
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
3673
|
-
[]
|
|
3674
|
-
);
|
|
3675
|
-
} catch {
|
|
3676
|
-
srTensor = new this.ort.Tensor(
|
|
3677
|
-
"int64",
|
|
3678
|
-
[BigInt(this.config.sampleRate)],
|
|
3679
|
-
[]
|
|
3680
|
-
);
|
|
3681
|
-
}
|
|
4061
|
+
const srTensor = this.srTensor;
|
|
3682
4062
|
const stateCopy = new Float32Array(this.state.data);
|
|
3683
4063
|
const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
|
|
3684
4064
|
const feeds = {
|
|
@@ -3767,6 +4147,7 @@ var SileroVADInference = class {
|
|
|
3767
4147
|
this.session = null;
|
|
3768
4148
|
}
|
|
3769
4149
|
this.state = null;
|
|
4150
|
+
this.srTensor = null;
|
|
3770
4151
|
}
|
|
3771
4152
|
};
|
|
3772
4153
|
/**
|
|
@@ -4429,268 +4810,8 @@ var VADWorkerWithFallback = class {
|
|
|
4429
4810
|
}
|
|
4430
4811
|
};
|
|
4431
4812
|
|
|
4432
|
-
// src/inference/Emotion2VecInference.ts
|
|
4433
|
-
var logger10 = createLogger("Emotion2Vec");
|
|
4434
|
-
var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
|
|
4435
|
-
var Emotion2VecInference = class {
|
|
4436
|
-
constructor(config) {
|
|
4437
|
-
this.session = null;
|
|
4438
|
-
this.ort = null;
|
|
4439
|
-
this._backend = "wasm";
|
|
4440
|
-
this.isLoading = false;
|
|
4441
|
-
this.inferenceQueue = Promise.resolve();
|
|
4442
|
-
this.config = {
|
|
4443
|
-
modelUrl: config.modelUrl,
|
|
4444
|
-
backend: config.backend ?? "auto",
|
|
4445
|
-
sampleRate: config.sampleRate ?? 16e3
|
|
4446
|
-
};
|
|
4447
|
-
}
|
|
4448
|
-
get backend() {
|
|
4449
|
-
return this.session ? this._backend : null;
|
|
4450
|
-
}
|
|
4451
|
-
get isLoaded() {
|
|
4452
|
-
return this.session !== null;
|
|
4453
|
-
}
|
|
4454
|
-
get sampleRate() {
|
|
4455
|
-
return this.config.sampleRate;
|
|
4456
|
-
}
|
|
4457
|
-
/**
|
|
4458
|
-
* Load the ONNX model
|
|
4459
|
-
*/
|
|
4460
|
-
async load() {
|
|
4461
|
-
if (this.isLoading) {
|
|
4462
|
-
throw new Error("Model is already loading");
|
|
4463
|
-
}
|
|
4464
|
-
if (this.session) {
|
|
4465
|
-
throw new Error("Model already loaded. Call dispose() first.");
|
|
4466
|
-
}
|
|
4467
|
-
this.isLoading = true;
|
|
4468
|
-
const startTime = performance.now();
|
|
4469
|
-
const telemetry = getTelemetry();
|
|
4470
|
-
const span = telemetry?.startSpan("Emotion2Vec.load", {
|
|
4471
|
-
"model.url": this.config.modelUrl,
|
|
4472
|
-
"model.backend_requested": this.config.backend
|
|
4473
|
-
});
|
|
4474
|
-
try {
|
|
4475
|
-
logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
4476
|
-
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
4477
|
-
this.ort = ort;
|
|
4478
|
-
this._backend = backend;
|
|
4479
|
-
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4480
|
-
logger10.info("Checking model cache...");
|
|
4481
|
-
const cache = getModelCache();
|
|
4482
|
-
const modelUrl = this.config.modelUrl;
|
|
4483
|
-
const isCached = await cache.has(modelUrl);
|
|
4484
|
-
logger10.info("Cache check complete", { modelUrl, isCached });
|
|
4485
|
-
let modelBuffer;
|
|
4486
|
-
if (isCached) {
|
|
4487
|
-
logger10.info("Loading model from cache...", { modelUrl });
|
|
4488
|
-
modelBuffer = await cache.get(modelUrl);
|
|
4489
|
-
logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
|
|
4490
|
-
} else {
|
|
4491
|
-
logger10.info("Fetching model (not cached)...", { modelUrl });
|
|
4492
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4493
|
-
logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
|
|
4494
|
-
}
|
|
4495
|
-
logger10.info("Creating ONNX session (this may take a while for large models)...");
|
|
4496
|
-
logger10.debug("Creating ONNX session", {
|
|
4497
|
-
size: formatBytes(modelBuffer.byteLength),
|
|
4498
|
-
backend: this._backend
|
|
4499
|
-
});
|
|
4500
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
4501
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
4502
|
-
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
4503
|
-
const loadTimeMs = performance.now() - startTime;
|
|
4504
|
-
logger10.info("Model loaded successfully", {
|
|
4505
|
-
backend: this._backend,
|
|
4506
|
-
loadTimeMs: Math.round(loadTimeMs),
|
|
4507
|
-
sampleRate: this.config.sampleRate,
|
|
4508
|
-
inputNames: [...this.session.inputNames],
|
|
4509
|
-
outputNames: [...this.session.outputNames]
|
|
4510
|
-
});
|
|
4511
|
-
span?.setAttributes({
|
|
4512
|
-
"model.backend": this._backend,
|
|
4513
|
-
"model.load_time_ms": loadTimeMs,
|
|
4514
|
-
"model.cached": isCached
|
|
4515
|
-
});
|
|
4516
|
-
span?.end();
|
|
4517
|
-
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
4518
|
-
model: "emotion2vec",
|
|
4519
|
-
backend: this._backend
|
|
4520
|
-
});
|
|
4521
|
-
return {
|
|
4522
|
-
backend: this._backend,
|
|
4523
|
-
loadTimeMs,
|
|
4524
|
-
inputNames: [...this.session.inputNames],
|
|
4525
|
-
outputNames: [...this.session.outputNames],
|
|
4526
|
-
sampleRate: this.config.sampleRate
|
|
4527
|
-
};
|
|
4528
|
-
} catch (error) {
|
|
4529
|
-
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
4530
|
-
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
4531
|
-
model: "emotion2vec",
|
|
4532
|
-
error_type: "load_failed"
|
|
4533
|
-
});
|
|
4534
|
-
throw error;
|
|
4535
|
-
} finally {
|
|
4536
|
-
this.isLoading = false;
|
|
4537
|
-
}
|
|
4538
|
-
}
|
|
4539
|
-
/**
|
|
4540
|
-
* Run emotion inference on audio samples
|
|
4541
|
-
*
|
|
4542
|
-
* @param audio - Float32Array of 16kHz audio samples
|
|
4543
|
-
* @returns Frame-level emotion results at 50Hz
|
|
4544
|
-
*/
|
|
4545
|
-
async infer(audio) {
|
|
4546
|
-
if (!this.session) {
|
|
4547
|
-
throw new Error("Model not loaded. Call load() first.");
|
|
4548
|
-
}
|
|
4549
|
-
return this.queueInference(audio);
|
|
4550
|
-
}
|
|
4551
|
-
queueInference(audio) {
|
|
4552
|
-
const audioCopy = new Float32Array(audio);
|
|
4553
|
-
return new Promise((resolve, reject) => {
|
|
4554
|
-
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4555
|
-
const telemetry = getTelemetry();
|
|
4556
|
-
const span = telemetry?.startSpan("Emotion2Vec.infer", {
|
|
4557
|
-
"inference.backend": this._backend,
|
|
4558
|
-
"inference.audio_samples": audioCopy.length
|
|
4559
|
-
});
|
|
4560
|
-
try {
|
|
4561
|
-
const startTime = performance.now();
|
|
4562
|
-
const inputTensor = new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length]);
|
|
4563
|
-
const results = await this.session.run({ audio: inputTensor });
|
|
4564
|
-
const logitsTensor = results["logits"];
|
|
4565
|
-
const embeddingsTensor = results["layer_norm_25"];
|
|
4566
|
-
if (!logitsTensor) {
|
|
4567
|
-
throw new Error(
|
|
4568
|
-
`Missing logits tensor from SUPERB model. Got outputs: ${Object.keys(results).join(", ")}`
|
|
4569
|
-
);
|
|
4570
|
-
}
|
|
4571
|
-
const logitsData = logitsTensor.data;
|
|
4572
|
-
const logits = new Float32Array(logitsData);
|
|
4573
|
-
const probs = this.softmax(logits);
|
|
4574
|
-
const probabilities = {
|
|
4575
|
-
neutral: probs[0],
|
|
4576
|
-
happy: probs[1],
|
|
4577
|
-
angry: probs[2],
|
|
4578
|
-
sad: probs[3]
|
|
4579
|
-
};
|
|
4580
|
-
let maxIdx = 0;
|
|
4581
|
-
let maxProb = probs[0];
|
|
4582
|
-
for (let i = 1; i < probs.length; i++) {
|
|
4583
|
-
if (probs[i] > maxProb) {
|
|
4584
|
-
maxProb = probs[i];
|
|
4585
|
-
maxIdx = i;
|
|
4586
|
-
}
|
|
4587
|
-
}
|
|
4588
|
-
const dominant = {
|
|
4589
|
-
emotion: EMOTION2VEC_LABELS[maxIdx],
|
|
4590
|
-
confidence: maxProb,
|
|
4591
|
-
probabilities
|
|
4592
|
-
};
|
|
4593
|
-
let embeddings = [];
|
|
4594
|
-
let numFrames = 1;
|
|
4595
|
-
if (embeddingsTensor) {
|
|
4596
|
-
const embeddingData = embeddingsTensor.data;
|
|
4597
|
-
const dims = embeddingsTensor.dims;
|
|
4598
|
-
if (dims.length === 3) {
|
|
4599
|
-
numFrames = dims[1];
|
|
4600
|
-
const embeddingDim = dims[2];
|
|
4601
|
-
for (let i = 0; i < numFrames; i++) {
|
|
4602
|
-
const start = i * embeddingDim;
|
|
4603
|
-
embeddings.push(new Float32Array(embeddingData.slice(start, start + embeddingDim)));
|
|
4604
|
-
}
|
|
4605
|
-
}
|
|
4606
|
-
}
|
|
4607
|
-
const frames = [];
|
|
4608
|
-
for (let i = 0; i < numFrames; i++) {
|
|
4609
|
-
frames.push({
|
|
4610
|
-
emotion: dominant.emotion,
|
|
4611
|
-
confidence: dominant.confidence,
|
|
4612
|
-
probabilities: { ...probabilities }
|
|
4613
|
-
});
|
|
4614
|
-
}
|
|
4615
|
-
const inferenceTimeMs = performance.now() - startTime;
|
|
4616
|
-
logger10.debug("Emotion inference completed", {
|
|
4617
|
-
numFrames,
|
|
4618
|
-
dominant: dominant.emotion,
|
|
4619
|
-
confidence: Math.round(dominant.confidence * 100),
|
|
4620
|
-
inferenceTimeMs: Math.round(inferenceTimeMs)
|
|
4621
|
-
});
|
|
4622
|
-
span?.setAttributes({
|
|
4623
|
-
"inference.duration_ms": inferenceTimeMs,
|
|
4624
|
-
"inference.num_frames": numFrames,
|
|
4625
|
-
"inference.dominant_emotion": dominant.emotion
|
|
4626
|
-
});
|
|
4627
|
-
span?.end();
|
|
4628
|
-
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
4629
|
-
model: "emotion2vec",
|
|
4630
|
-
backend: this._backend
|
|
4631
|
-
});
|
|
4632
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4633
|
-
model: "emotion2vec",
|
|
4634
|
-
backend: this._backend,
|
|
4635
|
-
status: "success"
|
|
4636
|
-
});
|
|
4637
|
-
resolve({
|
|
4638
|
-
frames,
|
|
4639
|
-
dominant,
|
|
4640
|
-
embeddings,
|
|
4641
|
-
logits,
|
|
4642
|
-
inferenceTimeMs
|
|
4643
|
-
});
|
|
4644
|
-
} catch (err) {
|
|
4645
|
-
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4646
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4647
|
-
model: "emotion2vec",
|
|
4648
|
-
backend: this._backend,
|
|
4649
|
-
status: "error"
|
|
4650
|
-
});
|
|
4651
|
-
reject(err);
|
|
4652
|
-
}
|
|
4653
|
-
});
|
|
4654
|
-
});
|
|
4655
|
-
}
|
|
4656
|
-
/**
|
|
4657
|
-
* Apply softmax to convert logits to probabilities
|
|
4658
|
-
*/
|
|
4659
|
-
softmax(logits) {
|
|
4660
|
-
let max = logits[0];
|
|
4661
|
-
for (let i = 1; i < logits.length; i++) {
|
|
4662
|
-
if (logits[i] > max) max = logits[i];
|
|
4663
|
-
}
|
|
4664
|
-
const exp = new Float32Array(logits.length);
|
|
4665
|
-
let sum = 0;
|
|
4666
|
-
for (let i = 0; i < logits.length; i++) {
|
|
4667
|
-
exp[i] = Math.exp(logits[i] - max);
|
|
4668
|
-
sum += exp[i];
|
|
4669
|
-
}
|
|
4670
|
-
const probs = new Float32Array(logits.length);
|
|
4671
|
-
for (let i = 0; i < logits.length; i++) {
|
|
4672
|
-
probs[i] = exp[i] / sum;
|
|
4673
|
-
}
|
|
4674
|
-
return probs;
|
|
4675
|
-
}
|
|
4676
|
-
/**
|
|
4677
|
-
* Dispose of the model and free resources
|
|
4678
|
-
*/
|
|
4679
|
-
async dispose() {
|
|
4680
|
-
if (this.session) {
|
|
4681
|
-
await this.session.release();
|
|
4682
|
-
this.session = null;
|
|
4683
|
-
}
|
|
4684
|
-
}
|
|
4685
|
-
};
|
|
4686
|
-
/**
|
|
4687
|
-
* Check if WebGPU is available and working
|
|
4688
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
4689
|
-
*/
|
|
4690
|
-
Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
4691
|
-
|
|
4692
4813
|
// src/inference/SafariSpeechRecognition.ts
|
|
4693
|
-
var
|
|
4814
|
+
var logger10 = createLogger("SafariSpeech");
|
|
4694
4815
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
4695
4816
|
constructor(config = {}) {
|
|
4696
4817
|
this.recognition = null;
|
|
@@ -4709,7 +4830,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4709
4830
|
interimResults: config.interimResults ?? true,
|
|
4710
4831
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
4711
4832
|
};
|
|
4712
|
-
|
|
4833
|
+
logger10.debug("SafariSpeechRecognition created", {
|
|
4713
4834
|
language: this.config.language,
|
|
4714
4835
|
continuous: this.config.continuous
|
|
4715
4836
|
});
|
|
@@ -4770,7 +4891,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4770
4891
|
*/
|
|
4771
4892
|
async start() {
|
|
4772
4893
|
if (this.isListening) {
|
|
4773
|
-
|
|
4894
|
+
logger10.warn("Already listening");
|
|
4774
4895
|
return;
|
|
4775
4896
|
}
|
|
4776
4897
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -4800,7 +4921,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4800
4921
|
this.isListening = true;
|
|
4801
4922
|
this.startTime = performance.now();
|
|
4802
4923
|
this.accumulatedText = "";
|
|
4803
|
-
|
|
4924
|
+
logger10.info("Speech recognition started", {
|
|
4804
4925
|
language: this.config.language
|
|
4805
4926
|
});
|
|
4806
4927
|
span?.end();
|
|
@@ -4815,7 +4936,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4815
4936
|
*/
|
|
4816
4937
|
async stop() {
|
|
4817
4938
|
if (!this.isListening || !this.recognition) {
|
|
4818
|
-
|
|
4939
|
+
logger10.warn("Not currently listening");
|
|
4819
4940
|
return {
|
|
4820
4941
|
text: this.accumulatedText,
|
|
4821
4942
|
language: this.config.language,
|
|
@@ -4844,7 +4965,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4844
4965
|
if (this.recognition && this.isListening) {
|
|
4845
4966
|
this.recognition.abort();
|
|
4846
4967
|
this.isListening = false;
|
|
4847
|
-
|
|
4968
|
+
logger10.info("Speech recognition aborted");
|
|
4848
4969
|
}
|
|
4849
4970
|
}
|
|
4850
4971
|
/**
|
|
@@ -4875,7 +4996,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4875
4996
|
this.isListening = false;
|
|
4876
4997
|
this.resultCallbacks = [];
|
|
4877
4998
|
this.errorCallbacks = [];
|
|
4878
|
-
|
|
4999
|
+
logger10.debug("SafariSpeechRecognition disposed");
|
|
4879
5000
|
}
|
|
4880
5001
|
/**
|
|
4881
5002
|
* Set up event handlers for the recognition instance
|
|
@@ -4903,7 +5024,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4903
5024
|
confidence: alternative.confidence
|
|
4904
5025
|
};
|
|
4905
5026
|
this.emitResult(speechResult);
|
|
4906
|
-
|
|
5027
|
+
logger10.trace("Speech result", {
|
|
4907
5028
|
text: text.substring(0, 50),
|
|
4908
5029
|
isFinal,
|
|
4909
5030
|
confidence: alternative.confidence
|
|
@@ -4913,12 +5034,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4913
5034
|
span?.end();
|
|
4914
5035
|
} catch (error) {
|
|
4915
5036
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
4916
|
-
|
|
5037
|
+
logger10.error("Error processing speech result", { error });
|
|
4917
5038
|
}
|
|
4918
5039
|
};
|
|
4919
5040
|
this.recognition.onerror = (event) => {
|
|
4920
5041
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
4921
|
-
|
|
5042
|
+
logger10.error("Speech recognition error", { error: event.error, message: event.message });
|
|
4922
5043
|
this.emitError(error);
|
|
4923
5044
|
if (this.stopRejecter) {
|
|
4924
5045
|
this.stopRejecter(error);
|
|
@@ -4928,7 +5049,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4928
5049
|
};
|
|
4929
5050
|
this.recognition.onend = () => {
|
|
4930
5051
|
this.isListening = false;
|
|
4931
|
-
|
|
5052
|
+
logger10.info("Speech recognition ended", {
|
|
4932
5053
|
totalText: this.accumulatedText.length,
|
|
4933
5054
|
durationMs: performance.now() - this.startTime
|
|
4934
5055
|
});
|
|
@@ -4945,13 +5066,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4945
5066
|
}
|
|
4946
5067
|
};
|
|
4947
5068
|
this.recognition.onstart = () => {
|
|
4948
|
-
|
|
5069
|
+
logger10.debug("Speech recognition started by browser");
|
|
4949
5070
|
};
|
|
4950
5071
|
this.recognition.onspeechstart = () => {
|
|
4951
|
-
|
|
5072
|
+
logger10.debug("Speech detected");
|
|
4952
5073
|
};
|
|
4953
5074
|
this.recognition.onspeechend = () => {
|
|
4954
|
-
|
|
5075
|
+
logger10.debug("Speech ended");
|
|
4955
5076
|
};
|
|
4956
5077
|
}
|
|
4957
5078
|
/**
|
|
@@ -4962,7 +5083,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4962
5083
|
try {
|
|
4963
5084
|
callback(result);
|
|
4964
5085
|
} catch (error) {
|
|
4965
|
-
|
|
5086
|
+
logger10.error("Error in result callback", { error });
|
|
4966
5087
|
}
|
|
4967
5088
|
}
|
|
4968
5089
|
}
|
|
@@ -4974,7 +5095,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4974
5095
|
try {
|
|
4975
5096
|
callback(error);
|
|
4976
5097
|
} catch (callbackError) {
|
|
4977
|
-
|
|
5098
|
+
logger10.error("Error in error callback", { error: callbackError });
|
|
4978
5099
|
}
|
|
4979
5100
|
}
|
|
4980
5101
|
}
|
|
@@ -5148,7 +5269,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5148
5269
|
this._sessionId = null;
|
|
5149
5270
|
this._isConnected = false;
|
|
5150
5271
|
// Sub-components
|
|
5151
|
-
this.
|
|
5272
|
+
this.asr = null;
|
|
5152
5273
|
this.vad = null;
|
|
5153
5274
|
this.lam = null;
|
|
5154
5275
|
this.pipeline = null;
|
|
@@ -5187,7 +5308,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5187
5308
|
try {
|
|
5188
5309
|
const authToken = await this.getAuthToken(config.tenant);
|
|
5189
5310
|
await Promise.all([
|
|
5190
|
-
this.
|
|
5311
|
+
this.initASR(),
|
|
5191
5312
|
this.initLAM()
|
|
5192
5313
|
]);
|
|
5193
5314
|
await this.connectWebSocket(authToken, config);
|
|
@@ -5217,7 +5338,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5217
5338
|
this.ws = null;
|
|
5218
5339
|
}
|
|
5219
5340
|
await Promise.all([
|
|
5220
|
-
this.
|
|
5341
|
+
this.asr?.dispose(),
|
|
5221
5342
|
this.vad?.dispose(),
|
|
5222
5343
|
this.lam?.dispose()
|
|
5223
5344
|
]);
|
|
@@ -5349,16 +5470,15 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5349
5470
|
});
|
|
5350
5471
|
return token;
|
|
5351
5472
|
}
|
|
5352
|
-
async
|
|
5473
|
+
async initASR() {
|
|
5353
5474
|
await Promise.all([
|
|
5354
|
-
//
|
|
5475
|
+
// SenseVoice ASR
|
|
5355
5476
|
(async () => {
|
|
5356
|
-
this.
|
|
5357
|
-
|
|
5358
|
-
|
|
5359
|
-
language: "en"
|
|
5477
|
+
this.asr = new SenseVoiceInference({
|
|
5478
|
+
modelUrl: "/models/sensevoice/model.int8.onnx",
|
|
5479
|
+
language: "auto"
|
|
5360
5480
|
});
|
|
5361
|
-
await this.
|
|
5481
|
+
await this.asr.load();
|
|
5362
5482
|
})(),
|
|
5363
5483
|
// Silero VAD for accurate voice activity detection
|
|
5364
5484
|
(async () => {
|
|
@@ -5544,17 +5664,17 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5544
5664
|
console.debug("[AgentCore] Skipping silent audio", { rms, samples: audio.length });
|
|
5545
5665
|
return;
|
|
5546
5666
|
}
|
|
5547
|
-
if (this.
|
|
5667
|
+
if (this.asr) {
|
|
5548
5668
|
this.setState("listening");
|
|
5549
5669
|
this.emit("user.speech.start", { timestamp: Date.now() });
|
|
5550
|
-
this.
|
|
5670
|
+
this.asr.transcribe(audio).then((result) => {
|
|
5551
5671
|
this.emit("user.transcript.final", {
|
|
5552
5672
|
text: result.text,
|
|
5553
5673
|
confidence: 1
|
|
5554
5674
|
});
|
|
5555
5675
|
this.emit("user.speech.end", { timestamp: Date.now(), durationMs: result.inferenceTimeMs });
|
|
5556
5676
|
const cleanText = result.text.trim();
|
|
5557
|
-
if (cleanText
|
|
5677
|
+
if (cleanText) {
|
|
5558
5678
|
this.sendText(cleanText).catch((error) => {
|
|
5559
5679
|
console.error("[AgentCore] Send text error:", error);
|
|
5560
5680
|
});
|
|
@@ -6368,228 +6488,6 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
6368
6488
|
}
|
|
6369
6489
|
};
|
|
6370
6490
|
|
|
6371
|
-
// src/cache/huggingFaceCDN.ts
|
|
6372
|
-
var HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
|
|
6373
|
-
function parseHuggingFaceUrl(url) {
|
|
6374
|
-
const pattern = /^https:\/\/huggingface\.co\/([^/]+)\/([^/]+)\/resolve\/([^/]+)\/(.+)$/;
|
|
6375
|
-
const match = url.match(pattern);
|
|
6376
|
-
if (!match) {
|
|
6377
|
-
return null;
|
|
6378
|
-
}
|
|
6379
|
-
return {
|
|
6380
|
-
org: match[1],
|
|
6381
|
-
model: match[2],
|
|
6382
|
-
branch: match[3],
|
|
6383
|
-
file: match[4]
|
|
6384
|
-
};
|
|
6385
|
-
}
|
|
6386
|
-
async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
|
|
6387
|
-
try {
|
|
6388
|
-
const response = await fetch(testUrl, {
|
|
6389
|
-
method: "HEAD",
|
|
6390
|
-
cache: "no-store"
|
|
6391
|
-
// Don't use cached response for reachability check
|
|
6392
|
-
});
|
|
6393
|
-
return response.ok;
|
|
6394
|
-
} catch {
|
|
6395
|
-
return false;
|
|
6396
|
-
}
|
|
6397
|
-
}
|
|
6398
|
-
|
|
6399
|
-
// src/utils/transformersCacheClear.ts
|
|
6400
|
-
var logger12 = createLogger("TransformersCache");
|
|
6401
|
-
async function clearTransformersCache(options) {
|
|
6402
|
-
const verbose = options?.verbose ?? true;
|
|
6403
|
-
const additionalPatterns = options?.additionalPatterns ?? [];
|
|
6404
|
-
if (!("caches" in window)) {
|
|
6405
|
-
logger12.warn("Cache API not available in this environment");
|
|
6406
|
-
return [];
|
|
6407
|
-
}
|
|
6408
|
-
try {
|
|
6409
|
-
const cacheNames = await caches.keys();
|
|
6410
|
-
const deletedCaches = [];
|
|
6411
|
-
const patterns = [
|
|
6412
|
-
"transformers",
|
|
6413
|
-
"huggingface",
|
|
6414
|
-
"onnx",
|
|
6415
|
-
...additionalPatterns
|
|
6416
|
-
];
|
|
6417
|
-
for (const cacheName of cacheNames) {
|
|
6418
|
-
const shouldDelete = patterns.some(
|
|
6419
|
-
(pattern) => cacheName.toLowerCase().includes(pattern.toLowerCase())
|
|
6420
|
-
);
|
|
6421
|
-
if (shouldDelete) {
|
|
6422
|
-
if (verbose) {
|
|
6423
|
-
logger12.info("Deleting cache", { cacheName });
|
|
6424
|
-
}
|
|
6425
|
-
const deleted = await caches.delete(cacheName);
|
|
6426
|
-
if (deleted) {
|
|
6427
|
-
deletedCaches.push(cacheName);
|
|
6428
|
-
} else if (verbose) {
|
|
6429
|
-
logger12.warn("Failed to delete cache", { cacheName });
|
|
6430
|
-
}
|
|
6431
|
-
}
|
|
6432
|
-
}
|
|
6433
|
-
if (verbose) {
|
|
6434
|
-
logger12.info("Cache clearing complete", {
|
|
6435
|
-
totalCaches: cacheNames.length,
|
|
6436
|
-
deletedCount: deletedCaches.length,
|
|
6437
|
-
deletedCaches
|
|
6438
|
-
});
|
|
6439
|
-
}
|
|
6440
|
-
return deletedCaches;
|
|
6441
|
-
} catch (error) {
|
|
6442
|
-
logger12.error("Error clearing caches", { error });
|
|
6443
|
-
throw error;
|
|
6444
|
-
}
|
|
6445
|
-
}
|
|
6446
|
-
async function clearSpecificCache(cacheName) {
|
|
6447
|
-
if (!("caches" in window)) {
|
|
6448
|
-
logger12.warn("Cache API not available in this environment");
|
|
6449
|
-
return false;
|
|
6450
|
-
}
|
|
6451
|
-
try {
|
|
6452
|
-
const deleted = await caches.delete(cacheName);
|
|
6453
|
-
logger12.info("Cache deletion attempt", { cacheName, deleted });
|
|
6454
|
-
return deleted;
|
|
6455
|
-
} catch (error) {
|
|
6456
|
-
logger12.error("Error deleting cache", { cacheName, error });
|
|
6457
|
-
return false;
|
|
6458
|
-
}
|
|
6459
|
-
}
|
|
6460
|
-
async function listCaches() {
|
|
6461
|
-
if (!("caches" in window)) {
|
|
6462
|
-
logger12.warn("Cache API not available in this environment");
|
|
6463
|
-
return [];
|
|
6464
|
-
}
|
|
6465
|
-
try {
|
|
6466
|
-
const cacheNames = await caches.keys();
|
|
6467
|
-
logger12.debug("Available caches", { cacheNames });
|
|
6468
|
-
return cacheNames;
|
|
6469
|
-
} catch (error) {
|
|
6470
|
-
logger12.error("Error listing caches", { error });
|
|
6471
|
-
return [];
|
|
6472
|
-
}
|
|
6473
|
-
}
|
|
6474
|
-
async function validateCachedResponse(cacheName, requestUrl) {
|
|
6475
|
-
if (!("caches" in window)) {
|
|
6476
|
-
return {
|
|
6477
|
-
exists: false,
|
|
6478
|
-
valid: false,
|
|
6479
|
-
contentType: null,
|
|
6480
|
-
isHtml: false,
|
|
6481
|
-
reason: "Cache API not available"
|
|
6482
|
-
};
|
|
6483
|
-
}
|
|
6484
|
-
try {
|
|
6485
|
-
const cache = await caches.open(cacheName);
|
|
6486
|
-
const response = await cache.match(requestUrl);
|
|
6487
|
-
if (!response) {
|
|
6488
|
-
return {
|
|
6489
|
-
exists: false,
|
|
6490
|
-
valid: false,
|
|
6491
|
-
contentType: null,
|
|
6492
|
-
isHtml: false,
|
|
6493
|
-
reason: "Not in cache"
|
|
6494
|
-
};
|
|
6495
|
-
}
|
|
6496
|
-
const contentType = response.headers.get("content-type");
|
|
6497
|
-
const isHtml = contentType?.includes("text/html") || contentType?.includes("text/plain");
|
|
6498
|
-
const clonedResponse = response.clone();
|
|
6499
|
-
const text = await clonedResponse.text();
|
|
6500
|
-
const looksLikeHtml = text.trim().startsWith("<") || text.includes("<!DOCTYPE");
|
|
6501
|
-
const valid = Boolean(
|
|
6502
|
-
response.status === 200 && !isHtml && !looksLikeHtml && contentType && (contentType.includes("application/json") || contentType.includes("application/octet-stream") || contentType.includes("binary"))
|
|
6503
|
-
);
|
|
6504
|
-
return {
|
|
6505
|
-
exists: true,
|
|
6506
|
-
valid,
|
|
6507
|
-
contentType,
|
|
6508
|
-
isHtml: isHtml || looksLikeHtml,
|
|
6509
|
-
reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
|
|
6510
|
-
};
|
|
6511
|
-
} catch (error) {
|
|
6512
|
-
logger12.error("Error validating cached response", { cacheName, requestUrl, error });
|
|
6513
|
-
return {
|
|
6514
|
-
exists: false,
|
|
6515
|
-
valid: false,
|
|
6516
|
-
contentType: null,
|
|
6517
|
-
isHtml: false,
|
|
6518
|
-
reason: `Error: ${error}`
|
|
6519
|
-
};
|
|
6520
|
-
}
|
|
6521
|
-
}
|
|
6522
|
-
async function scanForInvalidCaches() {
|
|
6523
|
-
if (!("caches" in window)) {
|
|
6524
|
-
return { totalCaches: 0, scannedEntries: 0, invalidEntries: [] };
|
|
6525
|
-
}
|
|
6526
|
-
const invalidEntries = [];
|
|
6527
|
-
let scannedEntries = 0;
|
|
6528
|
-
try {
|
|
6529
|
-
const cacheNames = await caches.keys();
|
|
6530
|
-
for (const cacheName of cacheNames) {
|
|
6531
|
-
if (!cacheName.toLowerCase().includes("transformers")) {
|
|
6532
|
-
continue;
|
|
6533
|
-
}
|
|
6534
|
-
const cache = await caches.open(cacheName);
|
|
6535
|
-
const requests = await cache.keys();
|
|
6536
|
-
for (const request of requests) {
|
|
6537
|
-
scannedEntries++;
|
|
6538
|
-
const url = request.url;
|
|
6539
|
-
const validation = await validateCachedResponse(cacheName, url);
|
|
6540
|
-
if (validation.exists && !validation.valid) {
|
|
6541
|
-
invalidEntries.push({
|
|
6542
|
-
cacheName,
|
|
6543
|
-
url,
|
|
6544
|
-
reason: validation.reason || "Unknown"
|
|
6545
|
-
});
|
|
6546
|
-
}
|
|
6547
|
-
}
|
|
6548
|
-
}
|
|
6549
|
-
logger12.info("Cache scan complete", {
|
|
6550
|
-
totalCaches: cacheNames.length,
|
|
6551
|
-
scannedEntries,
|
|
6552
|
-
invalidCount: invalidEntries.length
|
|
6553
|
-
});
|
|
6554
|
-
return {
|
|
6555
|
-
totalCaches: cacheNames.length,
|
|
6556
|
-
scannedEntries,
|
|
6557
|
-
invalidEntries
|
|
6558
|
-
};
|
|
6559
|
-
} catch (error) {
|
|
6560
|
-
logger12.error("Error scanning caches", { error });
|
|
6561
|
-
throw error;
|
|
6562
|
-
}
|
|
6563
|
-
}
|
|
6564
|
-
async function nukeBrowserCaches(preventRecreation = false) {
|
|
6565
|
-
if (!("caches" in window)) {
|
|
6566
|
-
logger12.warn("Cache API not available in this environment");
|
|
6567
|
-
return 0;
|
|
6568
|
-
}
|
|
6569
|
-
try {
|
|
6570
|
-
const cacheNames = await caches.keys();
|
|
6571
|
-
let deletedCount = 0;
|
|
6572
|
-
for (const cacheName of cacheNames) {
|
|
6573
|
-
const deleted = await caches.delete(cacheName);
|
|
6574
|
-
if (deleted) {
|
|
6575
|
-
deletedCount++;
|
|
6576
|
-
}
|
|
6577
|
-
}
|
|
6578
|
-
logger12.info("All browser caches cleared", {
|
|
6579
|
-
totalDeleted: deletedCount
|
|
6580
|
-
});
|
|
6581
|
-
if (preventRecreation) {
|
|
6582
|
-
const { env } = await import("./transformers.web-MHLR33H6.mjs");
|
|
6583
|
-
env.useBrowserCache = false;
|
|
6584
|
-
logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
|
|
6585
|
-
}
|
|
6586
|
-
return deletedCount;
|
|
6587
|
-
} catch (error) {
|
|
6588
|
-
logger12.error("Error nuking caches", { error });
|
|
6589
|
-
throw error;
|
|
6590
|
-
}
|
|
6591
|
-
}
|
|
6592
|
-
|
|
6593
6491
|
// src/animation/types.ts
|
|
6594
6492
|
var DEFAULT_ANIMATION_CONFIG = {
|
|
6595
6493
|
initialState: "idle",
|
|
@@ -7129,7 +7027,6 @@ export {
|
|
|
7129
7027
|
EmotionPresets,
|
|
7130
7028
|
EmphasisDetector,
|
|
7131
7029
|
EventEmitter,
|
|
7132
|
-
HF_CDN_TEST_URL,
|
|
7133
7030
|
INFERENCE_LATENCY_BUCKETS,
|
|
7134
7031
|
InterruptionHandler,
|
|
7135
7032
|
LAMPipeline,
|
|
@@ -7143,6 +7040,7 @@ export {
|
|
|
7143
7040
|
OmoteTelemetry,
|
|
7144
7041
|
RingBuffer,
|
|
7145
7042
|
SafariSpeechRecognition,
|
|
7043
|
+
SenseVoiceInference,
|
|
7146
7044
|
SileroVADInference,
|
|
7147
7045
|
SileroVADWorker,
|
|
7148
7046
|
SyncedAudioPipeline,
|
|
@@ -7150,12 +7048,12 @@ export {
|
|
|
7150
7048
|
WAV2ARKIT_BLENDSHAPES,
|
|
7151
7049
|
Wav2ArkitCpuInference,
|
|
7152
7050
|
Wav2Vec2Inference,
|
|
7153
|
-
|
|
7051
|
+
applyCMVN,
|
|
7052
|
+
applyLFR,
|
|
7154
7053
|
blendEmotions,
|
|
7155
7054
|
calculatePeak,
|
|
7156
7055
|
calculateRMS,
|
|
7157
|
-
|
|
7158
|
-
clearTransformersCache,
|
|
7056
|
+
computeKaldiFbank,
|
|
7159
7057
|
configureCacheLimit,
|
|
7160
7058
|
configureLogging,
|
|
7161
7059
|
configureTelemetry,
|
|
@@ -7164,6 +7062,7 @@ export {
|
|
|
7164
7062
|
createLogger,
|
|
7165
7063
|
createSessionWithFallback,
|
|
7166
7064
|
createSileroVAD,
|
|
7065
|
+
ctcGreedyDecode,
|
|
7167
7066
|
fetchWithCache,
|
|
7168
7067
|
formatBytes,
|
|
7169
7068
|
getCacheConfig,
|
|
@@ -7180,7 +7079,6 @@ export {
|
|
|
7180
7079
|
getTelemetry,
|
|
7181
7080
|
hasWebGPUApi,
|
|
7182
7081
|
isAndroid,
|
|
7183
|
-
isHuggingFaceCDNReachable,
|
|
7184
7082
|
isIOS,
|
|
7185
7083
|
isIOSSafari,
|
|
7186
7084
|
isMobile,
|
|
@@ -7189,15 +7087,16 @@ export {
|
|
|
7189
7087
|
isSpeechRecognitionAvailable,
|
|
7190
7088
|
isWebGPUAvailable,
|
|
7191
7089
|
lerpEmotion,
|
|
7192
|
-
listCaches,
|
|
7193
7090
|
noopLogger,
|
|
7194
|
-
|
|
7195
|
-
|
|
7091
|
+
parseCMVNFromMetadata,
|
|
7092
|
+
parseTokensFile,
|
|
7196
7093
|
preloadModels,
|
|
7094
|
+
preloadOnnxRuntime,
|
|
7197
7095
|
remapWav2ArkitToLam,
|
|
7198
7096
|
resetLoggingConfig,
|
|
7199
7097
|
resolveBackend,
|
|
7200
|
-
|
|
7098
|
+
resolveLanguageId,
|
|
7099
|
+
resolveTextNormId,
|
|
7201
7100
|
setLogLevel,
|
|
7202
7101
|
setLoggingEnabled,
|
|
7203
7102
|
shouldEnableWasmProxy,
|
|
@@ -7205,7 +7104,6 @@ export {
|
|
|
7205
7104
|
shouldUseNativeASR,
|
|
7206
7105
|
shouldUseServerLipSync,
|
|
7207
7106
|
supportsVADWorker,
|
|
7208
|
-
symmetrizeBlendshapes
|
|
7209
|
-
validateCachedResponse
|
|
7107
|
+
symmetrizeBlendshapes
|
|
7210
7108
|
};
|
|
7211
7109
|
//# sourceMappingURL=index.mjs.map
|