@omote/core 0.3.25 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/events/index.mjs +0 -1
- package/dist/index.d.mts +201 -259
- package/dist/index.d.ts +201 -259
- package/dist/index.js +706 -38696
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +723 -930
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +0 -1
- package/package.json +1 -2
- package/dist/chunk-B6TIE56N.mjs +0 -37779
- package/dist/chunk-B6TIE56N.mjs.map +0 -1
- package/dist/chunk-NSSMTXJJ.mjs +0 -8
- package/dist/chunk-NSSMTXJJ.mjs.map +0 -1
- package/dist/transformers.web-T5LWC34T.mjs +0 -1718
- package/dist/transformers.web-T5LWC34T.mjs.map +0 -1
package/dist/index.mjs
CHANGED
|
@@ -12,11 +12,6 @@ import {
|
|
|
12
12
|
setLogLevel,
|
|
13
13
|
setLoggingEnabled
|
|
14
14
|
} from "./chunk-ESU52TDS.mjs";
|
|
15
|
-
import {
|
|
16
|
-
__webpack_exports__env,
|
|
17
|
-
__webpack_exports__pipeline
|
|
18
|
-
} from "./chunk-B6TIE56N.mjs";
|
|
19
|
-
import "./chunk-NSSMTXJJ.mjs";
|
|
20
15
|
|
|
21
16
|
// src/audio/MicrophoneCapture.ts
|
|
22
17
|
var MicrophoneCapture = class {
|
|
@@ -2274,6 +2269,14 @@ function getSessionOptions(backend) {
|
|
|
2274
2269
|
graphOptimizationLevel: "all"
|
|
2275
2270
|
};
|
|
2276
2271
|
}
|
|
2272
|
+
if (isIOS()) {
|
|
2273
|
+
return {
|
|
2274
|
+
executionProviders: ["wasm"],
|
|
2275
|
+
graphOptimizationLevel: "basic",
|
|
2276
|
+
enableCpuMemArena: false,
|
|
2277
|
+
enableMemPattern: false
|
|
2278
|
+
};
|
|
2279
|
+
}
|
|
2277
2280
|
return {
|
|
2278
2281
|
executionProviders: ["wasm"],
|
|
2279
2282
|
graphOptimizationLevel: "all"
|
|
@@ -2549,77 +2552,108 @@ var Wav2Vec2Inference = class {
|
|
|
2549
2552
|
this.ort = ort;
|
|
2550
2553
|
this._backend = backend;
|
|
2551
2554
|
logger2.info("ONNX Runtime loaded", { backend: this._backend });
|
|
2552
|
-
const cache = getModelCache();
|
|
2553
2555
|
const modelUrl = this.config.modelUrl;
|
|
2554
|
-
const
|
|
2555
|
-
|
|
2556
|
-
|
|
2557
|
-
|
|
2558
|
-
|
|
2559
|
-
|
|
2560
|
-
|
|
2561
|
-
|
|
2562
|
-
|
|
2563
|
-
|
|
2556
|
+
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
2557
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
2558
|
+
let isCached = false;
|
|
2559
|
+
if (isIOS()) {
|
|
2560
|
+
logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
2561
|
+
modelUrl,
|
|
2562
|
+
dataUrl
|
|
2563
|
+
});
|
|
2564
|
+
if (dataUrl) {
|
|
2565
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
2566
|
+
logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
|
|
2567
|
+
sessionOptions.externalData = [{
|
|
2568
|
+
path: dataFilename,
|
|
2569
|
+
data: dataUrl
|
|
2570
|
+
// URL string — ORT fetches directly into WASM
|
|
2571
|
+
}];
|
|
2564
2572
|
}
|
|
2565
|
-
|
|
2566
|
-
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
throw new Error(errorMsg);
|
|
2573
|
-
}
|
|
2574
|
-
let externalDataBuffer = null;
|
|
2575
|
-
if (this.config.externalDataUrl !== false) {
|
|
2576
|
-
const dataUrl = typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`;
|
|
2573
|
+
logger2.info("iOS: calling InferenceSession.create() with URL string", {
|
|
2574
|
+
modelUrl,
|
|
2575
|
+
sessionOptions: JSON.stringify(
|
|
2576
|
+
sessionOptions,
|
|
2577
|
+
(_, v) => typeof v === "string" && v.length > 100 ? v.slice(0, 100) + "..." : v
|
|
2578
|
+
)
|
|
2579
|
+
});
|
|
2577
2580
|
try {
|
|
2578
|
-
|
|
2579
|
-
|
|
2580
|
-
|
|
2581
|
-
|
|
2582
|
-
|
|
2583
|
-
|
|
2584
|
-
|
|
2581
|
+
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
2582
|
+
} catch (sessionErr) {
|
|
2583
|
+
logger2.error("iOS: InferenceSession.create() failed", {
|
|
2584
|
+
error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
|
|
2585
|
+
errorType: sessionErr?.constructor?.name,
|
|
2586
|
+
stack: sessionErr instanceof Error ? sessionErr.stack : void 0
|
|
2587
|
+
});
|
|
2588
|
+
throw sessionErr;
|
|
2589
|
+
}
|
|
2590
|
+
logger2.info("iOS: session created successfully", {
|
|
2591
|
+
inputNames: this.session.inputNames,
|
|
2592
|
+
outputNames: this.session.outputNames
|
|
2593
|
+
});
|
|
2594
|
+
} else {
|
|
2595
|
+
const cache = getModelCache();
|
|
2596
|
+
isCached = await cache.has(modelUrl);
|
|
2597
|
+
let modelBuffer;
|
|
2598
|
+
if (isCached) {
|
|
2599
|
+
logger2.debug("Loading model from cache", { modelUrl });
|
|
2600
|
+
modelBuffer = await cache.get(modelUrl);
|
|
2601
|
+
if (!modelBuffer) {
|
|
2602
|
+
logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
2603
|
+
await cache.delete(modelUrl);
|
|
2604
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
2605
|
+
}
|
|
2606
|
+
} else {
|
|
2607
|
+
logger2.debug("Fetching and caching model", { modelUrl });
|
|
2608
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
2609
|
+
}
|
|
2610
|
+
if (!modelBuffer) {
|
|
2611
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
2612
|
+
}
|
|
2613
|
+
let externalDataBuffer = null;
|
|
2614
|
+
if (dataUrl) {
|
|
2615
|
+
try {
|
|
2616
|
+
const isDataCached = await cache.has(dataUrl);
|
|
2617
|
+
if (isDataCached) {
|
|
2618
|
+
logger2.debug("Loading external data from cache", { dataUrl });
|
|
2619
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
2620
|
+
if (!externalDataBuffer) {
|
|
2621
|
+
logger2.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
2622
|
+
await cache.delete(dataUrl);
|
|
2623
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2624
|
+
}
|
|
2625
|
+
} else {
|
|
2626
|
+
logger2.info("Fetching external model data", {
|
|
2627
|
+
dataUrl,
|
|
2628
|
+
note: "This may be a large download (383MB+)"
|
|
2629
|
+
});
|
|
2585
2630
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2586
2631
|
}
|
|
2587
|
-
|
|
2588
|
-
|
|
2632
|
+
logger2.info("External data loaded", {
|
|
2633
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
2634
|
+
});
|
|
2635
|
+
} catch (err) {
|
|
2636
|
+
logger2.debug("No external data file found (single-file model)", {
|
|
2589
2637
|
dataUrl,
|
|
2590
|
-
|
|
2638
|
+
error: err.message
|
|
2591
2639
|
});
|
|
2592
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2593
2640
|
}
|
|
2594
|
-
logger2.info("External data loaded", {
|
|
2595
|
-
size: formatBytes(externalDataBuffer.byteLength)
|
|
2596
|
-
});
|
|
2597
|
-
} catch (err) {
|
|
2598
|
-
logger2.debug("No external data file found (single-file model)", {
|
|
2599
|
-
dataUrl,
|
|
2600
|
-
error: err.message
|
|
2601
|
-
});
|
|
2602
2641
|
}
|
|
2642
|
+
logger2.debug("Creating ONNX session", {
|
|
2643
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2644
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2645
|
+
backend: this._backend
|
|
2646
|
+
});
|
|
2647
|
+
if (externalDataBuffer) {
|
|
2648
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
2649
|
+
sessionOptions.externalData = [{
|
|
2650
|
+
path: dataFilename,
|
|
2651
|
+
data: new Uint8Array(externalDataBuffer)
|
|
2652
|
+
}];
|
|
2653
|
+
}
|
|
2654
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
2655
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2603
2656
|
}
|
|
2604
|
-
logger2.debug("Creating ONNX session", {
|
|
2605
|
-
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2606
|
-
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2607
|
-
backend: this._backend
|
|
2608
|
-
});
|
|
2609
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
2610
|
-
if (externalDataBuffer) {
|
|
2611
|
-
const dataFilename = (typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`).split("/").pop();
|
|
2612
|
-
sessionOptions.externalData = [{
|
|
2613
|
-
path: dataFilename,
|
|
2614
|
-
data: new Uint8Array(externalDataBuffer)
|
|
2615
|
-
}];
|
|
2616
|
-
}
|
|
2617
|
-
logger2.info("Creating session with execution provider", {
|
|
2618
|
-
executionProvider: this._backend,
|
|
2619
|
-
hasExternalData: !!externalDataBuffer
|
|
2620
|
-
});
|
|
2621
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
2622
|
-
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2623
2657
|
logger2.info("ONNX session created successfully", {
|
|
2624
2658
|
executionProvider: this._backend,
|
|
2625
2659
|
backend: this._backend
|
|
@@ -2634,7 +2668,7 @@ var Wav2Vec2Inference = class {
|
|
|
2634
2668
|
span?.setAttributes({
|
|
2635
2669
|
"model.backend": this._backend,
|
|
2636
2670
|
"model.load_time_ms": loadTimeMs,
|
|
2637
|
-
"model.cached": isCached
|
|
2671
|
+
"model.cached": !isIOS() && isCached
|
|
2638
2672
|
});
|
|
2639
2673
|
span?.end();
|
|
2640
2674
|
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
@@ -2644,12 +2678,23 @@ var Wav2Vec2Inference = class {
|
|
|
2644
2678
|
logger2.debug("Running warmup inference to initialize GPU context");
|
|
2645
2679
|
const warmupStart = performance.now();
|
|
2646
2680
|
const silentAudio = new Float32Array(16e3);
|
|
2647
|
-
|
|
2681
|
+
const WARMUP_TIMEOUT_MS = 15e3;
|
|
2682
|
+
const warmupResult = await Promise.race([
|
|
2683
|
+
this.infer(silentAudio, 0).then(() => "ok"),
|
|
2684
|
+
new Promise((r) => setTimeout(() => r("timeout"), WARMUP_TIMEOUT_MS))
|
|
2685
|
+
]);
|
|
2648
2686
|
const warmupTimeMs = performance.now() - warmupStart;
|
|
2649
|
-
|
|
2650
|
-
|
|
2651
|
-
|
|
2652
|
-
|
|
2687
|
+
if (warmupResult === "timeout") {
|
|
2688
|
+
logger2.warn("Warmup inference timed out \u2014 GPU may be unresponsive. Continuing without warmup.", {
|
|
2689
|
+
timeoutMs: WARMUP_TIMEOUT_MS,
|
|
2690
|
+
backend: this._backend
|
|
2691
|
+
});
|
|
2692
|
+
} else {
|
|
2693
|
+
logger2.info("Warmup inference complete", {
|
|
2694
|
+
warmupTimeMs: Math.round(warmupTimeMs),
|
|
2695
|
+
backend: this._backend
|
|
2696
|
+
});
|
|
2697
|
+
}
|
|
2653
2698
|
telemetry?.recordHistogram("omote.model.warmup_time", warmupTimeMs, {
|
|
2654
2699
|
model: "wav2vec2",
|
|
2655
2700
|
backend: this._backend
|
|
@@ -2837,334 +2882,316 @@ LAM_BLENDSHAPES.forEach((name, index) => {
|
|
|
2837
2882
|
});
|
|
2838
2883
|
var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
|
|
2839
2884
|
|
|
2840
|
-
// src/inference/
|
|
2841
|
-
|
|
2842
|
-
|
|
2843
|
-
|
|
2844
|
-
|
|
2845
|
-
|
|
2846
|
-
|
|
2847
|
-
|
|
2848
|
-
|
|
2849
|
-
|
|
2850
|
-
|
|
2851
|
-
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
|
|
2855
|
-
|
|
2856
|
-
|
|
2857
|
-
|
|
2858
|
-
|
|
2859
|
-
|
|
2860
|
-
|
|
2861
|
-
|
|
2862
|
-
|
|
2863
|
-
|
|
2864
|
-
|
|
2865
|
-
|
|
2866
|
-
|
|
2867
|
-
|
|
2868
|
-
|
|
2869
|
-
|
|
2870
|
-
|
|
2871
|
-
|
|
2872
|
-
|
|
2873
|
-
|
|
2874
|
-
|
|
2875
|
-
|
|
2876
|
-
|
|
2877
|
-
|
|
2878
|
-
|
|
2879
|
-
if (this.pipeline !== null && this.currentModel === modelName) {
|
|
2880
|
-
logger4.debug("Model already loaded", { model: modelName });
|
|
2881
|
-
return;
|
|
2882
|
-
}
|
|
2883
|
-
this.isLoading = true;
|
|
2884
|
-
const telemetry = getTelemetry();
|
|
2885
|
-
const span = telemetry?.startSpan("whisper.load", {
|
|
2886
|
-
"whisper.model": modelName,
|
|
2887
|
-
"whisper.dtype": this.config.dtype,
|
|
2888
|
-
"whisper.device": this.config.device
|
|
2889
|
-
});
|
|
2890
|
-
try {
|
|
2891
|
-
const loadStart = performance.now();
|
|
2892
|
-
logger4.info("Loading model", {
|
|
2893
|
-
model: modelName,
|
|
2894
|
-
dtype: this.config.dtype,
|
|
2895
|
-
device: this.config.device,
|
|
2896
|
-
multilingual: this.config.multilingual
|
|
2897
|
-
});
|
|
2898
|
-
if (this.pipeline !== null && this.currentModel !== modelName) {
|
|
2899
|
-
logger4.debug("Disposing old model", { oldModel: this.currentModel });
|
|
2900
|
-
await this.pipeline.dispose();
|
|
2901
|
-
this.pipeline = null;
|
|
2902
|
-
}
|
|
2903
|
-
const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
|
|
2904
|
-
const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
|
|
2905
|
-
logger4.info("Creating pipeline", { device, hasWebGPU });
|
|
2906
|
-
__webpack_exports__env.allowLocalModels = false;
|
|
2907
|
-
__webpack_exports__env.allowRemoteModels = true;
|
|
2908
|
-
__webpack_exports__env.useBrowserCache = false;
|
|
2909
|
-
__webpack_exports__env.useCustomCache = false;
|
|
2910
|
-
__webpack_exports__env.useWasmCache = false;
|
|
2911
|
-
if (__webpack_exports__env.backends.onnx.wasm) {
|
|
2912
|
-
__webpack_exports__env.backends.onnx.wasm.proxy = false;
|
|
2913
|
-
__webpack_exports__env.backends.onnx.wasm.numThreads = 1;
|
|
2914
|
-
}
|
|
2915
|
-
logger4.info("Configured transformers.js env", {
|
|
2916
|
-
allowLocalModels: __webpack_exports__env.allowLocalModels,
|
|
2917
|
-
useBrowserCache: __webpack_exports__env.useBrowserCache,
|
|
2918
|
-
useWasmCache: __webpack_exports__env.useWasmCache
|
|
2919
|
-
});
|
|
2920
|
-
const pipelineOptions = {
|
|
2921
|
-
dtype: this.config.dtype,
|
|
2922
|
-
device,
|
|
2923
|
-
progress_callback: onProgress,
|
|
2924
|
-
// For medium models, use no_attentions revision to save memory
|
|
2925
|
-
revision: modelName.includes("whisper-medium") ? "no_attentions" : "main",
|
|
2926
|
-
// Pass HuggingFace token to bypass rate limits
|
|
2927
|
-
...this.config.token && { token: this.config.token }
|
|
2928
|
-
};
|
|
2929
|
-
if (device === "webgpu") {
|
|
2930
|
-
pipelineOptions.session_options = {
|
|
2931
|
-
executionProviders: ["webgpu"]
|
|
2932
|
-
};
|
|
2933
|
-
logger4.info("Forcing WebGPU execution providers");
|
|
2885
|
+
// src/inference/kaldiFbank.ts
|
|
2886
|
+
/**
 * In-place forward FFT (iterative radix-2) of a complex signal.
 *
 * The signal is passed as two parallel arrays holding the real and imaginary
 * parts; both are overwritten with the transform. The twiddle angle is
 * `-2π / len`, i.e. the forward-transform sign convention.
 *
 * @param {Float64Array|number[]} re - Real parts; overwritten with the result.
 * @param {Float64Array|number[]} im - Imaginary parts; overwritten with the result.
 * @throws {RangeError} If `re` and `im` differ in length, or the length is not
 *   a power of two (the butterfly passes below are only correct for 2^k points).
 */
function fft(re, im) {
  const n = re.length;
  if (im.length !== n) {
    throw new RangeError(`fft: re and im must have the same length (got ${n} and ${im.length})`);
  }
  // n & (n - 1) clears the lowest set bit; it is zero only for 0 and powers of two.
  if ((n & (n - 1)) !== 0) {
    throw new RangeError(`fft: length must be a power of two (got ${n})`);
  }

  // Bit-reversal permutation: reorder the input so the butterflies can run in place.
  for (let i = 1, j = 0; i < n; i++) {
    let bit = n >> 1;
    while (j & bit) {
      j ^= bit;
      bit >>= 1;
    }
    j ^= bit;
    if (i < j) {
      const swapRe = re[i];
      re[i] = re[j];
      re[j] = swapRe;
      const swapIm = im[i];
      im[i] = im[j];
      im[j] = swapIm;
    }
  }

  // Butterfly passes over sub-transforms of size 2, 4, ..., n.
  for (let len = 2; len <= n; len *= 2) {
    const halfLen = len / 2;
    const angle = -2 * Math.PI / len;
    const wRe = Math.cos(angle);
    const wIm = Math.sin(angle);
    for (let i = 0; i < n; i += len) {
      // (curRe, curIm) is the running twiddle factor, advanced by one step of w per j.
      let curRe = 1;
      let curIm = 0;
      for (let j = 0; j < halfLen; j++) {
        const a = i + j;
        const b = a + halfLen;
        const tRe = curRe * re[b] - curIm * im[b];
        const tIm = curRe * im[b] + curIm * re[b];
        re[b] = re[a] - tRe;
        im[b] = im[a] - tIm;
        re[a] += tRe;
        im[a] += tIm;
        const nextRe = curRe * wRe - curIm * wIm;
        curIm = curRe * wIm + curIm * wRe;
        curRe = nextRe;
      }
    }
  }
}
|
|
2928
|
+
/**
 * Convert a frequency to the mel scale using `1127 * ln(1 + f / 700)`.
 *
 * @param {number} freq - Frequency value (Hz, judging by the callers' sample-rate arithmetic).
 * @returns {number} The corresponding mel value.
 */
function htkMel(freq) {
  const ratio = 1 + freq / 700;
  return 1127 * Math.log(ratio);
}
|
|
2931
|
+
/**
 * Convert a mel value back to a frequency: `700 * (exp(mel / 1127) - 1)`.
 * This is the algebraic inverse of `1127 * ln(1 + f / 700)`.
 *
 * @param {number} mel - Mel-scale value.
 * @returns {number} The corresponding frequency.
 */
function htkMelInverse(mel) {
  const growth = Math.exp(mel / 1127);
  return 700 * (growth - 1);
}
|
|
2934
|
+
/**
 * Build a bank of triangular filters whose edges are equally spaced on the
 * mel scale between `lowFreq` and `highFreq`.
 *
 * Edge positions are expressed in (fractional) FFT-bin units, and each
 * triangle is linear in that bin domain. Every filter is stored sparsely as
 * the first FFT bin it touches plus the weights for the consecutive bins.
 *
 * @param {number} numBins - Number of mel filters to create.
 * @param {number} fftSize - FFT length; bins `0 .. fftSize / 2` are usable.
 * @param {number} sampleRate - Sample rate used to map frequency to FFT bin.
 * @param {number} lowFreq - Lower edge of the first filter.
 * @param {number} highFreq - Upper edge of the last filter.
 * @returns {{startBin: number, weights: Float32Array}[]} One entry per filter.
 */
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
  const numFftBins = fftSize / 2 + 1;
  const melLow = htkMel(lowFreq);
  const melHigh = htkMel(highFreq);

  // numBins filters need numBins + 2 edge points (left, centre, right overlap).
  const edges = new Float64Array(numBins + 2).map((_, i) => {
    const mel = melLow + (melHigh - melLow) * i / (numBins + 1);
    return htkMelInverse(mel) * fftSize / sampleRate;
  });

  const filters = [];
  for (let m = 0; m < numBins; m++) {
    const left = edges[m];
    const center = edges[m + 1];
    const right = edges[m + 2];
    const risingWidth = center - left;
    const fallingWidth = right - center;

    // Only integer bins inside [left, right] (clamped to the spectrum) get a weight.
    const startBin = Math.max(0, Math.ceil(left));
    const endBin = Math.min(numFftBins - 1, Math.floor(right));
    const weights = new Float32Array(endBin - startBin + 1);

    for (let idx = 0; idx < weights.length; idx++) {
      const k = startBin + idx;
      if (k <= center) {
        weights[idx] = risingWidth > 0 ? (k - left) / risingWidth : 0;
      } else {
        weights[idx] = fallingWidth > 0 ? (right - k) / fallingWidth : 0;
      }
    }
    filters.push({ startBin, weights });
  }
  return filters;
}
|
|
2965
|
+
/**
 * Create a symmetric Hamming window: `0.54 - 0.46 * cos(2πi / (length - 1))`.
 *
 * @param {number} length - Number of window samples.
 * @returns {Float32Array} The window coefficients (empty when `length` is 0).
 */
function createHammingWindow(length) {
  const window2 = new Float32Array(length);
  if (length === 1) {
    // The general formula divides 0 by 0 here and would yield NaN;
    // a single-point window is conventionally 1.
    window2[0] = 1;
    return window2;
  }
  for (let i = 0; i < length; i++) {
    window2[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
  }
  return window2;
}
|
|
2972
|
+
/**
 * Compute log mel-filterbank energies for an audio buffer.
 *
 * Steps, in order: scale samples by 32768, optionally add Gaussian dither,
 * split into overlapping frames, apply pre-emphasis and a Hamming window,
 * take the FFT power spectrum (zero-padded to the next power of two), apply
 * the mel filterbank, and take the natural log (floored at 1e-10).
 *
 * @param {ArrayLike<number>} audio - Input samples.
 *   NOTE(review): the 32768 scale suggests floats in [-1, 1] — confirm with callers.
 * @param {number} sampleRate - Sample rate of `audio`.
 * @param {number} numMelBins - Number of mel filters per frame.
 * @param {object} [opts] - Optional overrides.
 * @param {number} [opts.frameLengthMs=25] - Frame length in milliseconds.
 * @param {number} [opts.frameShiftMs=10] - Hop between frames in milliseconds.
 * @param {number} [opts.lowFreq=20] - Lower edge of the filterbank.
 * @param {number} [opts.highFreq=sampleRate/2] - Upper edge of the filterbank.
 * @param {number} [opts.dither=0] - Standard deviation of the added noise; 0 disables it.
 * @param {number} [opts.preemphasis=0.97] - Pre-emphasis coefficient; 0 disables it.
 * @returns {Float32Array} Frame-major features of length `numFrames * numMelBins`
 *   (empty when the audio is shorter than one frame).
 */
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
  const settings = opts ?? {};
  const frameLengthMs = settings.frameLengthMs ?? 25;
  const frameShiftMs = settings.frameShiftMs ?? 10;
  const lowFreq = settings.lowFreq ?? 20;
  const highFreq = settings.highFreq ?? sampleRate / 2;
  const dither = settings.dither ?? 0;
  const preemphasis = settings.preemphasis ?? 0.97;

  const samplesPerFrame = Math.round(sampleRate * frameLengthMs / 1e3);
  const samplesPerShift = Math.round(sampleRate * frameShiftMs / 1e3);

  const pcm = new Float32Array(audio.length);
  for (let idx = 0; idx < audio.length; idx++) {
    pcm[idx] = audio[idx] * 32768;
  }

  if (dither > 0) {
    // Box-Muller transform; the 1e-10 keeps log() away from zero.
    for (let idx = 0; idx < pcm.length; idx++) {
      const u1 = Math.random();
      const u2 = Math.random();
      const radius = dither * Math.sqrt(-2 * Math.log(u1 + 1e-10));
      pcm[idx] += radius * Math.cos(2 * Math.PI * u2);
    }
  }

  // Only whole frames are produced; a trailing partial frame is dropped.
  const numFrames = Math.max(0, Math.floor((pcm.length - samplesPerFrame) / samplesPerShift) + 1);
  if (numFrames === 0) {
    return new Float32Array(0);
  }

  // Smallest power of two that holds one frame.
  let fftSize = 1;
  while (fftSize < samplesPerFrame) {
    fftSize *= 2;
  }
  const numFftBins = fftSize / 2 + 1;

  const hamming = createHammingWindow(samplesPerFrame);
  const filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
  const output = new Float32Array(numFrames * numMelBins);
  const realPart = new Float64Array(fftSize);
  const imagPart = new Float64Array(fftSize);

  for (let frame = 0; frame < numFrames; frame++) {
    const start = frame * samplesPerShift;
    realPart.fill(0);
    imagPart.fill(0);

    for (let i = 0; i < samplesPerFrame; i++) {
      const pos = start + i;
      let value = pcm[pos];
      // Pre-emphasis uses the previous sample of the whole signal, so only
      // the very first sample of the buffer is left untouched.
      if (preemphasis > 0 && pos > 0) {
        value -= preemphasis * pcm[pos - 1];
      }
      realPart[i] = value * hamming[i];
    }

    fft(realPart, imagPart);

    const rowStart = frame * numMelBins;
    for (let m = 0; m < numMelBins; m++) {
      const { startBin, weights } = filters[m];
      let energy = 0;
      for (let k = 0; k < weights.length; k++) {
        const bin = startBin + k;
        if (bin < numFftBins) {
          const power = realPart[bin] * realPart[bin] + imagPart[bin] * imagPart[bin];
          energy += weights[k] * power;
        }
      }
      output[rowStart + m] = Math.log(Math.max(energy, 1e-10));
    }
  }
  return output;
}
|
|
3034
|
+
/**
 * Stack consecutive feature frames into wider, lower-rate frames.
 *
 * Each output frame concatenates `lfrM` input frames, and successive output
 * frames start `lfrN` input frames apart. The sequence is treated as if it
 * had `floor((lfrM - 1) / 2)` copies of the first frame prepended; reads past
 * the end repeat the last frame.
 *
 * @param {ArrayLike<number>} features - Frame-major features (`numFrames * featureDim`).
 * @param {number} featureDim - Values per input frame.
 * @param {number} [lfrM=7] - Number of input frames stacked per output frame.
 * @param {number} [lfrN=6] - Input-frame stride between output frames.
 * @returns {Float32Array} Frame-major output with `featureDim * lfrM` values per frame.
 */
function applyLFR(features, featureDim, lfrM = 7, lfrN = 6) {
  const frameCount = features.length / featureDim;
  if (frameCount === 0) {
    return new Float32Array(0);
  }

  const leadPad = Math.floor((lfrM - 1) / 2);
  const stackedDim = featureDim * lfrM;
  const stackedCount = Math.ceil((frameCount + leadPad) / lfrN);
  const stacked = new Float32Array(stackedCount * stackedDim);

  for (let outIdx = 0; outIdx < stackedCount; outIdx++) {
    const firstFrame = outIdx * lfrN - leadPad;
    for (let slot = 0; slot < lfrM; slot++) {
      const wanted = firstFrame + slot;
      // Clamp to the valid range: replicate the first/last frame at the edges.
      const source = wanted < 0 ? 0 : wanted >= frameCount ? frameCount - 1 : wanted;
      const readPos = source * featureDim;
      const writePos = outIdx * stackedDim + slot * featureDim;
      for (let k = 0; k < featureDim; k++) {
        stacked[writePos + k] = features[readPos + k];
      }
    }
  }
  return stacked;
}
|
|
3057
|
+
/**
 * Normalize features in place: `(x + negMean[d]) * invStddev[d]`, where `d`
 * is the element's position within its frame (`index % dim`).
 *
 * @param {Float32Array|number[]} features - Frame-major features; modified in place.
 * @param {number} dim - Values per frame.
 * @param {ArrayLike<number>} negMean - Per-dimension offset that is added.
 * @param {ArrayLike<number>} invStddev - Per-dimension scale that is multiplied.
 * @returns {Float32Array|number[]} The same `features` object that was passed in.
 */
function applyCMVN(features, dim, negMean, invStddev) {
  const total = features.length;
  for (let idx = 0; idx < total; idx++) {
    const column = idx % dim;
    const shifted = features[idx] + negMean[column];
    features[idx] = shifted * invStddev[column];
  }
  return features;
}
|
|
3064
|
+
/**
 * Parse two comma-separated lists of floats into typed arrays.
 *
 * Empty fields (for example from a trailing comma or an empty string) are
 * skipped rather than being parsed into NaN, which would otherwise propagate
 * silently through every normalized feature.
 *
 * @param {string} negMeanStr - Comma-separated values for `negMean`.
 * @param {string} invStddevStr - Comma-separated values for `invStddev`.
 * @returns {{negMean: Float32Array, invStddev: Float32Array}} The parsed vectors.
 */
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
  const parseList = (csv) => {
    const values = [];
    for (const field of csv.split(",")) {
      const trimmed = field.trim();
      if (trimmed !== "") {
        values.push(Number.parseFloat(trimmed));
      }
    }
    return new Float32Array(values);
  };
  const negMean = parseList(negMeanStr);
  const invStddev = parseList(invStddevStr);
  return { negMean, invStddev };
}
|
|
3073
|
+
|
|
3074
|
+
// src/inference/ctcDecoder.ts
|
|
3075
|
+
/**
 * Map a language code to its numeric id.
 *
 * Known codes: `auto` → 0, `zh` → 3, `en` → 4, `yue` → 7, `ja` → 11, `ko` → 12.
 * Anything else resolves to 0 (the same id as `auto`).
 * NOTE(review): the ids presumably match the model's language indices — confirm.
 *
 * @param {string} language - Language code.
 * @returns {number} The numeric language id.
 */
function resolveLanguageId(language) {
  const map = {
    auto: 0,
    zh: 3,
    en: 4,
    yue: 7,
    ja: 11,
    ko: 12
  };
  // Own-property check: a bare `map[language]` would return inherited members
  // (e.g. "constructor", "toString") instead of falling back to 0.
  return Object.prototype.hasOwnProperty.call(map, language) ? map[language] : 0;
}
|
|
3086
|
+
/**
 * Map a text-normalization mode to its numeric id.
 *
 * @param {string} textNorm - Normalization mode.
 * @returns {number} 15 for `"without_itn"`, 14 for every other value.
 */
function resolveTextNormId(textNorm) {
  if (textNorm === "without_itn") {
    return 15;
  }
  return 14;
}
|
|
3089
|
+
/**
 * Parse a tokens file into an id → token lookup.
 *
 * Each non-empty line is expected to be `<token><whitespace><id>`. The id is
 * whatever follows the last whitespace character, so tokens may themselves
 * contain spaces. Both space- and tab-separated files are accepted. Lines
 * with no separator or a non-numeric id are ignored.
 *
 * @param {string} content - Full text of the tokens file.
 * @returns {Map<number, string>} Map from token id to token text.
 */
function parseTokensFile(content) {
  const map = /* @__PURE__ */ new Map();
  for (const line of content.split("\n")) {
    // trim() also strips a trailing "\r" from CRLF files.
    const trimmed = line.trim();
    if (!trimmed) continue;
    // Index of the last whitespace character (space or tab) on the line.
    const separator = trimmed.search(/\s\S*$/);
    if (separator === -1) continue;
    const token = trimmed.substring(0, separator);
    const id = Number.parseInt(trimmed.substring(separator + 1), 10);
    if (!Number.isNaN(id)) {
      map.set(id, token);
    }
  }
  return map;
}
|
|
3105
|
+
/**
 * Classify a special token of the form `<|value|>`.
 *
 * Recognized values fall into four groups: language, emotion, event and
 * textnorm. Tokens that are not wrapped in `<|...|>`, or whose value is not
 * in any group, yield `null`.
 *
 * @param {string} token - Token text.
 * @returns {{type: string, value: string}|null} The category and inner value, or `null`.
 */
function parseStructuredToken(token) {
  const wrapped = token.match(/^<\|(.+)\|>$/);
  if (!wrapped) {
    return null;
  }
  const value = wrapped[1];

  // The groups are disjoint, so the first group containing the value decides the type.
  const groups = [
    ["language", ["zh", "en", "ja", "ko", "yue", "nospeech"]],
    ["emotion", ["HAPPY", "SAD", "ANGRY", "NEUTRAL", "FEARFUL", "DISGUSTED", "SURPRISED", "EMO_UNKNOWN"]],
    ["event", ["Speech", "BGM", "Applause", "Laughter", "Crying", "Coughing", "Sneezing", "EVENT_UNKNOWN"]],
    ["textnorm", ["withitn", "woitn", "with_itn", "without_itn"]]
  ];
  for (const [type, members] of groups) {
    if (members.includes(value)) {
      return { type, value };
    }
  }
  return null;
}
|
|
3125
|
+
/**
 * Greedy CTC decoding of a flat logits buffer.
 *
 * For every time step the highest-scoring token id is chosen (the lowest id
 * wins ties). Consecutive repeats are collapsed, ids 0, 1 and 2 are dropped,
 * and the remaining ids are looked up in `tokenMap`. Tokens recognized by
 * `parseStructuredToken` fill the `language` / `emotion` / `event` fields
 * (the last occurrence wins; textnorm tokens are discarded). All other
 * tokens are concatenated, with U+2581 replaced by a space.
 * NOTE(review): ids 0-2 are presumably blank/special symbols — confirm
 * against the tokens file.
 *
 * @param {ArrayLike<number>} logits - Scores laid out as `seqLen * vocabSize`, time-major.
 * @param {number} seqLen - Number of time steps to decode.
 * @param {number} vocabSize - Number of scores per time step.
 * @param {Map<number, string>} tokenMap - Token id → token text.
 * @returns {{text: string, language: (string|undefined), emotion: (string|undefined), event: (string|undefined)}}
 *   The decoded text plus any structured tags found.
 */
function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
  const meta = { language: void 0, emotion: void 0, event: void 0 };
  const pieces = [];
  let previousId = -1;

  for (let t = 0; t < seqLen; t++) {
    // Arg-max over this time step's slice of the logits.
    const base = t * vocabSize;
    let bestId = 0;
    let bestScore = logits[base];
    for (let v = 1; v < vocabSize; v++) {
      const score = logits[base + v];
      if (score > bestScore) {
        bestScore = score;
        bestId = v;
      }
    }

    // Collapse runs of the same id before dropping the reserved ids, so a
    // reserved id between two equal tokens still separates them.
    if (bestId === previousId) continue;
    previousId = bestId;
    if (bestId === 0 || bestId === 1 || bestId === 2) continue;

    const token = tokenMap.get(bestId);
    if (!token) continue;

    const structured = parseStructuredToken(token);
    if (!structured) {
      pieces.push(token);
      continue;
    }
    switch (structured.type) {
      case "language":
        meta.language = structured.value;
        break;
      case "emotion":
        meta.emotion = structured.value;
        break;
      case "event":
        meta.event = structured.value;
        break;
      default:
        // Other structured tokens (e.g. textnorm) are intentionally dropped.
        break;
    }
  }

  const text = pieces.join("").replace(/\u2581/g, " ").trim();
  return { text, language: meta.language, emotion: meta.emotion, event: meta.event };
}
|
|
3155
3168
|
|
|
3156
|
-
// src/inference/
|
|
3157
|
-
var
|
|
3158
|
-
var
|
|
3169
|
+
// src/inference/SenseVoiceInference.ts
|
|
3170
|
+
var logger4 = createLogger("SenseVoice");
|
|
3171
|
+
var SenseVoiceInference = class {
|
|
3159
3172
|
constructor(config) {
|
|
3160
|
-
this.modelId = "wav2arkit_cpu";
|
|
3161
3173
|
this.session = null;
|
|
3162
3174
|
this.ort = null;
|
|
3163
3175
|
this._backend = "wasm";
|
|
3164
3176
|
this.isLoading = false;
|
|
3165
|
-
// Inference queue for handling concurrent calls
|
|
3166
3177
|
this.inferenceQueue = Promise.resolve();
|
|
3167
|
-
|
|
3178
|
+
// Preprocessing state (loaded once)
|
|
3179
|
+
this.tokenMap = null;
|
|
3180
|
+
this.negMean = null;
|
|
3181
|
+
this.invStddev = null;
|
|
3182
|
+
this.languageId = 0;
|
|
3183
|
+
this.textNormId = 14;
|
|
3184
|
+
const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
|
|
3185
|
+
const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
|
|
3186
|
+
this.config = {
|
|
3187
|
+
modelUrl: config.modelUrl,
|
|
3188
|
+
tokensUrl,
|
|
3189
|
+
language: config.language ?? "auto",
|
|
3190
|
+
textNorm: config.textNorm ?? "with_itn",
|
|
3191
|
+
backend: config.backend ?? "auto"
|
|
3192
|
+
};
|
|
3193
|
+
this.languageId = resolveLanguageId(this.config.language);
|
|
3194
|
+
this.textNormId = resolveTextNormId(this.config.textNorm);
|
|
3168
3195
|
}
|
|
3169
3196
|
get backend() {
|
|
3170
3197
|
return this.session ? this._backend : null;
|
|
@@ -3172,10 +3199,8 @@ var Wav2ArkitCpuInference = class {
|
|
|
3172
3199
|
get isLoaded() {
|
|
3173
3200
|
return this.session !== null;
|
|
3174
3201
|
}
|
|
3175
|
-
|
|
3176
|
-
|
|
3177
|
-
*/
|
|
3178
|
-
async load() {
|
|
3202
|
+
// ─── Load ───────────────────────────────────────────────────────────────
|
|
3203
|
+
async load(onProgress) {
|
|
3179
3204
|
if (this.isLoading) {
|
|
3180
3205
|
throw new Error("Model is already loading");
|
|
3181
3206
|
}
|
|
@@ -3185,30 +3210,281 @@ var Wav2ArkitCpuInference = class {
|
|
|
3185
3210
|
this.isLoading = true;
|
|
3186
3211
|
const startTime = performance.now();
|
|
3187
3212
|
const telemetry = getTelemetry();
|
|
3188
|
-
const span = telemetry?.startSpan("
|
|
3213
|
+
const span = telemetry?.startSpan("SenseVoice.load", {
|
|
3189
3214
|
"model.url": this.config.modelUrl,
|
|
3190
|
-
"model.backend_requested": this.config.backend
|
|
3215
|
+
"model.backend_requested": this.config.backend
|
|
3191
3216
|
});
|
|
3192
3217
|
try {
|
|
3193
|
-
|
|
3194
|
-
|
|
3195
|
-
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
3218
|
+
logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3219
|
+
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3196
3220
|
this.ort = ort;
|
|
3197
3221
|
this._backend = backend;
|
|
3198
|
-
|
|
3199
|
-
|
|
3200
|
-
const
|
|
3222
|
+
logger4.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3223
|
+
logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3224
|
+
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3225
|
+
if (!tokensResponse.ok) {
|
|
3226
|
+
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
3227
|
+
}
|
|
3228
|
+
const tokensText = await tokensResponse.text();
|
|
3229
|
+
this.tokenMap = parseTokensFile(tokensText);
|
|
3230
|
+
logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3201
3231
|
const sessionOptions = getSessionOptions(this._backend);
|
|
3232
|
+
if (this._backend === "webgpu") {
|
|
3233
|
+
sessionOptions.graphOptimizationLevel = "basic";
|
|
3234
|
+
}
|
|
3235
|
+
let isCached = false;
|
|
3202
3236
|
if (isIOS()) {
|
|
3203
|
-
|
|
3204
|
-
modelUrl
|
|
3205
|
-
dataUrl
|
|
3237
|
+
logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3238
|
+
modelUrl: this.config.modelUrl
|
|
3206
3239
|
});
|
|
3207
|
-
|
|
3208
|
-
|
|
3209
|
-
sessionOptions
|
|
3210
|
-
|
|
3211
|
-
|
|
3240
|
+
this.session = await this.ort.InferenceSession.create(
|
|
3241
|
+
this.config.modelUrl,
|
|
3242
|
+
sessionOptions
|
|
3243
|
+
);
|
|
3244
|
+
} else {
|
|
3245
|
+
const cache = getModelCache();
|
|
3246
|
+
isCached = await cache.has(this.config.modelUrl);
|
|
3247
|
+
let modelBuffer;
|
|
3248
|
+
if (isCached) {
|
|
3249
|
+
logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3250
|
+
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3251
|
+
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3252
|
+
} else {
|
|
3253
|
+
logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3254
|
+
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
3255
|
+
}
|
|
3256
|
+
logger4.debug("Creating ONNX session", {
|
|
3257
|
+
size: formatBytes(modelBuffer.byteLength),
|
|
3258
|
+
backend: this._backend
|
|
3259
|
+
});
|
|
3260
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
3261
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
3262
|
+
}
|
|
3263
|
+
try {
|
|
3264
|
+
const metadata = this.session.handler?.metadata;
|
|
3265
|
+
if (metadata?.neg_mean && metadata?.inv_stddev) {
|
|
3266
|
+
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
3267
|
+
this.negMean = cmvn.negMean;
|
|
3268
|
+
this.invStddev = cmvn.invStddev;
|
|
3269
|
+
logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
3270
|
+
} else {
|
|
3271
|
+
logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
3272
|
+
}
|
|
3273
|
+
} catch (cmvnErr) {
|
|
3274
|
+
logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
3275
|
+
}
|
|
3276
|
+
const loadTimeMs = performance.now() - startTime;
|
|
3277
|
+
logger4.info("SenseVoice model loaded", {
|
|
3278
|
+
backend: this._backend,
|
|
3279
|
+
loadTimeMs: Math.round(loadTimeMs),
|
|
3280
|
+
vocabSize: this.tokenMap.size,
|
|
3281
|
+
inputs: this.session.inputNames,
|
|
3282
|
+
outputs: this.session.outputNames,
|
|
3283
|
+
hasCMVN: this.negMean !== null
|
|
3284
|
+
});
|
|
3285
|
+
span?.setAttributes({
|
|
3286
|
+
"model.backend": this._backend,
|
|
3287
|
+
"model.load_time_ms": loadTimeMs,
|
|
3288
|
+
"model.cached": !isIOS() && isCached,
|
|
3289
|
+
"model.vocab_size": this.tokenMap.size
|
|
3290
|
+
});
|
|
3291
|
+
span?.end();
|
|
3292
|
+
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
3293
|
+
model: "sensevoice",
|
|
3294
|
+
backend: this._backend
|
|
3295
|
+
});
|
|
3296
|
+
return {
|
|
3297
|
+
backend: this._backend,
|
|
3298
|
+
loadTimeMs,
|
|
3299
|
+
inputNames: [...this.session.inputNames],
|
|
3300
|
+
outputNames: [...this.session.outputNames],
|
|
3301
|
+
vocabSize: this.tokenMap.size
|
|
3302
|
+
};
|
|
3303
|
+
} catch (error) {
|
|
3304
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
3305
|
+
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
3306
|
+
model: "sensevoice",
|
|
3307
|
+
error_type: "load_failed"
|
|
3308
|
+
});
|
|
3309
|
+
throw error;
|
|
3310
|
+
} finally {
|
|
3311
|
+
this.isLoading = false;
|
|
3312
|
+
}
|
|
3313
|
+
}
|
|
3314
|
+
// ─── Transcribe ─────────────────────────────────────────────────────────
|
|
3315
|
+
/**
|
|
3316
|
+
* Transcribe audio samples to text
|
|
3317
|
+
*
|
|
3318
|
+
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
3319
|
+
* @returns Transcription result with text, emotion, language, and event
|
|
3320
|
+
*/
|
|
3321
|
+
async transcribe(audioSamples) {
|
|
3322
|
+
if (!this.session || !this.ort || !this.tokenMap) {
|
|
3323
|
+
throw new Error("Model not loaded. Call load() first.");
|
|
3324
|
+
}
|
|
3325
|
+
const audio = new Float32Array(audioSamples);
|
|
3326
|
+
return this.queueInference(audio);
|
|
3327
|
+
}
|
|
3328
|
+
queueInference(audio) {
|
|
3329
|
+
return new Promise((resolve, reject) => {
|
|
3330
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
3331
|
+
const telemetry = getTelemetry();
|
|
3332
|
+
const span = telemetry?.startSpan("SenseVoice.transcribe", {
|
|
3333
|
+
"inference.backend": this._backend,
|
|
3334
|
+
"inference.input_samples": audio.length
|
|
3335
|
+
});
|
|
3336
|
+
try {
|
|
3337
|
+
const startTime = performance.now();
|
|
3338
|
+
const preprocessStart = performance.now();
|
|
3339
|
+
const fbank = computeKaldiFbank(audio, 16e3, 80);
|
|
3340
|
+
const numFrames = fbank.length / 80;
|
|
3341
|
+
if (numFrames === 0) {
|
|
3342
|
+
resolve({
|
|
3343
|
+
text: "",
|
|
3344
|
+
inferenceTimeMs: performance.now() - startTime,
|
|
3345
|
+
preprocessTimeMs: performance.now() - preprocessStart
|
|
3346
|
+
});
|
|
3347
|
+
return;
|
|
3348
|
+
}
|
|
3349
|
+
const lfrFeatures = applyLFR(fbank, 80, 7, 6);
|
|
3350
|
+
const numLfrFrames = lfrFeatures.length / 560;
|
|
3351
|
+
if (this.negMean && this.invStddev) {
|
|
3352
|
+
applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
|
|
3353
|
+
}
|
|
3354
|
+
const preprocessTimeMs = performance.now() - preprocessStart;
|
|
3355
|
+
const ort = this.ort;
|
|
3356
|
+
const feeds = {
|
|
3357
|
+
x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
|
|
3358
|
+
x_length: new ort.Tensor("int32", new Int32Array([numLfrFrames]), [1]),
|
|
3359
|
+
language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
|
|
3360
|
+
text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
|
|
3361
|
+
};
|
|
3362
|
+
const results = await this.session.run(feeds);
|
|
3363
|
+
const logitsOutput = results["logits"];
|
|
3364
|
+
if (!logitsOutput) {
|
|
3365
|
+
throw new Error('Model output missing "logits" tensor');
|
|
3366
|
+
}
|
|
3367
|
+
const logitsData = logitsOutput.data;
|
|
3368
|
+
const logitsDims = logitsOutput.dims;
|
|
3369
|
+
const seqLen = logitsDims[1];
|
|
3370
|
+
const vocabSize = logitsDims[2];
|
|
3371
|
+
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
3372
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
3373
|
+
logger4.trace("Transcription complete", {
|
|
3374
|
+
text: decoded.text.substring(0, 50),
|
|
3375
|
+
language: decoded.language,
|
|
3376
|
+
emotion: decoded.emotion,
|
|
3377
|
+
event: decoded.event,
|
|
3378
|
+
preprocessTimeMs: Math.round(preprocessTimeMs * 100) / 100,
|
|
3379
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3380
|
+
numFrames,
|
|
3381
|
+
numLfrFrames
|
|
3382
|
+
});
|
|
3383
|
+
span?.setAttributes({
|
|
3384
|
+
"inference.duration_ms": inferenceTimeMs,
|
|
3385
|
+
"inference.preprocess_ms": preprocessTimeMs,
|
|
3386
|
+
"inference.num_frames": numFrames,
|
|
3387
|
+
"inference.text_length": decoded.text.length
|
|
3388
|
+
});
|
|
3389
|
+
span?.end();
|
|
3390
|
+
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
3391
|
+
model: "sensevoice",
|
|
3392
|
+
backend: this._backend
|
|
3393
|
+
});
|
|
3394
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
3395
|
+
model: "sensevoice",
|
|
3396
|
+
backend: this._backend,
|
|
3397
|
+
status: "success"
|
|
3398
|
+
});
|
|
3399
|
+
resolve({
|
|
3400
|
+
text: decoded.text,
|
|
3401
|
+
language: decoded.language,
|
|
3402
|
+
emotion: decoded.emotion,
|
|
3403
|
+
event: decoded.event,
|
|
3404
|
+
inferenceTimeMs,
|
|
3405
|
+
preprocessTimeMs
|
|
3406
|
+
});
|
|
3407
|
+
} catch (err) {
|
|
3408
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3409
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
3410
|
+
model: "sensevoice",
|
|
3411
|
+
backend: this._backend,
|
|
3412
|
+
status: "error"
|
|
3413
|
+
});
|
|
3414
|
+
reject(err);
|
|
3415
|
+
}
|
|
3416
|
+
});
|
|
3417
|
+
});
|
|
3418
|
+
}
|
|
3419
|
+
// ─── Dispose ──────────────────────────────────────────────────────────
|
|
3420
|
+
async dispose() {
|
|
3421
|
+
if (this.session) {
|
|
3422
|
+
await this.session.release();
|
|
3423
|
+
this.session = null;
|
|
3424
|
+
}
|
|
3425
|
+
this.ort = null;
|
|
3426
|
+
this.tokenMap = null;
|
|
3427
|
+
this.negMean = null;
|
|
3428
|
+
this.invStddev = null;
|
|
3429
|
+
}
|
|
3430
|
+
};
|
|
3431
|
+
|
|
3432
|
+
// src/inference/Wav2ArkitCpuInference.ts
|
|
3433
|
+
var logger5 = createLogger("Wav2ArkitCpu");
|
|
3434
|
+
var Wav2ArkitCpuInference = class {
|
|
3435
|
+
constructor(config) {
|
|
3436
|
+
this.modelId = "wav2arkit_cpu";
|
|
3437
|
+
this.session = null;
|
|
3438
|
+
this.ort = null;
|
|
3439
|
+
this._backend = "wasm";
|
|
3440
|
+
this.isLoading = false;
|
|
3441
|
+
// Inference queue for handling concurrent calls
|
|
3442
|
+
this.inferenceQueue = Promise.resolve();
|
|
3443
|
+
this.config = config;
|
|
3444
|
+
}
|
|
3445
|
+
get backend() {
|
|
3446
|
+
return this.session ? this._backend : null;
|
|
3447
|
+
}
|
|
3448
|
+
get isLoaded() {
|
|
3449
|
+
return this.session !== null;
|
|
3450
|
+
}
|
|
3451
|
+
/**
|
|
3452
|
+
* Load the ONNX model
|
|
3453
|
+
*/
|
|
3454
|
+
async load() {
|
|
3455
|
+
if (this.isLoading) {
|
|
3456
|
+
throw new Error("Model is already loading");
|
|
3457
|
+
}
|
|
3458
|
+
if (this.session) {
|
|
3459
|
+
throw new Error("Model already loaded. Call dispose() first.");
|
|
3460
|
+
}
|
|
3461
|
+
this.isLoading = true;
|
|
3462
|
+
const startTime = performance.now();
|
|
3463
|
+
const telemetry = getTelemetry();
|
|
3464
|
+
const span = telemetry?.startSpan("Wav2ArkitCpu.load", {
|
|
3465
|
+
"model.url": this.config.modelUrl,
|
|
3466
|
+
"model.backend_requested": this.config.backend || "wasm"
|
|
3467
|
+
});
|
|
3468
|
+
try {
|
|
3469
|
+
const preference = this.config.backend || "wasm";
|
|
3470
|
+
logger5.info("Loading ONNX Runtime...", { preference });
|
|
3471
|
+
const { ort, backend } = await getOnnxRuntimeForPreference(preference);
|
|
3472
|
+
this.ort = ort;
|
|
3473
|
+
this._backend = backend;
|
|
3474
|
+
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3475
|
+
const modelUrl = this.config.modelUrl;
|
|
3476
|
+
const dataUrl = this.config.externalDataUrl !== false ? this.config.externalDataUrl || `${modelUrl}.data` : null;
|
|
3477
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
3478
|
+
if (isIOS()) {
|
|
3479
|
+
logger5.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
3480
|
+
modelUrl,
|
|
3481
|
+
dataUrl
|
|
3482
|
+
});
|
|
3483
|
+
if (dataUrl) {
|
|
3484
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
3485
|
+
sessionOptions.externalData = [{
|
|
3486
|
+
path: dataFilename,
|
|
3487
|
+
data: dataUrl
|
|
3212
3488
|
// URL string — ORT fetches directly into WASM
|
|
3213
3489
|
}];
|
|
3214
3490
|
}
|
|
@@ -3474,21 +3750,22 @@ var LipSyncWithFallback = class {
|
|
|
3474
3750
|
try {
|
|
3475
3751
|
return await this.implementation.load();
|
|
3476
3752
|
} catch (error) {
|
|
3477
|
-
|
|
3478
|
-
error: error instanceof Error ? error.message : String(error)
|
|
3479
|
-
});
|
|
3480
|
-
try {
|
|
3481
|
-
await this.implementation.dispose();
|
|
3482
|
-
} catch {
|
|
3483
|
-
}
|
|
3484
|
-
this.implementation = new Wav2ArkitCpuInference({
|
|
3485
|
-
modelUrl: this.config.cpuModelUrl
|
|
3486
|
-
});
|
|
3487
|
-
this.hasFallenBack = true;
|
|
3488
|
-
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
3489
|
-
return await this.implementation.load();
|
|
3753
|
+
return this.fallbackToCpu(error instanceof Error ? error.message : String(error));
|
|
3490
3754
|
}
|
|
3491
3755
|
}
|
|
3756
|
+
async fallbackToCpu(reason) {
|
|
3757
|
+
logger6.warn("GPU model load failed, falling back to CPU model", { reason });
|
|
3758
|
+
try {
|
|
3759
|
+
await this.implementation.dispose();
|
|
3760
|
+
} catch {
|
|
3761
|
+
}
|
|
3762
|
+
this.implementation = new Wav2ArkitCpuInference({
|
|
3763
|
+
modelUrl: this.config.cpuModelUrl
|
|
3764
|
+
});
|
|
3765
|
+
this.hasFallenBack = true;
|
|
3766
|
+
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
3767
|
+
return await this.implementation.load();
|
|
3768
|
+
}
|
|
3492
3769
|
async infer(audioSamples, identityIndex) {
|
|
3493
3770
|
return this.implementation.infer(audioSamples, identityIndex);
|
|
3494
3771
|
}
|
|
@@ -4545,268 +4822,8 @@ var VADWorkerWithFallback = class {
|
|
|
4545
4822
|
}
|
|
4546
4823
|
};
|
|
4547
4824
|
|
|
4548
|
-
// src/inference/Emotion2VecInference.ts
|
|
4549
|
-
var logger10 = createLogger("Emotion2Vec");
|
|
4550
|
-
var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
|
|
4551
|
-
var Emotion2VecInference = class {
|
|
4552
|
-
constructor(config) {
|
|
4553
|
-
this.session = null;
|
|
4554
|
-
this.ort = null;
|
|
4555
|
-
this._backend = "wasm";
|
|
4556
|
-
this.isLoading = false;
|
|
4557
|
-
this.inferenceQueue = Promise.resolve();
|
|
4558
|
-
this.config = {
|
|
4559
|
-
modelUrl: config.modelUrl,
|
|
4560
|
-
backend: config.backend ?? "auto",
|
|
4561
|
-
sampleRate: config.sampleRate ?? 16e3
|
|
4562
|
-
};
|
|
4563
|
-
}
|
|
4564
|
-
get backend() {
|
|
4565
|
-
return this.session ? this._backend : null;
|
|
4566
|
-
}
|
|
4567
|
-
get isLoaded() {
|
|
4568
|
-
return this.session !== null;
|
|
4569
|
-
}
|
|
4570
|
-
get sampleRate() {
|
|
4571
|
-
return this.config.sampleRate;
|
|
4572
|
-
}
|
|
4573
|
-
/**
|
|
4574
|
-
* Load the ONNX model
|
|
4575
|
-
*/
|
|
4576
|
-
async load() {
|
|
4577
|
-
if (this.isLoading) {
|
|
4578
|
-
throw new Error("Model is already loading");
|
|
4579
|
-
}
|
|
4580
|
-
if (this.session) {
|
|
4581
|
-
throw new Error("Model already loaded. Call dispose() first.");
|
|
4582
|
-
}
|
|
4583
|
-
this.isLoading = true;
|
|
4584
|
-
const startTime = performance.now();
|
|
4585
|
-
const telemetry = getTelemetry();
|
|
4586
|
-
const span = telemetry?.startSpan("Emotion2Vec.load", {
|
|
4587
|
-
"model.url": this.config.modelUrl,
|
|
4588
|
-
"model.backend_requested": this.config.backend
|
|
4589
|
-
});
|
|
4590
|
-
try {
|
|
4591
|
-
logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
4592
|
-
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
4593
|
-
this.ort = ort;
|
|
4594
|
-
this._backend = backend;
|
|
4595
|
-
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4596
|
-
logger10.info("Checking model cache...");
|
|
4597
|
-
const cache = getModelCache();
|
|
4598
|
-
const modelUrl = this.config.modelUrl;
|
|
4599
|
-
const isCached = await cache.has(modelUrl);
|
|
4600
|
-
logger10.info("Cache check complete", { modelUrl, isCached });
|
|
4601
|
-
let modelBuffer;
|
|
4602
|
-
if (isCached) {
|
|
4603
|
-
logger10.info("Loading model from cache...", { modelUrl });
|
|
4604
|
-
modelBuffer = await cache.get(modelUrl);
|
|
4605
|
-
logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
|
|
4606
|
-
} else {
|
|
4607
|
-
logger10.info("Fetching model (not cached)...", { modelUrl });
|
|
4608
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4609
|
-
logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
|
|
4610
|
-
}
|
|
4611
|
-
logger10.info("Creating ONNX session (this may take a while for large models)...");
|
|
4612
|
-
logger10.debug("Creating ONNX session", {
|
|
4613
|
-
size: formatBytes(modelBuffer.byteLength),
|
|
4614
|
-
backend: this._backend
|
|
4615
|
-
});
|
|
4616
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
4617
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
4618
|
-
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
4619
|
-
const loadTimeMs = performance.now() - startTime;
|
|
4620
|
-
logger10.info("Model loaded successfully", {
|
|
4621
|
-
backend: this._backend,
|
|
4622
|
-
loadTimeMs: Math.round(loadTimeMs),
|
|
4623
|
-
sampleRate: this.config.sampleRate,
|
|
4624
|
-
inputNames: [...this.session.inputNames],
|
|
4625
|
-
outputNames: [...this.session.outputNames]
|
|
4626
|
-
});
|
|
4627
|
-
span?.setAttributes({
|
|
4628
|
-
"model.backend": this._backend,
|
|
4629
|
-
"model.load_time_ms": loadTimeMs,
|
|
4630
|
-
"model.cached": isCached
|
|
4631
|
-
});
|
|
4632
|
-
span?.end();
|
|
4633
|
-
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
4634
|
-
model: "emotion2vec",
|
|
4635
|
-
backend: this._backend
|
|
4636
|
-
});
|
|
4637
|
-
return {
|
|
4638
|
-
backend: this._backend,
|
|
4639
|
-
loadTimeMs,
|
|
4640
|
-
inputNames: [...this.session.inputNames],
|
|
4641
|
-
outputNames: [...this.session.outputNames],
|
|
4642
|
-
sampleRate: this.config.sampleRate
|
|
4643
|
-
};
|
|
4644
|
-
} catch (error) {
|
|
4645
|
-
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
4646
|
-
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
4647
|
-
model: "emotion2vec",
|
|
4648
|
-
error_type: "load_failed"
|
|
4649
|
-
});
|
|
4650
|
-
throw error;
|
|
4651
|
-
} finally {
|
|
4652
|
-
this.isLoading = false;
|
|
4653
|
-
}
|
|
4654
|
-
}
|
|
4655
|
-
/**
|
|
4656
|
-
* Run emotion inference on audio samples
|
|
4657
|
-
*
|
|
4658
|
-
* @param audio - Float32Array of 16kHz audio samples
|
|
4659
|
-
* @returns Frame-level emotion results at 50Hz
|
|
4660
|
-
*/
|
|
4661
|
-
async infer(audio) {
|
|
4662
|
-
if (!this.session) {
|
|
4663
|
-
throw new Error("Model not loaded. Call load() first.");
|
|
4664
|
-
}
|
|
4665
|
-
return this.queueInference(audio);
|
|
4666
|
-
}
|
|
4667
|
-
queueInference(audio) {
|
|
4668
|
-
const audioCopy = new Float32Array(audio);
|
|
4669
|
-
return new Promise((resolve, reject) => {
|
|
4670
|
-
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4671
|
-
const telemetry = getTelemetry();
|
|
4672
|
-
const span = telemetry?.startSpan("Emotion2Vec.infer", {
|
|
4673
|
-
"inference.backend": this._backend,
|
|
4674
|
-
"inference.audio_samples": audioCopy.length
|
|
4675
|
-
});
|
|
4676
|
-
try {
|
|
4677
|
-
const startTime = performance.now();
|
|
4678
|
-
const inputTensor = new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length]);
|
|
4679
|
-
const results = await this.session.run({ audio: inputTensor });
|
|
4680
|
-
const logitsTensor = results["logits"];
|
|
4681
|
-
const embeddingsTensor = results["layer_norm_25"];
|
|
4682
|
-
if (!logitsTensor) {
|
|
4683
|
-
throw new Error(
|
|
4684
|
-
`Missing logits tensor from SUPERB model. Got outputs: ${Object.keys(results).join(", ")}`
|
|
4685
|
-
);
|
|
4686
|
-
}
|
|
4687
|
-
const logitsData = logitsTensor.data;
|
|
4688
|
-
const logits = new Float32Array(logitsData);
|
|
4689
|
-
const probs = this.softmax(logits);
|
|
4690
|
-
const probabilities = {
|
|
4691
|
-
neutral: probs[0],
|
|
4692
|
-
happy: probs[1],
|
|
4693
|
-
angry: probs[2],
|
|
4694
|
-
sad: probs[3]
|
|
4695
|
-
};
|
|
4696
|
-
let maxIdx = 0;
|
|
4697
|
-
let maxProb = probs[0];
|
|
4698
|
-
for (let i = 1; i < probs.length; i++) {
|
|
4699
|
-
if (probs[i] > maxProb) {
|
|
4700
|
-
maxProb = probs[i];
|
|
4701
|
-
maxIdx = i;
|
|
4702
|
-
}
|
|
4703
|
-
}
|
|
4704
|
-
const dominant = {
|
|
4705
|
-
emotion: EMOTION2VEC_LABELS[maxIdx],
|
|
4706
|
-
confidence: maxProb,
|
|
4707
|
-
probabilities
|
|
4708
|
-
};
|
|
4709
|
-
let embeddings = [];
|
|
4710
|
-
let numFrames = 1;
|
|
4711
|
-
if (embeddingsTensor) {
|
|
4712
|
-
const embeddingData = embeddingsTensor.data;
|
|
4713
|
-
const dims = embeddingsTensor.dims;
|
|
4714
|
-
if (dims.length === 3) {
|
|
4715
|
-
numFrames = dims[1];
|
|
4716
|
-
const embeddingDim = dims[2];
|
|
4717
|
-
for (let i = 0; i < numFrames; i++) {
|
|
4718
|
-
const start = i * embeddingDim;
|
|
4719
|
-
embeddings.push(new Float32Array(embeddingData.slice(start, start + embeddingDim)));
|
|
4720
|
-
}
|
|
4721
|
-
}
|
|
4722
|
-
}
|
|
4723
|
-
const frames = [];
|
|
4724
|
-
for (let i = 0; i < numFrames; i++) {
|
|
4725
|
-
frames.push({
|
|
4726
|
-
emotion: dominant.emotion,
|
|
4727
|
-
confidence: dominant.confidence,
|
|
4728
|
-
probabilities: { ...probabilities }
|
|
4729
|
-
});
|
|
4730
|
-
}
|
|
4731
|
-
const inferenceTimeMs = performance.now() - startTime;
|
|
4732
|
-
logger10.debug("Emotion inference completed", {
|
|
4733
|
-
numFrames,
|
|
4734
|
-
dominant: dominant.emotion,
|
|
4735
|
-
confidence: Math.round(dominant.confidence * 100),
|
|
4736
|
-
inferenceTimeMs: Math.round(inferenceTimeMs)
|
|
4737
|
-
});
|
|
4738
|
-
span?.setAttributes({
|
|
4739
|
-
"inference.duration_ms": inferenceTimeMs,
|
|
4740
|
-
"inference.num_frames": numFrames,
|
|
4741
|
-
"inference.dominant_emotion": dominant.emotion
|
|
4742
|
-
});
|
|
4743
|
-
span?.end();
|
|
4744
|
-
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
4745
|
-
model: "emotion2vec",
|
|
4746
|
-
backend: this._backend
|
|
4747
|
-
});
|
|
4748
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4749
|
-
model: "emotion2vec",
|
|
4750
|
-
backend: this._backend,
|
|
4751
|
-
status: "success"
|
|
4752
|
-
});
|
|
4753
|
-
resolve({
|
|
4754
|
-
frames,
|
|
4755
|
-
dominant,
|
|
4756
|
-
embeddings,
|
|
4757
|
-
logits,
|
|
4758
|
-
inferenceTimeMs
|
|
4759
|
-
});
|
|
4760
|
-
} catch (err) {
|
|
4761
|
-
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4762
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4763
|
-
model: "emotion2vec",
|
|
4764
|
-
backend: this._backend,
|
|
4765
|
-
status: "error"
|
|
4766
|
-
});
|
|
4767
|
-
reject(err);
|
|
4768
|
-
}
|
|
4769
|
-
});
|
|
4770
|
-
});
|
|
4771
|
-
}
|
|
4772
|
-
/**
|
|
4773
|
-
* Apply softmax to convert logits to probabilities
|
|
4774
|
-
*/
|
|
4775
|
-
softmax(logits) {
|
|
4776
|
-
let max = logits[0];
|
|
4777
|
-
for (let i = 1; i < logits.length; i++) {
|
|
4778
|
-
if (logits[i] > max) max = logits[i];
|
|
4779
|
-
}
|
|
4780
|
-
const exp = new Float32Array(logits.length);
|
|
4781
|
-
let sum = 0;
|
|
4782
|
-
for (let i = 0; i < logits.length; i++) {
|
|
4783
|
-
exp[i] = Math.exp(logits[i] - max);
|
|
4784
|
-
sum += exp[i];
|
|
4785
|
-
}
|
|
4786
|
-
const probs = new Float32Array(logits.length);
|
|
4787
|
-
for (let i = 0; i < logits.length; i++) {
|
|
4788
|
-
probs[i] = exp[i] / sum;
|
|
4789
|
-
}
|
|
4790
|
-
return probs;
|
|
4791
|
-
}
|
|
4792
|
-
/**
|
|
4793
|
-
* Dispose of the model and free resources
|
|
4794
|
-
*/
|
|
4795
|
-
async dispose() {
|
|
4796
|
-
if (this.session) {
|
|
4797
|
-
await this.session.release();
|
|
4798
|
-
this.session = null;
|
|
4799
|
-
}
|
|
4800
|
-
}
|
|
4801
|
-
};
|
|
4802
|
-
/**
|
|
4803
|
-
* Check if WebGPU is available and working
|
|
4804
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
4805
|
-
*/
|
|
4806
|
-
Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
4807
|
-
|
|
4808
4825
|
// src/inference/SafariSpeechRecognition.ts
|
|
4809
|
-
var
|
|
4826
|
+
var logger10 = createLogger("SafariSpeech");
|
|
4810
4827
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
4811
4828
|
constructor(config = {}) {
|
|
4812
4829
|
this.recognition = null;
|
|
@@ -4825,7 +4842,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4825
4842
|
interimResults: config.interimResults ?? true,
|
|
4826
4843
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
4827
4844
|
};
|
|
4828
|
-
|
|
4845
|
+
logger10.debug("SafariSpeechRecognition created", {
|
|
4829
4846
|
language: this.config.language,
|
|
4830
4847
|
continuous: this.config.continuous
|
|
4831
4848
|
});
|
|
@@ -4886,7 +4903,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4886
4903
|
*/
|
|
4887
4904
|
async start() {
|
|
4888
4905
|
if (this.isListening) {
|
|
4889
|
-
|
|
4906
|
+
logger10.warn("Already listening");
|
|
4890
4907
|
return;
|
|
4891
4908
|
}
|
|
4892
4909
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -4916,7 +4933,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4916
4933
|
this.isListening = true;
|
|
4917
4934
|
this.startTime = performance.now();
|
|
4918
4935
|
this.accumulatedText = "";
|
|
4919
|
-
|
|
4936
|
+
logger10.info("Speech recognition started", {
|
|
4920
4937
|
language: this.config.language
|
|
4921
4938
|
});
|
|
4922
4939
|
span?.end();
|
|
@@ -4931,7 +4948,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4931
4948
|
*/
|
|
4932
4949
|
async stop() {
|
|
4933
4950
|
if (!this.isListening || !this.recognition) {
|
|
4934
|
-
|
|
4951
|
+
logger10.warn("Not currently listening");
|
|
4935
4952
|
return {
|
|
4936
4953
|
text: this.accumulatedText,
|
|
4937
4954
|
language: this.config.language,
|
|
@@ -4960,7 +4977,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4960
4977
|
if (this.recognition && this.isListening) {
|
|
4961
4978
|
this.recognition.abort();
|
|
4962
4979
|
this.isListening = false;
|
|
4963
|
-
|
|
4980
|
+
logger10.info("Speech recognition aborted");
|
|
4964
4981
|
}
|
|
4965
4982
|
}
|
|
4966
4983
|
/**
|
|
@@ -4991,7 +5008,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4991
5008
|
this.isListening = false;
|
|
4992
5009
|
this.resultCallbacks = [];
|
|
4993
5010
|
this.errorCallbacks = [];
|
|
4994
|
-
|
|
5011
|
+
logger10.debug("SafariSpeechRecognition disposed");
|
|
4995
5012
|
}
|
|
4996
5013
|
/**
|
|
4997
5014
|
* Set up event handlers for the recognition instance
|
|
@@ -5019,7 +5036,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5019
5036
|
confidence: alternative.confidence
|
|
5020
5037
|
};
|
|
5021
5038
|
this.emitResult(speechResult);
|
|
5022
|
-
|
|
5039
|
+
logger10.trace("Speech result", {
|
|
5023
5040
|
text: text.substring(0, 50),
|
|
5024
5041
|
isFinal,
|
|
5025
5042
|
confidence: alternative.confidence
|
|
@@ -5029,12 +5046,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5029
5046
|
span?.end();
|
|
5030
5047
|
} catch (error) {
|
|
5031
5048
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
5032
|
-
|
|
5049
|
+
logger10.error("Error processing speech result", { error });
|
|
5033
5050
|
}
|
|
5034
5051
|
};
|
|
5035
5052
|
this.recognition.onerror = (event) => {
|
|
5036
5053
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
5037
|
-
|
|
5054
|
+
logger10.error("Speech recognition error", { error: event.error, message: event.message });
|
|
5038
5055
|
this.emitError(error);
|
|
5039
5056
|
if (this.stopRejecter) {
|
|
5040
5057
|
this.stopRejecter(error);
|
|
@@ -5044,7 +5061,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5044
5061
|
};
|
|
5045
5062
|
this.recognition.onend = () => {
|
|
5046
5063
|
this.isListening = false;
|
|
5047
|
-
|
|
5064
|
+
logger10.info("Speech recognition ended", {
|
|
5048
5065
|
totalText: this.accumulatedText.length,
|
|
5049
5066
|
durationMs: performance.now() - this.startTime
|
|
5050
5067
|
});
|
|
@@ -5061,13 +5078,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5061
5078
|
}
|
|
5062
5079
|
};
|
|
5063
5080
|
this.recognition.onstart = () => {
|
|
5064
|
-
|
|
5081
|
+
logger10.debug("Speech recognition started by browser");
|
|
5065
5082
|
};
|
|
5066
5083
|
this.recognition.onspeechstart = () => {
|
|
5067
|
-
|
|
5084
|
+
logger10.debug("Speech detected");
|
|
5068
5085
|
};
|
|
5069
5086
|
this.recognition.onspeechend = () => {
|
|
5070
|
-
|
|
5087
|
+
logger10.debug("Speech ended");
|
|
5071
5088
|
};
|
|
5072
5089
|
}
|
|
5073
5090
|
/**
|
|
@@ -5078,7 +5095,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5078
5095
|
try {
|
|
5079
5096
|
callback(result);
|
|
5080
5097
|
} catch (error) {
|
|
5081
|
-
|
|
5098
|
+
logger10.error("Error in result callback", { error });
|
|
5082
5099
|
}
|
|
5083
5100
|
}
|
|
5084
5101
|
}
|
|
@@ -5090,7 +5107,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5090
5107
|
try {
|
|
5091
5108
|
callback(error);
|
|
5092
5109
|
} catch (callbackError) {
|
|
5093
|
-
|
|
5110
|
+
logger10.error("Error in error callback", { error: callbackError });
|
|
5094
5111
|
}
|
|
5095
5112
|
}
|
|
5096
5113
|
}
|
|
@@ -5264,7 +5281,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5264
5281
|
this._sessionId = null;
|
|
5265
5282
|
this._isConnected = false;
|
|
5266
5283
|
// Sub-components
|
|
5267
|
-
this.
|
|
5284
|
+
this.asr = null;
|
|
5268
5285
|
this.vad = null;
|
|
5269
5286
|
this.lam = null;
|
|
5270
5287
|
this.pipeline = null;
|
|
@@ -5303,7 +5320,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5303
5320
|
try {
|
|
5304
5321
|
const authToken = await this.getAuthToken(config.tenant);
|
|
5305
5322
|
await Promise.all([
|
|
5306
|
-
this.
|
|
5323
|
+
this.initASR(),
|
|
5307
5324
|
this.initLAM()
|
|
5308
5325
|
]);
|
|
5309
5326
|
await this.connectWebSocket(authToken, config);
|
|
@@ -5333,7 +5350,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5333
5350
|
this.ws = null;
|
|
5334
5351
|
}
|
|
5335
5352
|
await Promise.all([
|
|
5336
|
-
this.
|
|
5353
|
+
this.asr?.dispose(),
|
|
5337
5354
|
this.vad?.dispose(),
|
|
5338
5355
|
this.lam?.dispose()
|
|
5339
5356
|
]);
|
|
@@ -5465,16 +5482,15 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5465
5482
|
});
|
|
5466
5483
|
return token;
|
|
5467
5484
|
}
|
|
5468
|
-
async
|
|
5485
|
+
async initASR() {
|
|
5469
5486
|
await Promise.all([
|
|
5470
|
-
//
|
|
5487
|
+
// SenseVoice ASR
|
|
5471
5488
|
(async () => {
|
|
5472
|
-
this.
|
|
5473
|
-
|
|
5474
|
-
|
|
5475
|
-
language: "en"
|
|
5489
|
+
this.asr = new SenseVoiceInference({
|
|
5490
|
+
modelUrl: "/models/sensevoice/model.int8.onnx",
|
|
5491
|
+
language: "auto"
|
|
5476
5492
|
});
|
|
5477
|
-
await this.
|
|
5493
|
+
await this.asr.load();
|
|
5478
5494
|
})(),
|
|
5479
5495
|
// Silero VAD for accurate voice activity detection
|
|
5480
5496
|
(async () => {
|
|
@@ -5660,17 +5676,17 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5660
5676
|
console.debug("[AgentCore] Skipping silent audio", { rms, samples: audio.length });
|
|
5661
5677
|
return;
|
|
5662
5678
|
}
|
|
5663
|
-
if (this.
|
|
5679
|
+
if (this.asr) {
|
|
5664
5680
|
this.setState("listening");
|
|
5665
5681
|
this.emit("user.speech.start", { timestamp: Date.now() });
|
|
5666
|
-
this.
|
|
5682
|
+
this.asr.transcribe(audio).then((result) => {
|
|
5667
5683
|
this.emit("user.transcript.final", {
|
|
5668
5684
|
text: result.text,
|
|
5669
5685
|
confidence: 1
|
|
5670
5686
|
});
|
|
5671
5687
|
this.emit("user.speech.end", { timestamp: Date.now(), durationMs: result.inferenceTimeMs });
|
|
5672
5688
|
const cleanText = result.text.trim();
|
|
5673
|
-
if (cleanText
|
|
5689
|
+
if (cleanText) {
|
|
5674
5690
|
this.sendText(cleanText).catch((error) => {
|
|
5675
5691
|
console.error("[AgentCore] Send text error:", error);
|
|
5676
5692
|
});
|
|
@@ -6484,228 +6500,6 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
6484
6500
|
}
|
|
6485
6501
|
};
|
|
6486
6502
|
|
|
6487
|
-
// src/cache/huggingFaceCDN.ts
|
|
6488
|
-
var HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
|
|
6489
|
-
function parseHuggingFaceUrl(url) {
|
|
6490
|
-
const pattern = /^https:\/\/huggingface\.co\/([^/]+)\/([^/]+)\/resolve\/([^/]+)\/(.+)$/;
|
|
6491
|
-
const match = url.match(pattern);
|
|
6492
|
-
if (!match) {
|
|
6493
|
-
return null;
|
|
6494
|
-
}
|
|
6495
|
-
return {
|
|
6496
|
-
org: match[1],
|
|
6497
|
-
model: match[2],
|
|
6498
|
-
branch: match[3],
|
|
6499
|
-
file: match[4]
|
|
6500
|
-
};
|
|
6501
|
-
}
|
|
6502
|
-
async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
|
|
6503
|
-
try {
|
|
6504
|
-
const response = await fetch(testUrl, {
|
|
6505
|
-
method: "HEAD",
|
|
6506
|
-
cache: "no-store"
|
|
6507
|
-
// Don't use cached response for reachability check
|
|
6508
|
-
});
|
|
6509
|
-
return response.ok;
|
|
6510
|
-
} catch {
|
|
6511
|
-
return false;
|
|
6512
|
-
}
|
|
6513
|
-
}
|
|
6514
|
-
|
|
6515
|
-
// src/utils/transformersCacheClear.ts
|
|
6516
|
-
var logger12 = createLogger("TransformersCache");
|
|
6517
|
-
async function clearTransformersCache(options) {
|
|
6518
|
-
const verbose = options?.verbose ?? true;
|
|
6519
|
-
const additionalPatterns = options?.additionalPatterns ?? [];
|
|
6520
|
-
if (!("caches" in window)) {
|
|
6521
|
-
logger12.warn("Cache API not available in this environment");
|
|
6522
|
-
return [];
|
|
6523
|
-
}
|
|
6524
|
-
try {
|
|
6525
|
-
const cacheNames = await caches.keys();
|
|
6526
|
-
const deletedCaches = [];
|
|
6527
|
-
const patterns = [
|
|
6528
|
-
"transformers",
|
|
6529
|
-
"huggingface",
|
|
6530
|
-
"onnx",
|
|
6531
|
-
...additionalPatterns
|
|
6532
|
-
];
|
|
6533
|
-
for (const cacheName of cacheNames) {
|
|
6534
|
-
const shouldDelete = patterns.some(
|
|
6535
|
-
(pattern) => cacheName.toLowerCase().includes(pattern.toLowerCase())
|
|
6536
|
-
);
|
|
6537
|
-
if (shouldDelete) {
|
|
6538
|
-
if (verbose) {
|
|
6539
|
-
logger12.info("Deleting cache", { cacheName });
|
|
6540
|
-
}
|
|
6541
|
-
const deleted = await caches.delete(cacheName);
|
|
6542
|
-
if (deleted) {
|
|
6543
|
-
deletedCaches.push(cacheName);
|
|
6544
|
-
} else if (verbose) {
|
|
6545
|
-
logger12.warn("Failed to delete cache", { cacheName });
|
|
6546
|
-
}
|
|
6547
|
-
}
|
|
6548
|
-
}
|
|
6549
|
-
if (verbose) {
|
|
6550
|
-
logger12.info("Cache clearing complete", {
|
|
6551
|
-
totalCaches: cacheNames.length,
|
|
6552
|
-
deletedCount: deletedCaches.length,
|
|
6553
|
-
deletedCaches
|
|
6554
|
-
});
|
|
6555
|
-
}
|
|
6556
|
-
return deletedCaches;
|
|
6557
|
-
} catch (error) {
|
|
6558
|
-
logger12.error("Error clearing caches", { error });
|
|
6559
|
-
throw error;
|
|
6560
|
-
}
|
|
6561
|
-
}
|
|
6562
|
-
async function clearSpecificCache(cacheName) {
|
|
6563
|
-
if (!("caches" in window)) {
|
|
6564
|
-
logger12.warn("Cache API not available in this environment");
|
|
6565
|
-
return false;
|
|
6566
|
-
}
|
|
6567
|
-
try {
|
|
6568
|
-
const deleted = await caches.delete(cacheName);
|
|
6569
|
-
logger12.info("Cache deletion attempt", { cacheName, deleted });
|
|
6570
|
-
return deleted;
|
|
6571
|
-
} catch (error) {
|
|
6572
|
-
logger12.error("Error deleting cache", { cacheName, error });
|
|
6573
|
-
return false;
|
|
6574
|
-
}
|
|
6575
|
-
}
|
|
6576
|
-
async function listCaches() {
|
|
6577
|
-
if (!("caches" in window)) {
|
|
6578
|
-
logger12.warn("Cache API not available in this environment");
|
|
6579
|
-
return [];
|
|
6580
|
-
}
|
|
6581
|
-
try {
|
|
6582
|
-
const cacheNames = await caches.keys();
|
|
6583
|
-
logger12.debug("Available caches", { cacheNames });
|
|
6584
|
-
return cacheNames;
|
|
6585
|
-
} catch (error) {
|
|
6586
|
-
logger12.error("Error listing caches", { error });
|
|
6587
|
-
return [];
|
|
6588
|
-
}
|
|
6589
|
-
}
|
|
6590
|
-
async function validateCachedResponse(cacheName, requestUrl) {
|
|
6591
|
-
if (!("caches" in window)) {
|
|
6592
|
-
return {
|
|
6593
|
-
exists: false,
|
|
6594
|
-
valid: false,
|
|
6595
|
-
contentType: null,
|
|
6596
|
-
isHtml: false,
|
|
6597
|
-
reason: "Cache API not available"
|
|
6598
|
-
};
|
|
6599
|
-
}
|
|
6600
|
-
try {
|
|
6601
|
-
const cache = await caches.open(cacheName);
|
|
6602
|
-
const response = await cache.match(requestUrl);
|
|
6603
|
-
if (!response) {
|
|
6604
|
-
return {
|
|
6605
|
-
exists: false,
|
|
6606
|
-
valid: false,
|
|
6607
|
-
contentType: null,
|
|
6608
|
-
isHtml: false,
|
|
6609
|
-
reason: "Not in cache"
|
|
6610
|
-
};
|
|
6611
|
-
}
|
|
6612
|
-
const contentType = response.headers.get("content-type");
|
|
6613
|
-
const isHtml = contentType?.includes("text/html") || contentType?.includes("text/plain");
|
|
6614
|
-
const clonedResponse = response.clone();
|
|
6615
|
-
const text = await clonedResponse.text();
|
|
6616
|
-
const looksLikeHtml = text.trim().startsWith("<") || text.includes("<!DOCTYPE");
|
|
6617
|
-
const valid = Boolean(
|
|
6618
|
-
response.status === 200 && !isHtml && !looksLikeHtml && contentType && (contentType.includes("application/json") || contentType.includes("application/octet-stream") || contentType.includes("binary"))
|
|
6619
|
-
);
|
|
6620
|
-
return {
|
|
6621
|
-
exists: true,
|
|
6622
|
-
valid,
|
|
6623
|
-
contentType,
|
|
6624
|
-
isHtml: isHtml || looksLikeHtml,
|
|
6625
|
-
reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
|
|
6626
|
-
};
|
|
6627
|
-
} catch (error) {
|
|
6628
|
-
logger12.error("Error validating cached response", { cacheName, requestUrl, error });
|
|
6629
|
-
return {
|
|
6630
|
-
exists: false,
|
|
6631
|
-
valid: false,
|
|
6632
|
-
contentType: null,
|
|
6633
|
-
isHtml: false,
|
|
6634
|
-
reason: `Error: ${error}`
|
|
6635
|
-
};
|
|
6636
|
-
}
|
|
6637
|
-
}
|
|
6638
|
-
async function scanForInvalidCaches() {
|
|
6639
|
-
if (!("caches" in window)) {
|
|
6640
|
-
return { totalCaches: 0, scannedEntries: 0, invalidEntries: [] };
|
|
6641
|
-
}
|
|
6642
|
-
const invalidEntries = [];
|
|
6643
|
-
let scannedEntries = 0;
|
|
6644
|
-
try {
|
|
6645
|
-
const cacheNames = await caches.keys();
|
|
6646
|
-
for (const cacheName of cacheNames) {
|
|
6647
|
-
if (!cacheName.toLowerCase().includes("transformers")) {
|
|
6648
|
-
continue;
|
|
6649
|
-
}
|
|
6650
|
-
const cache = await caches.open(cacheName);
|
|
6651
|
-
const requests = await cache.keys();
|
|
6652
|
-
for (const request of requests) {
|
|
6653
|
-
scannedEntries++;
|
|
6654
|
-
const url = request.url;
|
|
6655
|
-
const validation = await validateCachedResponse(cacheName, url);
|
|
6656
|
-
if (validation.exists && !validation.valid) {
|
|
6657
|
-
invalidEntries.push({
|
|
6658
|
-
cacheName,
|
|
6659
|
-
url,
|
|
6660
|
-
reason: validation.reason || "Unknown"
|
|
6661
|
-
});
|
|
6662
|
-
}
|
|
6663
|
-
}
|
|
6664
|
-
}
|
|
6665
|
-
logger12.info("Cache scan complete", {
|
|
6666
|
-
totalCaches: cacheNames.length,
|
|
6667
|
-
scannedEntries,
|
|
6668
|
-
invalidCount: invalidEntries.length
|
|
6669
|
-
});
|
|
6670
|
-
return {
|
|
6671
|
-
totalCaches: cacheNames.length,
|
|
6672
|
-
scannedEntries,
|
|
6673
|
-
invalidEntries
|
|
6674
|
-
};
|
|
6675
|
-
} catch (error) {
|
|
6676
|
-
logger12.error("Error scanning caches", { error });
|
|
6677
|
-
throw error;
|
|
6678
|
-
}
|
|
6679
|
-
}
|
|
6680
|
-
async function nukeBrowserCaches(preventRecreation = false) {
|
|
6681
|
-
if (!("caches" in window)) {
|
|
6682
|
-
logger12.warn("Cache API not available in this environment");
|
|
6683
|
-
return 0;
|
|
6684
|
-
}
|
|
6685
|
-
try {
|
|
6686
|
-
const cacheNames = await caches.keys();
|
|
6687
|
-
let deletedCount = 0;
|
|
6688
|
-
for (const cacheName of cacheNames) {
|
|
6689
|
-
const deleted = await caches.delete(cacheName);
|
|
6690
|
-
if (deleted) {
|
|
6691
|
-
deletedCount++;
|
|
6692
|
-
}
|
|
6693
|
-
}
|
|
6694
|
-
logger12.info("All browser caches cleared", {
|
|
6695
|
-
totalDeleted: deletedCount
|
|
6696
|
-
});
|
|
6697
|
-
if (preventRecreation) {
|
|
6698
|
-
const { env } = await import("./transformers.web-T5LWC34T.mjs");
|
|
6699
|
-
env.useBrowserCache = false;
|
|
6700
|
-
logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
|
|
6701
|
-
}
|
|
6702
|
-
return deletedCount;
|
|
6703
|
-
} catch (error) {
|
|
6704
|
-
logger12.error("Error nuking caches", { error });
|
|
6705
|
-
throw error;
|
|
6706
|
-
}
|
|
6707
|
-
}
|
|
6708
|
-
|
|
6709
6503
|
// src/animation/types.ts
|
|
6710
6504
|
var DEFAULT_ANIMATION_CONFIG = {
|
|
6711
6505
|
initialState: "idle",
|
|
@@ -7245,7 +7039,6 @@ export {
|
|
|
7245
7039
|
EmotionPresets,
|
|
7246
7040
|
EmphasisDetector,
|
|
7247
7041
|
EventEmitter,
|
|
7248
|
-
HF_CDN_TEST_URL,
|
|
7249
7042
|
INFERENCE_LATENCY_BUCKETS,
|
|
7250
7043
|
InterruptionHandler,
|
|
7251
7044
|
LAMPipeline,
|
|
@@ -7259,6 +7052,7 @@ export {
|
|
|
7259
7052
|
OmoteTelemetry,
|
|
7260
7053
|
RingBuffer,
|
|
7261
7054
|
SafariSpeechRecognition,
|
|
7055
|
+
SenseVoiceInference,
|
|
7262
7056
|
SileroVADInference,
|
|
7263
7057
|
SileroVADWorker,
|
|
7264
7058
|
SyncedAudioPipeline,
|
|
@@ -7266,12 +7060,12 @@ export {
|
|
|
7266
7060
|
WAV2ARKIT_BLENDSHAPES,
|
|
7267
7061
|
Wav2ArkitCpuInference,
|
|
7268
7062
|
Wav2Vec2Inference,
|
|
7269
|
-
|
|
7063
|
+
applyCMVN,
|
|
7064
|
+
applyLFR,
|
|
7270
7065
|
blendEmotions,
|
|
7271
7066
|
calculatePeak,
|
|
7272
7067
|
calculateRMS,
|
|
7273
|
-
|
|
7274
|
-
clearTransformersCache,
|
|
7068
|
+
computeKaldiFbank,
|
|
7275
7069
|
configureCacheLimit,
|
|
7276
7070
|
configureLogging,
|
|
7277
7071
|
configureTelemetry,
|
|
@@ -7280,6 +7074,7 @@ export {
|
|
|
7280
7074
|
createLogger,
|
|
7281
7075
|
createSessionWithFallback,
|
|
7282
7076
|
createSileroVAD,
|
|
7077
|
+
ctcGreedyDecode,
|
|
7283
7078
|
fetchWithCache,
|
|
7284
7079
|
formatBytes,
|
|
7285
7080
|
getCacheConfig,
|
|
@@ -7296,7 +7091,6 @@ export {
|
|
|
7296
7091
|
getTelemetry,
|
|
7297
7092
|
hasWebGPUApi,
|
|
7298
7093
|
isAndroid,
|
|
7299
|
-
isHuggingFaceCDNReachable,
|
|
7300
7094
|
isIOS,
|
|
7301
7095
|
isIOSSafari,
|
|
7302
7096
|
isMobile,
|
|
@@ -7305,16 +7099,16 @@ export {
|
|
|
7305
7099
|
isSpeechRecognitionAvailable,
|
|
7306
7100
|
isWebGPUAvailable,
|
|
7307
7101
|
lerpEmotion,
|
|
7308
|
-
listCaches,
|
|
7309
7102
|
noopLogger,
|
|
7310
|
-
|
|
7311
|
-
|
|
7103
|
+
parseCMVNFromMetadata,
|
|
7104
|
+
parseTokensFile,
|
|
7312
7105
|
preloadModels,
|
|
7313
7106
|
preloadOnnxRuntime,
|
|
7314
7107
|
remapWav2ArkitToLam,
|
|
7315
7108
|
resetLoggingConfig,
|
|
7316
7109
|
resolveBackend,
|
|
7317
|
-
|
|
7110
|
+
resolveLanguageId,
|
|
7111
|
+
resolveTextNormId,
|
|
7318
7112
|
setLogLevel,
|
|
7319
7113
|
setLoggingEnabled,
|
|
7320
7114
|
shouldEnableWasmProxy,
|
|
@@ -7322,7 +7116,6 @@ export {
|
|
|
7322
7116
|
shouldUseNativeASR,
|
|
7323
7117
|
shouldUseServerLipSync,
|
|
7324
7118
|
supportsVADWorker,
|
|
7325
|
-
symmetrizeBlendshapes
|
|
7326
|
-
validateCachedResponse
|
|
7119
|
+
symmetrizeBlendshapes
|
|
7327
7120
|
};
|
|
7328
7121
|
//# sourceMappingURL=index.mjs.map
|