@omote/core 0.3.25 → 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/events/index.mjs +0 -1
- package/dist/index.d.mts +201 -259
- package/dist/index.d.ts +201 -259
- package/dist/index.js +672 -38674
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +648 -867
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +0 -1
- package/package.json +1 -2
- package/dist/chunk-B6TIE56N.mjs +0 -37779
- package/dist/chunk-B6TIE56N.mjs.map +0 -1
- package/dist/chunk-NSSMTXJJ.mjs +0 -8
- package/dist/chunk-NSSMTXJJ.mjs.map +0 -1
- package/dist/transformers.web-T5LWC34T.mjs +0 -1718
- package/dist/transformers.web-T5LWC34T.mjs.map +0 -1
package/dist/index.mjs
CHANGED
|
@@ -12,11 +12,6 @@ import {
|
|
|
12
12
|
setLogLevel,
|
|
13
13
|
setLoggingEnabled
|
|
14
14
|
} from "./chunk-ESU52TDS.mjs";
|
|
15
|
-
import {
|
|
16
|
-
__webpack_exports__env,
|
|
17
|
-
__webpack_exports__pipeline
|
|
18
|
-
} from "./chunk-B6TIE56N.mjs";
|
|
19
|
-
import "./chunk-NSSMTXJJ.mjs";
|
|
20
15
|
|
|
21
16
|
// src/audio/MicrophoneCapture.ts
|
|
22
17
|
var MicrophoneCapture = class {
|
|
@@ -2274,6 +2269,14 @@ function getSessionOptions(backend) {
|
|
|
2274
2269
|
graphOptimizationLevel: "all"
|
|
2275
2270
|
};
|
|
2276
2271
|
}
|
|
2272
|
+
if (isIOS()) {
|
|
2273
|
+
return {
|
|
2274
|
+
executionProviders: ["wasm"],
|
|
2275
|
+
graphOptimizationLevel: "basic",
|
|
2276
|
+
enableCpuMemArena: false,
|
|
2277
|
+
enableMemPattern: false
|
|
2278
|
+
};
|
|
2279
|
+
}
|
|
2277
2280
|
return {
|
|
2278
2281
|
executionProviders: ["wasm"],
|
|
2279
2282
|
graphOptimizationLevel: "all"
|
|
@@ -2549,77 +2552,108 @@ var Wav2Vec2Inference = class {
|
|
|
2549
2552
|
this.ort = ort;
|
|
2550
2553
|
this._backend = backend;
|
|
2551
2554
|
logger2.info("ONNX Runtime loaded", { backend: this._backend });
|
|
2552
|
-
const cache = getModelCache();
|
|
2553
2555
|
const modelUrl = this.config.modelUrl;
|
|
2554
|
-
const
|
|
2555
|
-
|
|
2556
|
-
|
|
2557
|
-
|
|
2558
|
-
|
|
2559
|
-
|
|
2560
|
-
|
|
2561
|
-
|
|
2562
|
-
|
|
2563
|
-
|
|
2556
|
+
const dataUrl = this.config.externalDataUrl !== false ? typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data` : null;
|
|
2557
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
2558
|
+
let isCached = false;
|
|
2559
|
+
if (isIOS()) {
|
|
2560
|
+
logger2.info("iOS: passing model URLs directly to ORT (low-memory path)", {
|
|
2561
|
+
modelUrl,
|
|
2562
|
+
dataUrl
|
|
2563
|
+
});
|
|
2564
|
+
if (dataUrl) {
|
|
2565
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
2566
|
+
logger2.info("iOS: setting externalData", { dataFilename, dataUrl });
|
|
2567
|
+
sessionOptions.externalData = [{
|
|
2568
|
+
path: dataFilename,
|
|
2569
|
+
data: dataUrl
|
|
2570
|
+
// URL string — ORT fetches directly into WASM
|
|
2571
|
+
}];
|
|
2564
2572
|
}
|
|
2565
|
-
|
|
2566
|
-
|
|
2567
|
-
|
|
2568
|
-
|
|
2569
|
-
|
|
2570
|
-
|
|
2571
|
-
|
|
2572
|
-
throw new Error(errorMsg);
|
|
2573
|
-
}
|
|
2574
|
-
let externalDataBuffer = null;
|
|
2575
|
-
if (this.config.externalDataUrl !== false) {
|
|
2576
|
-
const dataUrl = typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`;
|
|
2573
|
+
logger2.info("iOS: calling InferenceSession.create() with URL string", {
|
|
2574
|
+
modelUrl,
|
|
2575
|
+
sessionOptions: JSON.stringify(
|
|
2576
|
+
sessionOptions,
|
|
2577
|
+
(_, v) => typeof v === "string" && v.length > 100 ? v.slice(0, 100) + "..." : v
|
|
2578
|
+
)
|
|
2579
|
+
});
|
|
2577
2580
|
try {
|
|
2578
|
-
|
|
2579
|
-
|
|
2580
|
-
|
|
2581
|
-
|
|
2582
|
-
|
|
2583
|
-
|
|
2584
|
-
|
|
2581
|
+
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
2582
|
+
} catch (sessionErr) {
|
|
2583
|
+
logger2.error("iOS: InferenceSession.create() failed", {
|
|
2584
|
+
error: sessionErr instanceof Error ? sessionErr.message : String(sessionErr),
|
|
2585
|
+
errorType: sessionErr?.constructor?.name,
|
|
2586
|
+
stack: sessionErr instanceof Error ? sessionErr.stack : void 0
|
|
2587
|
+
});
|
|
2588
|
+
throw sessionErr;
|
|
2589
|
+
}
|
|
2590
|
+
logger2.info("iOS: session created successfully", {
|
|
2591
|
+
inputNames: this.session.inputNames,
|
|
2592
|
+
outputNames: this.session.outputNames
|
|
2593
|
+
});
|
|
2594
|
+
} else {
|
|
2595
|
+
const cache = getModelCache();
|
|
2596
|
+
isCached = await cache.has(modelUrl);
|
|
2597
|
+
let modelBuffer;
|
|
2598
|
+
if (isCached) {
|
|
2599
|
+
logger2.debug("Loading model from cache", { modelUrl });
|
|
2600
|
+
modelBuffer = await cache.get(modelUrl);
|
|
2601
|
+
if (!modelBuffer) {
|
|
2602
|
+
logger2.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
2603
|
+
await cache.delete(modelUrl);
|
|
2604
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
2605
|
+
}
|
|
2606
|
+
} else {
|
|
2607
|
+
logger2.debug("Fetching and caching model", { modelUrl });
|
|
2608
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
2609
|
+
}
|
|
2610
|
+
if (!modelBuffer) {
|
|
2611
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
2612
|
+
}
|
|
2613
|
+
let externalDataBuffer = null;
|
|
2614
|
+
if (dataUrl) {
|
|
2615
|
+
try {
|
|
2616
|
+
const isDataCached = await cache.has(dataUrl);
|
|
2617
|
+
if (isDataCached) {
|
|
2618
|
+
logger2.debug("Loading external data from cache", { dataUrl });
|
|
2619
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
2620
|
+
if (!externalDataBuffer) {
|
|
2621
|
+
logger2.warn("Cache corruption for external data, retrying", { dataUrl });
|
|
2622
|
+
await cache.delete(dataUrl);
|
|
2623
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2624
|
+
}
|
|
2625
|
+
} else {
|
|
2626
|
+
logger2.info("Fetching external model data", {
|
|
2627
|
+
dataUrl,
|
|
2628
|
+
note: "This may be a large download (383MB+)"
|
|
2629
|
+
});
|
|
2585
2630
|
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2586
2631
|
}
|
|
2587
|
-
|
|
2588
|
-
|
|
2632
|
+
logger2.info("External data loaded", {
|
|
2633
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
2634
|
+
});
|
|
2635
|
+
} catch (err) {
|
|
2636
|
+
logger2.debug("No external data file found (single-file model)", {
|
|
2589
2637
|
dataUrl,
|
|
2590
|
-
|
|
2638
|
+
error: err.message
|
|
2591
2639
|
});
|
|
2592
|
-
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
2593
2640
|
}
|
|
2594
|
-
logger2.info("External data loaded", {
|
|
2595
|
-
size: formatBytes(externalDataBuffer.byteLength)
|
|
2596
|
-
});
|
|
2597
|
-
} catch (err) {
|
|
2598
|
-
logger2.debug("No external data file found (single-file model)", {
|
|
2599
|
-
dataUrl,
|
|
2600
|
-
error: err.message
|
|
2601
|
-
});
|
|
2602
2641
|
}
|
|
2642
|
+
logger2.debug("Creating ONNX session", {
|
|
2643
|
+
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2644
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2645
|
+
backend: this._backend
|
|
2646
|
+
});
|
|
2647
|
+
if (externalDataBuffer) {
|
|
2648
|
+
const dataFilename = dataUrl.split("/").pop();
|
|
2649
|
+
sessionOptions.externalData = [{
|
|
2650
|
+
path: dataFilename,
|
|
2651
|
+
data: new Uint8Array(externalDataBuffer)
|
|
2652
|
+
}];
|
|
2653
|
+
}
|
|
2654
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
2655
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2603
2656
|
}
|
|
2604
|
-
logger2.debug("Creating ONNX session", {
|
|
2605
|
-
graphSize: formatBytes(modelBuffer.byteLength),
|
|
2606
|
-
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : "none",
|
|
2607
|
-
backend: this._backend
|
|
2608
|
-
});
|
|
2609
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
2610
|
-
if (externalDataBuffer) {
|
|
2611
|
-
const dataFilename = (typeof this.config.externalDataUrl === "string" ? this.config.externalDataUrl : `${modelUrl}.data`).split("/").pop();
|
|
2612
|
-
sessionOptions.externalData = [{
|
|
2613
|
-
path: dataFilename,
|
|
2614
|
-
data: new Uint8Array(externalDataBuffer)
|
|
2615
|
-
}];
|
|
2616
|
-
}
|
|
2617
|
-
logger2.info("Creating session with execution provider", {
|
|
2618
|
-
executionProvider: this._backend,
|
|
2619
|
-
hasExternalData: !!externalDataBuffer
|
|
2620
|
-
});
|
|
2621
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
2622
|
-
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2623
2657
|
logger2.info("ONNX session created successfully", {
|
|
2624
2658
|
executionProvider: this._backend,
|
|
2625
2659
|
backend: this._backend
|
|
@@ -2634,7 +2668,7 @@ var Wav2Vec2Inference = class {
|
|
|
2634
2668
|
span?.setAttributes({
|
|
2635
2669
|
"model.backend": this._backend,
|
|
2636
2670
|
"model.load_time_ms": loadTimeMs,
|
|
2637
|
-
"model.cached": isCached
|
|
2671
|
+
"model.cached": !isIOS() && isCached
|
|
2638
2672
|
});
|
|
2639
2673
|
span?.end();
|
|
2640
2674
|
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
@@ -2837,319 +2871,550 @@ LAM_BLENDSHAPES.forEach((name, index) => {
|
|
|
2837
2871
|
});
|
|
2838
2872
|
var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
|
|
2839
2873
|
|
|
2840
|
-
// src/inference/
|
|
2841
|
-
|
|
2842
|
-
|
|
2843
|
-
|
|
2844
|
-
|
|
2845
|
-
|
|
2874
|
+
// src/inference/kaldiFbank.ts
|
|
2875
|
+
/**
 * In-place iterative radix-2 FFT (decimation in time).
 *
 * Both arrays are overwritten with the transform of the complex signal
 * `re[k] + i * im[k]`. The forward sign convention is used (twiddle angle
 * `-2 * PI / len`) and no normalization is applied.
 *
 * @param {Float64Array|number[]} re Real parts; length must be 0 or a power of two.
 * @param {Float64Array|number[]} im Imaginary parts; same length as `re`.
 * @returns {void}
 * @throws {RangeError} If the lengths differ or the length is not a power of two.
 */
function fft(re, im) {
  const n = re.length;
  if (im.length !== n) {
    throw new RangeError(`fft: re and im must have equal length (got ${n} and ${im.length})`);
  }
  // The bit-reversal permutation and the butterfly stages below only visit
  // every element when n is a power of two; any other size would silently
  // produce a wrong spectrum.
  if ((n & (n - 1)) !== 0) {
    throw new RangeError(`fft: length must be a power of two (got ${n})`);
  }

  // Bit-reversal permutation: j tracks the bit-reversed counterpart of i.
  for (let i = 1, j = 0; i < n; i++) {
    let bit = n >> 1;
    while (j & bit) {
      j ^= bit;
      bit >>= 1;
    }
    j ^= bit;
    if (i < j) {
      let tmp = re[i];
      re[i] = re[j];
      re[j] = tmp;
      tmp = im[i];
      im[i] = im[j];
      im[j] = tmp;
    }
  }

  // Butterfly stages for sub-transform sizes 2, 4, ..., n.
  for (let len = 2; len <= n; len *= 2) {
    const halfLen = len / 2;
    const angle = -2 * Math.PI / len;
    const wRe = Math.cos(angle);
    const wIm = Math.sin(angle);
    for (let i = 0; i < n; i += len) {
      // (curRe, curIm) is the running twiddle factor w^j for this group.
      let curRe = 1;
      let curIm = 0;
      for (let j = 0; j < halfLen; j++) {
        const a = i + j;
        const b = a + halfLen;
        const tRe = curRe * re[b] - curIm * im[b];
        const tIm = curRe * im[b] + curIm * re[b];
        re[b] = re[a] - tRe;
        im[b] = im[a] - tIm;
        re[a] += tRe;
        im[a] += tIm;
        const nextRe = curRe * wRe - curIm * wIm;
        curIm = curRe * wIm + curIm * wRe;
        curRe = nextRe;
      }
    }
  }
}
|
|
2917
|
+
/**
 * Convert a frequency to the mel scale using the formula
 * `1127 * ln(1 + f / 700)`.
 *
 * @param {number} freq Frequency value (presumably Hz — confirm with callers).
 * @returns {number} The corresponding mel value.
 */
function htkMel(freq) {
  const ratio = freq / 700;
  const mel = 1127 * Math.log(1 + ratio);
  return mel;
}
|
|
2920
|
+
/**
 * Convert a mel value back to a frequency using
 * `700 * (exp(mel / 1127) - 1)`, the inverse of `1127 * ln(1 + f / 700)`.
 *
 * @param {number} mel Mel-scale value.
 * @returns {number} The corresponding frequency.
 */
function htkMelInverse(mel) {
  const growth = Math.exp(mel / 1127);
  const freq = 700 * (growth - 1);
  return freq;
}
|
|
2923
|
+
/**
 * Build a bank of triangular mel filters over FFT bins.
 *
 * `numBins + 2` edge points are spaced evenly on the mel scale between
 * `lowFreq` and `highFreq`, converted back to (fractional) FFT-bin positions,
 * and each filter m rises linearly from edge m to edge m + 1 and falls
 * linearly to edge m + 2. Weights are linear in FFT-bin position.
 *
 * @param {number} numBins Number of mel filters to create.
 * @param {number} fftSize FFT length used for the spectrum.
 * @param {number} sampleRate Sample rate, in the same unit as the frequencies.
 * @param {number} lowFreq Lower edge of the first filter.
 * @param {number} highFreq Upper edge of the last filter.
 * @returns {{startBin: number, weights: Float32Array}[]} One entry per filter;
 *   `weights[k]` applies to FFT bin `startBin + k`. A filter that covers no
 *   FFT bin (e.g. it lies entirely above the last bin) has empty `weights`.
 */
function buildMelFilterbank(numBins, fftSize, sampleRate, lowFreq, highFreq) {
  const numFftBins = fftSize / 2 + 1;
  const lowMel = htkMel(lowFreq);
  const highMel = htkMel(highFreq);
  const numEdges = numBins + 2;

  // Filter edges expressed as fractional FFT-bin indices.
  const binFreqs = new Float64Array(numEdges);
  for (let i = 0; i < numEdges; i++) {
    const mel = lowMel + (highMel - lowMel) * i / (numBins + 1);
    binFreqs[i] = htkMelInverse(mel) * fftSize / sampleRate;
  }

  const filters = [];
  for (let m = 0; m < numBins; m++) {
    const left = binFreqs[m];
    const center = binFreqs[m + 1];
    const right = binFreqs[m + 2];
    const startBin = Math.max(0, Math.ceil(left));
    const endBin = Math.min(numFftBins - 1, Math.floor(right));
    // endBin can fall more than one below startBin when the whole band sits
    // above the last FFT bin (highFreq beyond Nyquist); clamp so the typed
    // array constructor does not throw on a negative length.
    const weights = new Float32Array(Math.max(0, endBin - startBin + 1));
    for (let k = startBin; k <= endBin; k++) {
      if (k <= center) {
        weights[k - startBin] = center - left > 0 ? (k - left) / (center - left) : 0;
      } else {
        weights[k - startBin] = right - center > 0 ? (right - k) / (right - center) : 0;
      }
    }
    filters.push({ startBin, weights });
  }
  return filters;
}
|
|
2954
|
+
/**
 * Create a symmetric Hamming window: `0.54 - 0.46 * cos(2 * PI * i / (length - 1))`.
 *
 * @param {number} length Number of samples in the window.
 * @returns {Float32Array} Window coefficients. An empty array for length 0 and
 *   `[1]` for length 1.
 */
function createHammingWindow(length) {
  const window2 = new Float32Array(length);
  if (length === 1) {
    // The general formula divides 0 by (length - 1) = 0 and would yield NaN;
    // a one-sample window is just unity gain.
    window2[0] = 1;
    return window2;
  }
  for (let i = 0; i < length; i++) {
    window2[i] = 0.54 - 0.46 * Math.cos(2 * Math.PI * i / (length - 1));
  }
  return window2;
}
|
|
2961
|
+
/**
 * Compute log mel-filterbank features frame by frame.
 *
 * Pipeline per frame: scale samples by 32768, optional Gaussian dither,
 * pre-emphasis, Hamming window, zero-padded power-of-two FFT, power spectrum,
 * mel filterbank, natural log with a floor of 1e-10.
 *
 * @param {Float32Array|number[]} audio Input samples (scaled by 32768 here, so
 *   presumably in the [-1, 1] range — confirm with callers).
 * @param {number} sampleRate Sample rate of `audio`.
 * @param {number} numMelBins Number of mel filters per frame.
 * @param {object} [opts] Optional overrides.
 * @param {number} [opts.frameLengthMs=25] Frame length in milliseconds.
 * @param {number} [opts.frameShiftMs=10] Hop between frames in milliseconds.
 * @param {number} [opts.lowFreq=20] Lower edge of the filterbank.
 * @param {number} [opts.highFreq=sampleRate/2] Upper edge of the filterbank.
 * @param {number} [opts.dither=0] Std-dev of Gaussian noise added to the scaled
 *   samples; 0 disables dithering (and keeps the output deterministic).
 * @param {number} [opts.preemphasis=0.97] Pre-emphasis coefficient; 0 disables it.
 * @returns {Float32Array} Features laid out frame-major:
 *   `output[frame * numMelBins + bin]`. Empty when the audio is shorter than
 *   one frame.
 * @throws {RangeError} If the frame length or frame shift rounds to less than
 *   one sample.
 */
function computeKaldiFbank(audio, sampleRate, numMelBins, opts) {
  const frameLengthMs = opts?.frameLengthMs ?? 25;
  const frameShiftMs = opts?.frameShiftMs ?? 10;
  const lowFreq = opts?.lowFreq ?? 20;
  const highFreq = opts?.highFreq ?? sampleRate / 2;
  const dither = opts?.dither ?? 0;
  const preemphasis = opts?.preemphasis ?? 0.97;
  const frameLengthSamples = Math.round(sampleRate * frameLengthMs / 1e3);
  const frameShiftSamples = Math.round(sampleRate * frameShiftMs / 1e3);
  // A zero (or NaN) shift/length would make the frame count NaN or Infinity
  // and fail later with an unrelated-looking typed-array error, or silently
  // return nothing. Negated comparisons also reject NaN.
  if (!(frameLengthSamples >= 1) || !(frameShiftSamples >= 1)) {
    throw new RangeError(
      `computeKaldiFbank: frame length and shift must be at least one sample (got ${frameLengthSamples} and ${frameShiftSamples})`
    );
  }

  const scaled = new Float32Array(audio.length);
  for (let i = 0; i < audio.length; i++) {
    scaled[i] = audio[i] * 32768;
  }
  if (dither > 0) {
    // Box-Muller transform; the 1e-10 offset keeps log() away from zero.
    for (let i = 0; i < scaled.length; i++) {
      const u1 = Math.random();
      const u2 = Math.random();
      scaled[i] += dither * Math.sqrt(-2 * Math.log(u1 + 1e-10)) * Math.cos(2 * Math.PI * u2);
    }
  }

  // Only frames that fit entirely inside the signal are produced.
  const numFrames = Math.max(0, Math.floor((scaled.length - frameLengthSamples) / frameShiftSamples) + 1);
  if (numFrames === 0) {
    return new Float32Array(0);
  }

  // Smallest power of two that holds one frame.
  let fftSize = 1;
  while (fftSize < frameLengthSamples) fftSize *= 2;
  const numFftBins = fftSize / 2 + 1;
  const window2 = createHammingWindow(frameLengthSamples);
  const filters = buildMelFilterbank(numMelBins, fftSize, sampleRate, lowFreq, highFreq);
  const output = new Float32Array(numFrames * numMelBins);
  // Scratch buffers reused across frames.
  const fftRe = new Float64Array(fftSize);
  const fftIm = new Float64Array(fftSize);

  for (let f = 0; f < numFrames; f++) {
    const offset = f * frameShiftSamples;
    fftRe.fill(0);
    fftIm.fill(0);
    for (let i = 0; i < frameLengthSamples; i++) {
      let sample = scaled[offset + i];
      if (preemphasis > 0 && i > 0) {
        sample -= preemphasis * scaled[offset + i - 1];
      } else if (preemphasis > 0 && i === 0 && offset > 0) {
        // First sample of a later frame: use the sample just before the frame.
        // The very first sample of the signal is left untouched.
        sample -= preemphasis * scaled[offset - 1];
      }
      fftRe[i] = sample * window2[i];
    }
    fft(fftRe, fftIm);

    const outOffset = f * numMelBins;
    for (let m = 0; m < numMelBins; m++) {
      const filter = filters[m];
      let energy = 0;
      for (let k = 0; k < filter.weights.length; k++) {
        const bin = filter.startBin + k;
        if (bin < numFftBins) {
          const powerSpec = fftRe[bin] * fftRe[bin] + fftIm[bin] * fftIm[bin];
          energy += filter.weights[k] * powerSpec;
        }
      }
      // Floor the energy so silent bands do not produce -Infinity.
      output[outOffset + m] = Math.log(Math.max(energy, 1e-10));
    }
  }
  return output;
}
|
|
3023
|
+
/**
 * Stack consecutive feature frames into wider, lower-rate frames.
 *
 * Each output frame concatenates `lfrM` input frames; successive output frames
 * start `lfrN` input frames apart. The window is shifted left by
 * `floor((lfrM - 1) / 2)` frames, and source indices that fall before the
 * first or after the last input frame are clamped to that frame.
 *
 * @param {Float32Array|number[]} features Frame-major input, `featureDim` values per frame.
 * @param {number} featureDim Number of values per input frame.
 * @param {number} [lfrM=7] Number of input frames stacked per output frame.
 * @param {number} [lfrN=6] Input-frame stride between output frames.
 * @returns {Float32Array} Frame-major output with `featureDim * lfrM` values per frame.
 */
function applyLFR(features, featureDim, lfrM = 7, lfrN = 6) {
  const frameCount = features.length / featureDim;
  if (frameCount === 0) {
    return new Float32Array(0);
  }
  const shiftLeft = Math.floor((lfrM - 1) / 2);
  const stackedDim = featureDim * lfrM;
  const outFrames = Math.ceil((frameCount + shiftLeft) / lfrN);
  const stacked = new Float32Array(outFrames * stackedDim);

  for (let outIdx = 0; outIdx < outFrames; outIdx += 1) {
    const first = outIdx * lfrN - shiftLeft;
    for (let slot = 0; slot < lfrM; slot += 1) {
      // Clamp to the valid frame range (lower bound first, then upper bound).
      let src = first + slot;
      src = src < 0 ? 0 : src;
      src = src >= frameCount ? frameCount - 1 : src;
      const readAt = src * featureDim;
      const writeAt = outIdx * stackedDim + slot * featureDim;
      let k = 0;
      while (k < featureDim) {
        stacked[writeAt + k] = features[readAt + k];
        k += 1;
      }
    }
  }
  return stacked;
}
|
|
3046
|
+
/**
 * Normalize features in place: `features[i] = (features[i] + negMean[d]) * invStddev[d]`
 * where `d = i % dim`.
 *
 * @param {Float32Array|number[]} features Flat feature buffer; modified in place.
 * @param {number} dim Number of values per frame (period of the statistics).
 * @param {ArrayLike<number>} negMean Per-dimension offsets added to each value.
 * @param {ArrayLike<number>} invStddev Per-dimension scale factors.
 * @returns {Float32Array|number[]} The same `features` object that was passed in.
 */
function applyCMVN(features, dim, negMean, invStddev) {
  let idx = 0;
  while (idx < features.length) {
    const col = idx % dim;
    const shifted = features[idx] + negMean[col];
    features[idx] = shifted * invStddev[col];
    idx += 1;
  }
  return features;
}
|
|
3053
|
+
/**
 * Parse two comma-separated lists of numbers into Float32Array vectors.
 *
 * Each field is trimmed and run through `parseFloat`; fields that do not parse
 * become NaN in the result.
 *
 * @param {string} negMeanStr Comma-separated values for `negMean`.
 * @param {string} invStddevStr Comma-separated values for `invStddev`.
 * @returns {{negMean: Float32Array, invStddev: Float32Array}} The parsed vectors.
 */
function parseCMVNFromMetadata(negMeanStr, invStddevStr) {
  const toVector = (csv) => {
    const fields = csv.split(",");
    const values = fields.map((field) => parseFloat(field.trim()));
    return new Float32Array(values);
  };
  const negMean = toVector(negMeanStr);
  const invStddev = toVector(invStddevStr);
  return { negMean, invStddev };
}
|
|
3062
|
+
|
|
3063
|
+
// src/inference/ctcDecoder.ts
|
|
3064
|
+
/**
 * Map a language code to its numeric id.
 *
 * Known codes: auto = 0, zh = 3, en = 4, yue = 7, ja = 11, ko = 12. Any other
 * value, including `undefined`, resolves to 0 (the same id as "auto").
 * (Presumably these are the model's language-prompt ids — confirm against the
 * model documentation.)
 *
 * @param {string} language Language code.
 * @returns {number} The numeric language id.
 */
function resolveLanguageId(language) {
  const map = {
    auto: 0,
    zh: 3,
    en: 4,
    yue: 7,
    ja: 11,
    ko: 12
  };
  // Own-property check: a bare `map[language]` would return inherited members
  // (e.g. for "constructor" or "toString") instead of falling back to 0.
  return Object.prototype.hasOwnProperty.call(map, language) ? map[language] : 0;
}
|
|
3075
|
+
/**
 * Map a text-normalization mode to its numeric id: 15 for "without_itn",
 * 14 for every other value.
 *
 * @param {string} textNorm Normalization mode.
 * @returns {number} 15 or 14.
 */
function resolveTextNormId(textNorm) {
  if (textNorm === "without_itn") {
    return 15;
  }
  return 14;
}
|
|
3078
|
+
/**
 * Parse a vocabulary file into an id -> token map.
 *
 * Each non-blank line is trimmed and split at its last space: the text before
 * it is the token, the text after it is parsed as a base-10 integer id. Lines
 * without a space or without a numeric id are ignored. When an id repeats, the
 * later line wins.
 *
 * @param {string} content Full text of the tokens file.
 * @returns {Map<number, string>} Token text keyed by numeric id.
 */
function parseTokensFile(content) {
  const idToToken = new Map();
  content.split("\n").forEach((rawLine) => {
    const entry = rawLine.trim();
    if (entry === "") return;
    const sep = entry.lastIndexOf(" ");
    if (sep === -1) return;
    const id = parseInt(entry.slice(sep + 1), 10);
    if (Number.isNaN(id)) return;
    idToToken.set(id, entry.slice(0, sep));
  });
  return idToToken;
}
|
|
3094
|
+
/**
 * Classify a `<|...|>` tag token.
 *
 * The text between `<|` and `|>` is looked up in four fixed vocabularies
 * (language, emotion, event, textnorm).
 *
 * @param {string} token Token text.
 * @returns {{type: "language"|"emotion"|"event"|"textnorm", value: string}|null}
 *   The tag category and inner value, or null when the token is not a tag or
 *   its value is not in any vocabulary.
 */
function parseStructuredToken(token) {
  const match = token.match(/^<\|(.+)\|>$/);
  if (!match) {
    return null;
  }
  const value = match[1];
  // The vocabularies are disjoint, so lookup order does not affect the result.
  const vocabularies = [
    ["language", ["zh", "en", "ja", "ko", "yue", "nospeech"]],
    ["emotion", ["HAPPY", "SAD", "ANGRY", "NEUTRAL", "FEARFUL", "DISGUSTED", "SURPRISED", "EMO_UNKNOWN"]],
    ["event", ["Speech", "BGM", "Applause", "Laughter", "Crying", "Coughing", "Sneezing", "EVENT_UNKNOWN"]],
    ["textnorm", ["withitn", "woitn", "with_itn", "without_itn"]]
  ];
  for (const [type, members] of vocabularies) {
    if (members.includes(value)) {
      return { type, value };
    }
  }
  return null;
}
|
|
3114
|
+
/**
 * Greedy CTC decoding of a frame-major logits buffer.
 *
 * For every time step the highest-scoring id is taken (the lowest id wins
 * ties), consecutive repeats are collapsed, and ids 0, 1 and 2 are discarded
 * (presumably blank/special tokens — confirm against the vocabulary). The
 * remaining ids are mapped through `tokenMap`; `<|...|>` tags recognized by
 * `parseStructuredToken` set the language/emotion/event fields (the last
 * occurrence wins, textnorm tags are dropped), everything else is
 * concatenated into the text with U+2581 replaced by a space.
 *
 * @param {ArrayLike<number>} logits Scores, `vocabSize` values per time step.
 * @param {number} seqLen Number of time steps to decode.
 * @param {number} vocabSize Number of scores per time step.
 * @param {Map<number, string>} tokenMap Token text keyed by id; ids that are
 *   missing or map to an empty string are skipped.
 * @returns {{text: string, language: (string|undefined), emotion: (string|undefined), event: (string|undefined)}}
 *   The decoded, trimmed text and any tag values seen.
 */
function ctcGreedyDecode(logits, seqLen, vocabSize, tokenMap) {
  let language;
  let emotion;
  let event;
  const pieces = [];
  let lastId = -1;

  for (let step = 0; step < seqLen; step += 1) {
    // Arg-max over this time step; strict ">" keeps the first maximum.
    const base = step * vocabSize;
    let bestId = 0;
    let bestScore = logits[base];
    for (let cand = 1; cand < vocabSize; cand += 1) {
      const score = logits[base + cand];
      if (score > bestScore) {
        bestScore = score;
        bestId = cand;
      }
    }

    // Collapse runs of the same id before dropping ids 0-2, so that such an id
    // between two equal ids still separates them.
    if (bestId === lastId) continue;
    lastId = bestId;
    if (bestId === 0 || bestId === 1 || bestId === 2) continue;

    const token = tokenMap.get(bestId);
    if (!token) continue;
    const tag = parseStructuredToken(token);
    if (!tag) {
      pieces.push(token);
      continue;
    }
    switch (tag.type) {
      case "language":
        language = tag.value;
        break;
      case "emotion":
        emotion = tag.value;
        break;
      case "event":
        event = tag.value;
        break;
      default:
        // Other tag types (textnorm) carry no output field.
        break;
    }
  }

  const text = pieces.join("").replace(/\u2581/g, " ").trim();
  return { text, language, emotion, event };
}
|
|
3157
|
+
|
|
3158
|
+
// src/inference/SenseVoiceInference.ts
|
|
3159
|
+
var logger4 = createLogger("SenseVoice");
|
|
3160
|
+
var SenseVoiceInference = class {
|
|
3161
|
+
constructor(config) {
|
|
3162
|
+
this.session = null;
|
|
3163
|
+
this.ort = null;
|
|
3164
|
+
this._backend = "wasm";
|
|
2846
3165
|
this.isLoading = false;
|
|
2847
|
-
this.
|
|
3166
|
+
this.inferenceQueue = Promise.resolve();
|
|
3167
|
+
// Preprocessing state (loaded once)
|
|
3168
|
+
this.tokenMap = null;
|
|
3169
|
+
this.negMean = null;
|
|
3170
|
+
this.invStddev = null;
|
|
3171
|
+
this.languageId = 0;
|
|
3172
|
+
this.textNormId = 14;
|
|
3173
|
+
const modelDir = config.modelUrl.substring(0, config.modelUrl.lastIndexOf("/"));
|
|
3174
|
+
const tokensUrl = config.tokensUrl ?? `${modelDir}/tokens.txt`;
|
|
2848
3175
|
this.config = {
|
|
2849
|
-
|
|
2850
|
-
|
|
2851
|
-
language: config.language
|
|
2852
|
-
|
|
2853
|
-
|
|
2854
|
-
device: config.device || "auto",
|
|
2855
|
-
localModelPath: config.localModelPath,
|
|
2856
|
-
token: config.token,
|
|
2857
|
-
suppressNonSpeech: config.suppressNonSpeech !== false
|
|
2858
|
-
// Default true
|
|
3176
|
+
modelUrl: config.modelUrl,
|
|
3177
|
+
tokensUrl,
|
|
3178
|
+
language: config.language ?? "auto",
|
|
3179
|
+
textNorm: config.textNorm ?? "with_itn",
|
|
3180
|
+
backend: config.backend ?? "auto"
|
|
2859
3181
|
};
|
|
3182
|
+
this.languageId = resolveLanguageId(this.config.language);
|
|
3183
|
+
this.textNormId = resolveTextNormId(this.config.textNorm);
|
|
2860
3184
|
}
|
|
2861
|
-
|
|
2862
|
-
|
|
2863
|
-
*/
|
|
2864
|
-
static async isWebGPUAvailable() {
|
|
2865
|
-
return "gpu" in navigator;
|
|
3185
|
+
get backend() {
|
|
3186
|
+
return this.session ? this._backend : null;
|
|
2866
3187
|
}
|
|
2867
|
-
|
|
2868
|
-
|
|
2869
|
-
|
|
3188
|
+
get isLoaded() {
|
|
3189
|
+
return this.session !== null;
|
|
3190
|
+
}
|
|
3191
|
+
// ─── Load ───────────────────────────────────────────────────────────────
|
|
2870
3192
|
async load(onProgress) {
|
|
2871
3193
|
if (this.isLoading) {
|
|
2872
|
-
|
|
2873
|
-
while (this.isLoading) {
|
|
2874
|
-
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
2875
|
-
}
|
|
2876
|
-
return;
|
|
3194
|
+
throw new Error("Model is already loading");
|
|
2877
3195
|
}
|
|
2878
|
-
|
|
2879
|
-
|
|
2880
|
-
logger4.debug("Model already loaded", { model: modelName });
|
|
2881
|
-
return;
|
|
3196
|
+
if (this.session) {
|
|
3197
|
+
throw new Error("Model already loaded. Call dispose() first.");
|
|
2882
3198
|
}
|
|
2883
3199
|
this.isLoading = true;
|
|
3200
|
+
const startTime = performance.now();
|
|
2884
3201
|
const telemetry = getTelemetry();
|
|
2885
|
-
const span = telemetry?.startSpan("
|
|
2886
|
-
"
|
|
2887
|
-
"
|
|
2888
|
-
"whisper.device": this.config.device
|
|
3202
|
+
const span = telemetry?.startSpan("SenseVoice.load", {
|
|
3203
|
+
"model.url": this.config.modelUrl,
|
|
3204
|
+
"model.backend_requested": this.config.backend
|
|
2889
3205
|
});
|
|
2890
3206
|
try {
|
|
2891
|
-
|
|
2892
|
-
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
|
|
2897
|
-
|
|
2898
|
-
if (
|
|
2899
|
-
|
|
2900
|
-
await this.pipeline.dispose();
|
|
2901
|
-
this.pipeline = null;
|
|
3207
|
+
logger4.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
3208
|
+
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
3209
|
+
this.ort = ort;
|
|
3210
|
+
this._backend = backend;
|
|
3211
|
+
logger4.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3212
|
+
logger4.debug("Fetching tokens vocabulary", { tokensUrl: this.config.tokensUrl });
|
|
3213
|
+
const tokensResponse = await fetch(this.config.tokensUrl);
|
|
3214
|
+
if (!tokensResponse.ok) {
|
|
3215
|
+
throw new Error(`Failed to fetch tokens.txt: ${tokensResponse.status} ${tokensResponse.statusText}`);
|
|
2902
3216
|
}
|
|
2903
|
-
const
|
|
2904
|
-
|
|
2905
|
-
logger4.
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
|
|
2909
|
-
__webpack_exports__env.useCustomCache = false;
|
|
2910
|
-
__webpack_exports__env.useWasmCache = false;
|
|
2911
|
-
if (__webpack_exports__env.backends.onnx.wasm) {
|
|
2912
|
-
__webpack_exports__env.backends.onnx.wasm.proxy = false;
|
|
2913
|
-
__webpack_exports__env.backends.onnx.wasm.numThreads = 1;
|
|
3217
|
+
const tokensText = await tokensResponse.text();
|
|
3218
|
+
this.tokenMap = parseTokensFile(tokensText);
|
|
3219
|
+
logger4.debug("Tokens loaded", { vocabSize: this.tokenMap.size });
|
|
3220
|
+
const sessionOptions = getSessionOptions(this._backend);
|
|
3221
|
+
if (this._backend === "webgpu") {
|
|
3222
|
+
sessionOptions.graphOptimizationLevel = "basic";
|
|
2914
3223
|
}
|
|
2915
|
-
|
|
2916
|
-
|
|
2917
|
-
|
|
2918
|
-
|
|
2919
|
-
|
|
2920
|
-
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2925
|
-
|
|
2926
|
-
|
|
2927
|
-
|
|
2928
|
-
|
|
2929
|
-
|
|
2930
|
-
|
|
2931
|
-
|
|
2932
|
-
}
|
|
2933
|
-
|
|
3224
|
+
let isCached = false;
|
|
3225
|
+
if (isIOS()) {
|
|
3226
|
+
logger4.info("iOS: passing model URL directly to ORT (low-memory path)", {
|
|
3227
|
+
modelUrl: this.config.modelUrl
|
|
3228
|
+
});
|
|
3229
|
+
this.session = await this.ort.InferenceSession.create(
|
|
3230
|
+
this.config.modelUrl,
|
|
3231
|
+
sessionOptions
|
|
3232
|
+
);
|
|
3233
|
+
} else {
|
|
3234
|
+
const cache = getModelCache();
|
|
3235
|
+
isCached = await cache.has(this.config.modelUrl);
|
|
3236
|
+
let modelBuffer;
|
|
3237
|
+
if (isCached) {
|
|
3238
|
+
logger4.debug("Loading model from cache", { modelUrl: this.config.modelUrl });
|
|
3239
|
+
modelBuffer = await cache.get(this.config.modelUrl);
|
|
3240
|
+
onProgress?.(modelBuffer.byteLength, modelBuffer.byteLength);
|
|
3241
|
+
} else {
|
|
3242
|
+
logger4.debug("Fetching and caching model", { modelUrl: this.config.modelUrl });
|
|
3243
|
+
modelBuffer = await fetchWithCache(this.config.modelUrl, onProgress);
|
|
3244
|
+
}
|
|
3245
|
+
logger4.debug("Creating ONNX session", {
|
|
3246
|
+
size: formatBytes(modelBuffer.byteLength),
|
|
3247
|
+
backend: this._backend
|
|
3248
|
+
});
|
|
3249
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
3250
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
2934
3251
|
}
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
3252
|
+
try {
|
|
3253
|
+
const metadata = this.session.handler?.metadata;
|
|
3254
|
+
if (metadata?.neg_mean && metadata?.inv_stddev) {
|
|
3255
|
+
const cmvn = parseCMVNFromMetadata(metadata.neg_mean, metadata.inv_stddev);
|
|
3256
|
+
this.negMean = cmvn.negMean;
|
|
3257
|
+
this.invStddev = cmvn.invStddev;
|
|
3258
|
+
logger4.debug("CMVN loaded from model metadata", { dim: this.negMean.length });
|
|
3259
|
+
} else {
|
|
3260
|
+
logger4.warn("CMVN not found in model metadata \u2014 features will not be normalized");
|
|
3261
|
+
}
|
|
3262
|
+
} catch (cmvnErr) {
|
|
3263
|
+
logger4.warn("Failed to read CMVN from model metadata", { error: cmvnErr });
|
|
3264
|
+
}
|
|
3265
|
+
const loadTimeMs = performance.now() - startTime;
|
|
3266
|
+
logger4.info("SenseVoice model loaded", {
|
|
3267
|
+
backend: this._backend,
|
|
3268
|
+
loadTimeMs: Math.round(loadTimeMs),
|
|
3269
|
+
vocabSize: this.tokenMap.size,
|
|
3270
|
+
inputs: this.session.inputNames,
|
|
3271
|
+
outputs: this.session.outputNames,
|
|
3272
|
+
hasCMVN: this.negMean !== null
|
|
2946
3273
|
});
|
|
2947
3274
|
span?.setAttributes({
|
|
2948
|
-
"
|
|
3275
|
+
"model.backend": this._backend,
|
|
3276
|
+
"model.load_time_ms": loadTimeMs,
|
|
3277
|
+
"model.cached": !isIOS() && isCached,
|
|
3278
|
+
"model.vocab_size": this.tokenMap.size
|
|
2949
3279
|
});
|
|
2950
3280
|
span?.end();
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
2956
|
-
|
|
3281
|
+
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
3282
|
+
model: "sensevoice",
|
|
3283
|
+
backend: this._backend
|
|
3284
|
+
});
|
|
3285
|
+
return {
|
|
3286
|
+
backend: this._backend,
|
|
3287
|
+
loadTimeMs,
|
|
3288
|
+
inputNames: [...this.session.inputNames],
|
|
3289
|
+
outputNames: [...this.session.outputNames],
|
|
3290
|
+
vocabSize: this.tokenMap.size
|
|
2957
3291
|
};
|
|
2958
|
-
|
|
2959
|
-
span?.endWithError(error);
|
|
3292
|
+
} catch (error) {
|
|
3293
|
+
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
3294
|
+
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
3295
|
+
model: "sensevoice",
|
|
3296
|
+
error_type: "load_failed"
|
|
3297
|
+
});
|
|
2960
3298
|
throw error;
|
|
2961
3299
|
} finally {
|
|
2962
3300
|
this.isLoading = false;
|
|
2963
3301
|
}
|
|
2964
3302
|
}
|
|
3303
|
+
// ─── Transcribe ─────────────────────────────────────────────────────────
|
|
2965
3304
|
/**
|
|
2966
|
-
* Transcribe audio to text
|
|
3305
|
+
* Transcribe audio samples to text
|
|
2967
3306
|
*
|
|
2968
|
-
* @param audio
|
|
2969
|
-
* @
|
|
3307
|
+
* @param audioSamples Float32Array of audio samples at 16kHz, [-1, 1] range
|
|
3308
|
+
* @returns Transcription result with text, emotion, language, and event
|
|
2970
3309
|
*/
|
|
2971
|
-
async transcribe(
|
|
2972
|
-
if (!this.
|
|
3310
|
+
async transcribe(audioSamples) {
|
|
3311
|
+
if (!this.session || !this.ort || !this.tokenMap) {
|
|
2973
3312
|
throw new Error("Model not loaded. Call load() first.");
|
|
2974
3313
|
}
|
|
2975
|
-
const
|
|
2976
|
-
|
|
2977
|
-
const span = telemetry?.startSpan("whisper.transcribe", {
|
|
2978
|
-
"audio.samples": audioCopy.length,
|
|
2979
|
-
"audio.duration_s": audioCopy.length / 16e3,
|
|
2980
|
-
"whisper.model": this.currentModel
|
|
2981
|
-
});
|
|
2982
|
-
try {
|
|
2983
|
-
const inferStart = performance.now();
|
|
2984
|
-
const audioDurationSec = audioCopy.length / 16e3;
|
|
2985
|
-
const isShortAudio = audioDurationSec < 10;
|
|
2986
|
-
logger4.debug("Starting transcription", {
|
|
2987
|
-
audioSamples: audioCopy.length,
|
|
2988
|
-
durationSeconds: audioDurationSec.toFixed(2),
|
|
2989
|
-
isShortAudio
|
|
2990
|
-
});
|
|
2991
|
-
const transcribeOptions = {
|
|
2992
|
-
// Decoding strategy
|
|
2993
|
-
top_k: 0,
|
|
2994
|
-
do_sample: false,
|
|
2995
|
-
// Adaptive chunking: Disable for short audio, enable for long audio
|
|
2996
|
-
chunk_length_s: options?.chunkLengthS || (isShortAudio ? audioDurationSec : 30),
|
|
2997
|
-
stride_length_s: options?.strideLengthS || (isShortAudio ? 0 : 5),
|
|
2998
|
-
// Timestamps
|
|
2999
|
-
return_timestamps: options?.returnTimestamps || false,
|
|
3000
|
-
force_full_sequences: false
|
|
3001
|
-
};
|
|
3002
|
-
if (this.config.multilingual) {
|
|
3003
|
-
transcribeOptions.language = options?.language || this.config.language;
|
|
3004
|
-
transcribeOptions.task = options?.task || this.config.task;
|
|
3005
|
-
}
|
|
3006
|
-
const rawResult = await this.pipeline(audioCopy, transcribeOptions);
|
|
3007
|
-
const result = Array.isArray(rawResult) ? rawResult[0] : rawResult;
|
|
3008
|
-
const inferenceTimeMs = performance.now() - inferStart;
|
|
3009
|
-
let cleanedText = result.text;
|
|
3010
|
-
if (this.config.suppressNonSpeech) {
|
|
3011
|
-
cleanedText = this.removeNonSpeechTokens(cleanedText);
|
|
3012
|
-
}
|
|
3013
|
-
const transcription = {
|
|
3014
|
-
text: cleanedText,
|
|
3015
|
-
language: this.config.language,
|
|
3016
|
-
inferenceTimeMs,
|
|
3017
|
-
chunks: result.chunks
|
|
3018
|
-
};
|
|
3019
|
-
logger4.debug("Transcription complete", {
|
|
3020
|
-
text: transcription.text,
|
|
3021
|
-
inferenceTimeMs: Math.round(inferenceTimeMs),
|
|
3022
|
-
chunksCount: result.chunks?.length || 0
|
|
3023
|
-
});
|
|
3024
|
-
span?.setAttributes({
|
|
3025
|
-
"whisper.inference_time_ms": inferenceTimeMs,
|
|
3026
|
-
"whisper.text_length": transcription.text.length
|
|
3027
|
-
});
|
|
3028
|
-
span?.end();
|
|
3029
|
-
return transcription;
|
|
3030
|
-
} catch (error) {
|
|
3031
|
-
logger4.error("Transcribe error", { error });
|
|
3032
|
-
span?.endWithError(error);
|
|
3033
|
-
throw new Error(`Whisper transcription failed: ${error}`);
|
|
3034
|
-
}
|
|
3314
|
+
const audio = new Float32Array(audioSamples);
|
|
3315
|
+
return this.queueInference(audio);
|
|
3035
3316
|
}
|
|
3036
|
-
|
|
3037
|
-
|
|
3038
|
-
|
|
3039
|
-
|
|
3040
|
-
|
|
3041
|
-
|
|
3042
|
-
|
|
3043
|
-
|
|
3044
|
-
|
|
3045
|
-
|
|
3046
|
-
|
|
3047
|
-
|
|
3048
|
-
|
|
3049
|
-
|
|
3050
|
-
|
|
3051
|
-
|
|
3052
|
-
|
|
3053
|
-
|
|
3054
|
-
|
|
3055
|
-
|
|
3056
|
-
|
|
3057
|
-
|
|
3058
|
-
|
|
3059
|
-
|
|
3060
|
-
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
|
|
3067
|
-
|
|
3068
|
-
|
|
3069
|
-
|
|
3070
|
-
|
|
3071
|
-
|
|
3072
|
-
|
|
3073
|
-
|
|
3074
|
-
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3317
|
+
queueInference(audio) {
|
|
3318
|
+
return new Promise((resolve, reject) => {
|
|
3319
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
3320
|
+
const telemetry = getTelemetry();
|
|
3321
|
+
const span = telemetry?.startSpan("SenseVoice.transcribe", {
|
|
3322
|
+
"inference.backend": this._backend,
|
|
3323
|
+
"inference.input_samples": audio.length
|
|
3324
|
+
});
|
|
3325
|
+
try {
|
|
3326
|
+
const startTime = performance.now();
|
|
3327
|
+
const preprocessStart = performance.now();
|
|
3328
|
+
const fbank = computeKaldiFbank(audio, 16e3, 80);
|
|
3329
|
+
const numFrames = fbank.length / 80;
|
|
3330
|
+
if (numFrames === 0) {
|
|
3331
|
+
resolve({
|
|
3332
|
+
text: "",
|
|
3333
|
+
inferenceTimeMs: performance.now() - startTime,
|
|
3334
|
+
preprocessTimeMs: performance.now() - preprocessStart
|
|
3335
|
+
});
|
|
3336
|
+
return;
|
|
3337
|
+
}
|
|
3338
|
+
const lfrFeatures = applyLFR(fbank, 80, 7, 6);
|
|
3339
|
+
const numLfrFrames = lfrFeatures.length / 560;
|
|
3340
|
+
if (this.negMean && this.invStddev) {
|
|
3341
|
+
applyCMVN(lfrFeatures, 560, this.negMean, this.invStddev);
|
|
3342
|
+
}
|
|
3343
|
+
const preprocessTimeMs = performance.now() - preprocessStart;
|
|
3344
|
+
const ort = this.ort;
|
|
3345
|
+
const feeds = {
|
|
3346
|
+
x: new ort.Tensor("float32", lfrFeatures, [1, numLfrFrames, 560]),
|
|
3347
|
+
x_length: new ort.Tensor("int32", new Int32Array([numLfrFrames]), [1]),
|
|
3348
|
+
language: new ort.Tensor("int32", new Int32Array([this.languageId]), [1]),
|
|
3349
|
+
text_norm: new ort.Tensor("int32", new Int32Array([this.textNormId]), [1])
|
|
3350
|
+
};
|
|
3351
|
+
const results = await this.session.run(feeds);
|
|
3352
|
+
const logitsOutput = results["logits"];
|
|
3353
|
+
if (!logitsOutput) {
|
|
3354
|
+
throw new Error('Model output missing "logits" tensor');
|
|
3355
|
+
}
|
|
3356
|
+
const logitsData = logitsOutput.data;
|
|
3357
|
+
const logitsDims = logitsOutput.dims;
|
|
3358
|
+
const seqLen = logitsDims[1];
|
|
3359
|
+
const vocabSize = logitsDims[2];
|
|
3360
|
+
const decoded = ctcGreedyDecode(logitsData, seqLen, vocabSize, this.tokenMap);
|
|
3361
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
3362
|
+
logger4.trace("Transcription complete", {
|
|
3363
|
+
text: decoded.text.substring(0, 50),
|
|
3364
|
+
language: decoded.language,
|
|
3365
|
+
emotion: decoded.emotion,
|
|
3366
|
+
event: decoded.event,
|
|
3367
|
+
preprocessTimeMs: Math.round(preprocessTimeMs * 100) / 100,
|
|
3368
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3369
|
+
numFrames,
|
|
3370
|
+
numLfrFrames
|
|
3371
|
+
});
|
|
3372
|
+
span?.setAttributes({
|
|
3373
|
+
"inference.duration_ms": inferenceTimeMs,
|
|
3374
|
+
"inference.preprocess_ms": preprocessTimeMs,
|
|
3375
|
+
"inference.num_frames": numFrames,
|
|
3376
|
+
"inference.text_length": decoded.text.length
|
|
3377
|
+
});
|
|
3378
|
+
span?.end();
|
|
3379
|
+
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
3380
|
+
model: "sensevoice",
|
|
3381
|
+
backend: this._backend
|
|
3382
|
+
});
|
|
3383
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
3384
|
+
model: "sensevoice",
|
|
3385
|
+
backend: this._backend,
|
|
3386
|
+
status: "success"
|
|
3387
|
+
});
|
|
3388
|
+
resolve({
|
|
3389
|
+
text: decoded.text,
|
|
3390
|
+
language: decoded.language,
|
|
3391
|
+
emotion: decoded.emotion,
|
|
3392
|
+
event: decoded.event,
|
|
3393
|
+
inferenceTimeMs,
|
|
3394
|
+
preprocessTimeMs
|
|
3395
|
+
});
|
|
3396
|
+
} catch (err) {
|
|
3397
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3398
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
3399
|
+
model: "sensevoice",
|
|
3400
|
+
backend: this._backend,
|
|
3401
|
+
status: "error"
|
|
3078
3402
|
});
|
|
3403
|
+
reject(err);
|
|
3079
3404
|
}
|
|
3080
|
-
}
|
|
3081
|
-
|
|
3082
|
-
onUpdate(result.text);
|
|
3083
|
-
}
|
|
3084
|
-
logger4.debug("Streaming transcription complete", {
|
|
3085
|
-
text: result.text,
|
|
3086
|
-
inferenceTimeMs: Math.round(inferenceTimeMs),
|
|
3087
|
-
chunksCount: result.chunks?.length || 0
|
|
3088
|
-
});
|
|
3089
|
-
span?.setAttributes({
|
|
3090
|
-
"whisper.inference_time_ms": inferenceTimeMs,
|
|
3091
|
-
"whisper.chunks_count": result.chunks?.length || 0
|
|
3092
|
-
});
|
|
3093
|
-
span?.end();
|
|
3094
|
-
return {
|
|
3095
|
-
text: result.text,
|
|
3096
|
-
language: this.config.language,
|
|
3097
|
-
inferenceTimeMs,
|
|
3098
|
-
chunks: result.chunks
|
|
3099
|
-
};
|
|
3100
|
-
} catch (error) {
|
|
3101
|
-
logger4.error("Streaming transcribe error", { error });
|
|
3102
|
-
span?.endWithError(error);
|
|
3103
|
-
throw new Error(`Whisper streaming transcription failed: ${error}`);
|
|
3104
|
-
}
|
|
3105
|
-
}
|
|
3106
|
-
/**
|
|
3107
|
-
* Dispose of the model and free resources
|
|
3108
|
-
*/
|
|
3109
|
-
async dispose() {
|
|
3110
|
-
if (this.pipeline) {
|
|
3111
|
-
logger4.debug("Disposing model", { model: this.currentModel });
|
|
3112
|
-
await this.pipeline.dispose();
|
|
3113
|
-
this.pipeline = null;
|
|
3114
|
-
this.currentModel = null;
|
|
3115
|
-
}
|
|
3116
|
-
}
|
|
3117
|
-
/**
|
|
3118
|
-
* Check if model is loaded
|
|
3119
|
-
*/
|
|
3120
|
-
get isLoaded() {
|
|
3121
|
-
return this.pipeline !== null;
|
|
3122
|
-
}
|
|
3123
|
-
/**
|
|
3124
|
-
* Get the backend being used (webgpu or wasm)
|
|
3125
|
-
*/
|
|
3126
|
-
get backend() {
|
|
3127
|
-
return this.actualBackend;
|
|
3128
|
-
}
|
|
3129
|
-
/**
|
|
3130
|
-
* Get the full model name used by transformers.js
|
|
3131
|
-
*/
|
|
3132
|
-
getModelName() {
|
|
3133
|
-
if (this.config.localModelPath) {
|
|
3134
|
-
return this.config.localModelPath;
|
|
3135
|
-
}
|
|
3136
|
-
let modelName = `onnx-community/whisper-${this.config.model}`;
|
|
3137
|
-
if (!this.config.multilingual) {
|
|
3138
|
-
modelName += ".en";
|
|
3139
|
-
}
|
|
3140
|
-
return modelName;
|
|
3405
|
+
});
|
|
3406
|
+
});
|
|
3141
3407
|
}
|
|
3142
|
-
|
|
3143
|
-
|
|
3144
|
-
|
|
3145
|
-
|
|
3146
|
-
|
|
3147
|
-
|
|
3148
|
-
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
return cleaned.replace(/\s+/g, " ").trim();
|
|
3408
|
+
// ─── Dispose ──────────────────────────────────────────────────────────
|
|
3409
|
+
async dispose() {
|
|
3410
|
+
if (this.session) {
|
|
3411
|
+
await this.session.release();
|
|
3412
|
+
this.session = null;
|
|
3413
|
+
}
|
|
3414
|
+
this.ort = null;
|
|
3415
|
+
this.tokenMap = null;
|
|
3416
|
+
this.negMean = null;
|
|
3417
|
+
this.invStddev = null;
|
|
3153
3418
|
}
|
|
3154
3419
|
};
|
|
3155
3420
|
|
|
@@ -4545,268 +4810,8 @@ var VADWorkerWithFallback = class {
|
|
|
4545
4810
|
}
|
|
4546
4811
|
};
|
|
4547
4812
|
|
|
4548
|
-
// src/inference/Emotion2VecInference.ts
|
|
4549
|
-
var logger10 = createLogger("Emotion2Vec");
|
|
4550
|
-
var EMOTION2VEC_LABELS = ["neutral", "happy", "angry", "sad"];
|
|
4551
|
-
var Emotion2VecInference = class {
|
|
4552
|
-
constructor(config) {
|
|
4553
|
-
this.session = null;
|
|
4554
|
-
this.ort = null;
|
|
4555
|
-
this._backend = "wasm";
|
|
4556
|
-
this.isLoading = false;
|
|
4557
|
-
this.inferenceQueue = Promise.resolve();
|
|
4558
|
-
this.config = {
|
|
4559
|
-
modelUrl: config.modelUrl,
|
|
4560
|
-
backend: config.backend ?? "auto",
|
|
4561
|
-
sampleRate: config.sampleRate ?? 16e3
|
|
4562
|
-
};
|
|
4563
|
-
}
|
|
4564
|
-
get backend() {
|
|
4565
|
-
return this.session ? this._backend : null;
|
|
4566
|
-
}
|
|
4567
|
-
get isLoaded() {
|
|
4568
|
-
return this.session !== null;
|
|
4569
|
-
}
|
|
4570
|
-
get sampleRate() {
|
|
4571
|
-
return this.config.sampleRate;
|
|
4572
|
-
}
|
|
4573
|
-
/**
|
|
4574
|
-
* Load the ONNX model
|
|
4575
|
-
*/
|
|
4576
|
-
async load() {
|
|
4577
|
-
if (this.isLoading) {
|
|
4578
|
-
throw new Error("Model is already loading");
|
|
4579
|
-
}
|
|
4580
|
-
if (this.session) {
|
|
4581
|
-
throw new Error("Model already loaded. Call dispose() first.");
|
|
4582
|
-
}
|
|
4583
|
-
this.isLoading = true;
|
|
4584
|
-
const startTime = performance.now();
|
|
4585
|
-
const telemetry = getTelemetry();
|
|
4586
|
-
const span = telemetry?.startSpan("Emotion2Vec.load", {
|
|
4587
|
-
"model.url": this.config.modelUrl,
|
|
4588
|
-
"model.backend_requested": this.config.backend
|
|
4589
|
-
});
|
|
4590
|
-
try {
|
|
4591
|
-
logger10.info("Loading ONNX Runtime...", { preference: this.config.backend });
|
|
4592
|
-
const { ort, backend } = await getOnnxRuntimeForPreference(this.config.backend);
|
|
4593
|
-
this.ort = ort;
|
|
4594
|
-
this._backend = backend;
|
|
4595
|
-
logger10.info("ONNX Runtime loaded", { backend: this._backend });
|
|
4596
|
-
logger10.info("Checking model cache...");
|
|
4597
|
-
const cache = getModelCache();
|
|
4598
|
-
const modelUrl = this.config.modelUrl;
|
|
4599
|
-
const isCached = await cache.has(modelUrl);
|
|
4600
|
-
logger10.info("Cache check complete", { modelUrl, isCached });
|
|
4601
|
-
let modelBuffer;
|
|
4602
|
-
if (isCached) {
|
|
4603
|
-
logger10.info("Loading model from cache...", { modelUrl });
|
|
4604
|
-
modelBuffer = await cache.get(modelUrl);
|
|
4605
|
-
logger10.info("Model loaded from cache", { size: formatBytes(modelBuffer.byteLength) });
|
|
4606
|
-
} else {
|
|
4607
|
-
logger10.info("Fetching model (not cached)...", { modelUrl });
|
|
4608
|
-
modelBuffer = await fetchWithCache(modelUrl);
|
|
4609
|
-
logger10.info("Model fetched and cached", { size: formatBytes(modelBuffer.byteLength) });
|
|
4610
|
-
}
|
|
4611
|
-
logger10.info("Creating ONNX session (this may take a while for large models)...");
|
|
4612
|
-
logger10.debug("Creating ONNX session", {
|
|
4613
|
-
size: formatBytes(modelBuffer.byteLength),
|
|
4614
|
-
backend: this._backend
|
|
4615
|
-
});
|
|
4616
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
4617
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
4618
|
-
this.session = await ort.InferenceSession.create(modelData, sessionOptions);
|
|
4619
|
-
const loadTimeMs = performance.now() - startTime;
|
|
4620
|
-
logger10.info("Model loaded successfully", {
|
|
4621
|
-
backend: this._backend,
|
|
4622
|
-
loadTimeMs: Math.round(loadTimeMs),
|
|
4623
|
-
sampleRate: this.config.sampleRate,
|
|
4624
|
-
inputNames: [...this.session.inputNames],
|
|
4625
|
-
outputNames: [...this.session.outputNames]
|
|
4626
|
-
});
|
|
4627
|
-
span?.setAttributes({
|
|
4628
|
-
"model.backend": this._backend,
|
|
4629
|
-
"model.load_time_ms": loadTimeMs,
|
|
4630
|
-
"model.cached": isCached
|
|
4631
|
-
});
|
|
4632
|
-
span?.end();
|
|
4633
|
-
telemetry?.recordHistogram("omote.model.load_time", loadTimeMs, {
|
|
4634
|
-
model: "emotion2vec",
|
|
4635
|
-
backend: this._backend
|
|
4636
|
-
});
|
|
4637
|
-
return {
|
|
4638
|
-
backend: this._backend,
|
|
4639
|
-
loadTimeMs,
|
|
4640
|
-
inputNames: [...this.session.inputNames],
|
|
4641
|
-
outputNames: [...this.session.outputNames],
|
|
4642
|
-
sampleRate: this.config.sampleRate
|
|
4643
|
-
};
|
|
4644
|
-
} catch (error) {
|
|
4645
|
-
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
4646
|
-
telemetry?.incrementCounter("omote.errors.total", 1, {
|
|
4647
|
-
model: "emotion2vec",
|
|
4648
|
-
error_type: "load_failed"
|
|
4649
|
-
});
|
|
4650
|
-
throw error;
|
|
4651
|
-
} finally {
|
|
4652
|
-
this.isLoading = false;
|
|
4653
|
-
}
|
|
4654
|
-
}
|
|
4655
|
-
/**
|
|
4656
|
-
* Run emotion inference on audio samples
|
|
4657
|
-
*
|
|
4658
|
-
* @param audio - Float32Array of 16kHz audio samples
|
|
4659
|
-
* @returns Frame-level emotion results at 50Hz
|
|
4660
|
-
*/
|
|
4661
|
-
async infer(audio) {
|
|
4662
|
-
if (!this.session) {
|
|
4663
|
-
throw new Error("Model not loaded. Call load() first.");
|
|
4664
|
-
}
|
|
4665
|
-
return this.queueInference(audio);
|
|
4666
|
-
}
|
|
4667
|
-
queueInference(audio) {
|
|
4668
|
-
const audioCopy = new Float32Array(audio);
|
|
4669
|
-
return new Promise((resolve, reject) => {
|
|
4670
|
-
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
4671
|
-
const telemetry = getTelemetry();
|
|
4672
|
-
const span = telemetry?.startSpan("Emotion2Vec.infer", {
|
|
4673
|
-
"inference.backend": this._backend,
|
|
4674
|
-
"inference.audio_samples": audioCopy.length
|
|
4675
|
-
});
|
|
4676
|
-
try {
|
|
4677
|
-
const startTime = performance.now();
|
|
4678
|
-
const inputTensor = new this.ort.Tensor("float32", audioCopy, [1, audioCopy.length]);
|
|
4679
|
-
const results = await this.session.run({ audio: inputTensor });
|
|
4680
|
-
const logitsTensor = results["logits"];
|
|
4681
|
-
const embeddingsTensor = results["layer_norm_25"];
|
|
4682
|
-
if (!logitsTensor) {
|
|
4683
|
-
throw new Error(
|
|
4684
|
-
`Missing logits tensor from SUPERB model. Got outputs: ${Object.keys(results).join(", ")}`
|
|
4685
|
-
);
|
|
4686
|
-
}
|
|
4687
|
-
const logitsData = logitsTensor.data;
|
|
4688
|
-
const logits = new Float32Array(logitsData);
|
|
4689
|
-
const probs = this.softmax(logits);
|
|
4690
|
-
const probabilities = {
|
|
4691
|
-
neutral: probs[0],
|
|
4692
|
-
happy: probs[1],
|
|
4693
|
-
angry: probs[2],
|
|
4694
|
-
sad: probs[3]
|
|
4695
|
-
};
|
|
4696
|
-
let maxIdx = 0;
|
|
4697
|
-
let maxProb = probs[0];
|
|
4698
|
-
for (let i = 1; i < probs.length; i++) {
|
|
4699
|
-
if (probs[i] > maxProb) {
|
|
4700
|
-
maxProb = probs[i];
|
|
4701
|
-
maxIdx = i;
|
|
4702
|
-
}
|
|
4703
|
-
}
|
|
4704
|
-
const dominant = {
|
|
4705
|
-
emotion: EMOTION2VEC_LABELS[maxIdx],
|
|
4706
|
-
confidence: maxProb,
|
|
4707
|
-
probabilities
|
|
4708
|
-
};
|
|
4709
|
-
let embeddings = [];
|
|
4710
|
-
let numFrames = 1;
|
|
4711
|
-
if (embeddingsTensor) {
|
|
4712
|
-
const embeddingData = embeddingsTensor.data;
|
|
4713
|
-
const dims = embeddingsTensor.dims;
|
|
4714
|
-
if (dims.length === 3) {
|
|
4715
|
-
numFrames = dims[1];
|
|
4716
|
-
const embeddingDim = dims[2];
|
|
4717
|
-
for (let i = 0; i < numFrames; i++) {
|
|
4718
|
-
const start = i * embeddingDim;
|
|
4719
|
-
embeddings.push(new Float32Array(embeddingData.slice(start, start + embeddingDim)));
|
|
4720
|
-
}
|
|
4721
|
-
}
|
|
4722
|
-
}
|
|
4723
|
-
const frames = [];
|
|
4724
|
-
for (let i = 0; i < numFrames; i++) {
|
|
4725
|
-
frames.push({
|
|
4726
|
-
emotion: dominant.emotion,
|
|
4727
|
-
confidence: dominant.confidence,
|
|
4728
|
-
probabilities: { ...probabilities }
|
|
4729
|
-
});
|
|
4730
|
-
}
|
|
4731
|
-
const inferenceTimeMs = performance.now() - startTime;
|
|
4732
|
-
logger10.debug("Emotion inference completed", {
|
|
4733
|
-
numFrames,
|
|
4734
|
-
dominant: dominant.emotion,
|
|
4735
|
-
confidence: Math.round(dominant.confidence * 100),
|
|
4736
|
-
inferenceTimeMs: Math.round(inferenceTimeMs)
|
|
4737
|
-
});
|
|
4738
|
-
span?.setAttributes({
|
|
4739
|
-
"inference.duration_ms": inferenceTimeMs,
|
|
4740
|
-
"inference.num_frames": numFrames,
|
|
4741
|
-
"inference.dominant_emotion": dominant.emotion
|
|
4742
|
-
});
|
|
4743
|
-
span?.end();
|
|
4744
|
-
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
4745
|
-
model: "emotion2vec",
|
|
4746
|
-
backend: this._backend
|
|
4747
|
-
});
|
|
4748
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4749
|
-
model: "emotion2vec",
|
|
4750
|
-
backend: this._backend,
|
|
4751
|
-
status: "success"
|
|
4752
|
-
});
|
|
4753
|
-
resolve({
|
|
4754
|
-
frames,
|
|
4755
|
-
dominant,
|
|
4756
|
-
embeddings,
|
|
4757
|
-
logits,
|
|
4758
|
-
inferenceTimeMs
|
|
4759
|
-
});
|
|
4760
|
-
} catch (err) {
|
|
4761
|
-
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
4762
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
4763
|
-
model: "emotion2vec",
|
|
4764
|
-
backend: this._backend,
|
|
4765
|
-
status: "error"
|
|
4766
|
-
});
|
|
4767
|
-
reject(err);
|
|
4768
|
-
}
|
|
4769
|
-
});
|
|
4770
|
-
});
|
|
4771
|
-
}
|
|
4772
|
-
/**
|
|
4773
|
-
* Apply softmax to convert logits to probabilities
|
|
4774
|
-
*/
|
|
4775
|
-
softmax(logits) {
|
|
4776
|
-
let max = logits[0];
|
|
4777
|
-
for (let i = 1; i < logits.length; i++) {
|
|
4778
|
-
if (logits[i] > max) max = logits[i];
|
|
4779
|
-
}
|
|
4780
|
-
const exp = new Float32Array(logits.length);
|
|
4781
|
-
let sum = 0;
|
|
4782
|
-
for (let i = 0; i < logits.length; i++) {
|
|
4783
|
-
exp[i] = Math.exp(logits[i] - max);
|
|
4784
|
-
sum += exp[i];
|
|
4785
|
-
}
|
|
4786
|
-
const probs = new Float32Array(logits.length);
|
|
4787
|
-
for (let i = 0; i < logits.length; i++) {
|
|
4788
|
-
probs[i] = exp[i] / sum;
|
|
4789
|
-
}
|
|
4790
|
-
return probs;
|
|
4791
|
-
}
|
|
4792
|
-
/**
|
|
4793
|
-
* Dispose of the model and free resources
|
|
4794
|
-
*/
|
|
4795
|
-
async dispose() {
|
|
4796
|
-
if (this.session) {
|
|
4797
|
-
await this.session.release();
|
|
4798
|
-
this.session = null;
|
|
4799
|
-
}
|
|
4800
|
-
}
|
|
4801
|
-
};
|
|
4802
|
-
/**
|
|
4803
|
-
* Check if WebGPU is available and working
|
|
4804
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
4805
|
-
*/
|
|
4806
|
-
Emotion2VecInference.isWebGPUAvailable = isWebGPUAvailable;
|
|
4807
|
-
|
|
4808
4813
|
// src/inference/SafariSpeechRecognition.ts
|
|
4809
|
-
var
|
|
4814
|
+
var logger10 = createLogger("SafariSpeech");
|
|
4810
4815
|
var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
4811
4816
|
constructor(config = {}) {
|
|
4812
4817
|
this.recognition = null;
|
|
@@ -4825,7 +4830,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4825
4830
|
interimResults: config.interimResults ?? true,
|
|
4826
4831
|
maxAlternatives: config.maxAlternatives ?? 1
|
|
4827
4832
|
};
|
|
4828
|
-
|
|
4833
|
+
logger10.debug("SafariSpeechRecognition created", {
|
|
4829
4834
|
language: this.config.language,
|
|
4830
4835
|
continuous: this.config.continuous
|
|
4831
4836
|
});
|
|
@@ -4886,7 +4891,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4886
4891
|
*/
|
|
4887
4892
|
async start() {
|
|
4888
4893
|
if (this.isListening) {
|
|
4889
|
-
|
|
4894
|
+
logger10.warn("Already listening");
|
|
4890
4895
|
return;
|
|
4891
4896
|
}
|
|
4892
4897
|
if (!_SafariSpeechRecognition.isAvailable()) {
|
|
@@ -4916,7 +4921,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4916
4921
|
this.isListening = true;
|
|
4917
4922
|
this.startTime = performance.now();
|
|
4918
4923
|
this.accumulatedText = "";
|
|
4919
|
-
|
|
4924
|
+
logger10.info("Speech recognition started", {
|
|
4920
4925
|
language: this.config.language
|
|
4921
4926
|
});
|
|
4922
4927
|
span?.end();
|
|
@@ -4931,7 +4936,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4931
4936
|
*/
|
|
4932
4937
|
async stop() {
|
|
4933
4938
|
if (!this.isListening || !this.recognition) {
|
|
4934
|
-
|
|
4939
|
+
logger10.warn("Not currently listening");
|
|
4935
4940
|
return {
|
|
4936
4941
|
text: this.accumulatedText,
|
|
4937
4942
|
language: this.config.language,
|
|
@@ -4960,7 +4965,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4960
4965
|
if (this.recognition && this.isListening) {
|
|
4961
4966
|
this.recognition.abort();
|
|
4962
4967
|
this.isListening = false;
|
|
4963
|
-
|
|
4968
|
+
logger10.info("Speech recognition aborted");
|
|
4964
4969
|
}
|
|
4965
4970
|
}
|
|
4966
4971
|
/**
|
|
@@ -4991,7 +4996,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
4991
4996
|
this.isListening = false;
|
|
4992
4997
|
this.resultCallbacks = [];
|
|
4993
4998
|
this.errorCallbacks = [];
|
|
4994
|
-
|
|
4999
|
+
logger10.debug("SafariSpeechRecognition disposed");
|
|
4995
5000
|
}
|
|
4996
5001
|
/**
|
|
4997
5002
|
* Set up event handlers for the recognition instance
|
|
@@ -5019,7 +5024,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5019
5024
|
confidence: alternative.confidence
|
|
5020
5025
|
};
|
|
5021
5026
|
this.emitResult(speechResult);
|
|
5022
|
-
|
|
5027
|
+
logger10.trace("Speech result", {
|
|
5023
5028
|
text: text.substring(0, 50),
|
|
5024
5029
|
isFinal,
|
|
5025
5030
|
confidence: alternative.confidence
|
|
@@ -5029,12 +5034,12 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5029
5034
|
span?.end();
|
|
5030
5035
|
} catch (error) {
|
|
5031
5036
|
span?.endWithError(error instanceof Error ? error : new Error(String(error)));
|
|
5032
|
-
|
|
5037
|
+
logger10.error("Error processing speech result", { error });
|
|
5033
5038
|
}
|
|
5034
5039
|
};
|
|
5035
5040
|
this.recognition.onerror = (event) => {
|
|
5036
5041
|
const error = new Error(`Speech recognition error: ${event.error} - ${event.message}`);
|
|
5037
|
-
|
|
5042
|
+
logger10.error("Speech recognition error", { error: event.error, message: event.message });
|
|
5038
5043
|
this.emitError(error);
|
|
5039
5044
|
if (this.stopRejecter) {
|
|
5040
5045
|
this.stopRejecter(error);
|
|
@@ -5044,7 +5049,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5044
5049
|
};
|
|
5045
5050
|
this.recognition.onend = () => {
|
|
5046
5051
|
this.isListening = false;
|
|
5047
|
-
|
|
5052
|
+
logger10.info("Speech recognition ended", {
|
|
5048
5053
|
totalText: this.accumulatedText.length,
|
|
5049
5054
|
durationMs: performance.now() - this.startTime
|
|
5050
5055
|
});
|
|
@@ -5061,13 +5066,13 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5061
5066
|
}
|
|
5062
5067
|
};
|
|
5063
5068
|
this.recognition.onstart = () => {
|
|
5064
|
-
|
|
5069
|
+
logger10.debug("Speech recognition started by browser");
|
|
5065
5070
|
};
|
|
5066
5071
|
this.recognition.onspeechstart = () => {
|
|
5067
|
-
|
|
5072
|
+
logger10.debug("Speech detected");
|
|
5068
5073
|
};
|
|
5069
5074
|
this.recognition.onspeechend = () => {
|
|
5070
|
-
|
|
5075
|
+
logger10.debug("Speech ended");
|
|
5071
5076
|
};
|
|
5072
5077
|
}
|
|
5073
5078
|
/**
|
|
@@ -5078,7 +5083,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5078
5083
|
try {
|
|
5079
5084
|
callback(result);
|
|
5080
5085
|
} catch (error) {
|
|
5081
|
-
|
|
5086
|
+
logger10.error("Error in result callback", { error });
|
|
5082
5087
|
}
|
|
5083
5088
|
}
|
|
5084
5089
|
}
|
|
@@ -5090,7 +5095,7 @@ var SafariSpeechRecognition = class _SafariSpeechRecognition {
|
|
|
5090
5095
|
try {
|
|
5091
5096
|
callback(error);
|
|
5092
5097
|
} catch (callbackError) {
|
|
5093
|
-
|
|
5098
|
+
logger10.error("Error in error callback", { error: callbackError });
|
|
5094
5099
|
}
|
|
5095
5100
|
}
|
|
5096
5101
|
}
|
|
@@ -5264,7 +5269,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5264
5269
|
this._sessionId = null;
|
|
5265
5270
|
this._isConnected = false;
|
|
5266
5271
|
// Sub-components
|
|
5267
|
-
this.
|
|
5272
|
+
this.asr = null;
|
|
5268
5273
|
this.vad = null;
|
|
5269
5274
|
this.lam = null;
|
|
5270
5275
|
this.pipeline = null;
|
|
@@ -5303,7 +5308,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5303
5308
|
try {
|
|
5304
5309
|
const authToken = await this.getAuthToken(config.tenant);
|
|
5305
5310
|
await Promise.all([
|
|
5306
|
-
this.
|
|
5311
|
+
this.initASR(),
|
|
5307
5312
|
this.initLAM()
|
|
5308
5313
|
]);
|
|
5309
5314
|
await this.connectWebSocket(authToken, config);
|
|
@@ -5333,7 +5338,7 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5333
5338
|
this.ws = null;
|
|
5334
5339
|
}
|
|
5335
5340
|
await Promise.all([
|
|
5336
|
-
this.
|
|
5341
|
+
this.asr?.dispose(),
|
|
5337
5342
|
this.vad?.dispose(),
|
|
5338
5343
|
this.lam?.dispose()
|
|
5339
5344
|
]);
|
|
@@ -5465,16 +5470,15 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5465
5470
|
});
|
|
5466
5471
|
return token;
|
|
5467
5472
|
}
|
|
5468
|
-
async
|
|
5473
|
+
async initASR() {
|
|
5469
5474
|
await Promise.all([
|
|
5470
|
-
//
|
|
5475
|
+
// SenseVoice ASR
|
|
5471
5476
|
(async () => {
|
|
5472
|
-
this.
|
|
5473
|
-
|
|
5474
|
-
|
|
5475
|
-
language: "en"
|
|
5477
|
+
this.asr = new SenseVoiceInference({
|
|
5478
|
+
modelUrl: "/models/sensevoice/model.int8.onnx",
|
|
5479
|
+
language: "auto"
|
|
5476
5480
|
});
|
|
5477
|
-
await this.
|
|
5481
|
+
await this.asr.load();
|
|
5478
5482
|
})(),
|
|
5479
5483
|
// Silero VAD for accurate voice activity detection
|
|
5480
5484
|
(async () => {
|
|
@@ -5660,17 +5664,17 @@ var AgentCoreAdapter = class extends EventEmitter {
|
|
|
5660
5664
|
console.debug("[AgentCore] Skipping silent audio", { rms, samples: audio.length });
|
|
5661
5665
|
return;
|
|
5662
5666
|
}
|
|
5663
|
-
if (this.
|
|
5667
|
+
if (this.asr) {
|
|
5664
5668
|
this.setState("listening");
|
|
5665
5669
|
this.emit("user.speech.start", { timestamp: Date.now() });
|
|
5666
|
-
this.
|
|
5670
|
+
this.asr.transcribe(audio).then((result) => {
|
|
5667
5671
|
this.emit("user.transcript.final", {
|
|
5668
5672
|
text: result.text,
|
|
5669
5673
|
confidence: 1
|
|
5670
5674
|
});
|
|
5671
5675
|
this.emit("user.speech.end", { timestamp: Date.now(), durationMs: result.inferenceTimeMs });
|
|
5672
5676
|
const cleanText = result.text.trim();
|
|
5673
|
-
if (cleanText
|
|
5677
|
+
if (cleanText) {
|
|
5674
5678
|
this.sendText(cleanText).catch((error) => {
|
|
5675
5679
|
console.error("[AgentCore] Send text error:", error);
|
|
5676
5680
|
});
|
|
@@ -6484,228 +6488,6 @@ var InterruptionHandler = class extends EventEmitter {
|
|
|
6484
6488
|
}
|
|
6485
6489
|
};
|
|
6486
6490
|
|
|
6487
|
-
// src/cache/huggingFaceCDN.ts
|
|
6488
|
-
var HF_CDN_TEST_URL = "https://huggingface.co/Xenova/whisper-tiny/resolve/main/config.json";
|
|
6489
|
-
function parseHuggingFaceUrl(url) {
|
|
6490
|
-
const pattern = /^https:\/\/huggingface\.co\/([^/]+)\/([^/]+)\/resolve\/([^/]+)\/(.+)$/;
|
|
6491
|
-
const match = url.match(pattern);
|
|
6492
|
-
if (!match) {
|
|
6493
|
-
return null;
|
|
6494
|
-
}
|
|
6495
|
-
return {
|
|
6496
|
-
org: match[1],
|
|
6497
|
-
model: match[2],
|
|
6498
|
-
branch: match[3],
|
|
6499
|
-
file: match[4]
|
|
6500
|
-
};
|
|
6501
|
-
}
|
|
6502
|
-
async function isHuggingFaceCDNReachable(testUrl = HF_CDN_TEST_URL) {
|
|
6503
|
-
try {
|
|
6504
|
-
const response = await fetch(testUrl, {
|
|
6505
|
-
method: "HEAD",
|
|
6506
|
-
cache: "no-store"
|
|
6507
|
-
// Don't use cached response for reachability check
|
|
6508
|
-
});
|
|
6509
|
-
return response.ok;
|
|
6510
|
-
} catch {
|
|
6511
|
-
return false;
|
|
6512
|
-
}
|
|
6513
|
-
}
|
|
6514
|
-
|
|
6515
|
-
// src/utils/transformersCacheClear.ts
|
|
6516
|
-
var logger12 = createLogger("TransformersCache");
|
|
6517
|
-
async function clearTransformersCache(options) {
|
|
6518
|
-
const verbose = options?.verbose ?? true;
|
|
6519
|
-
const additionalPatterns = options?.additionalPatterns ?? [];
|
|
6520
|
-
if (!("caches" in window)) {
|
|
6521
|
-
logger12.warn("Cache API not available in this environment");
|
|
6522
|
-
return [];
|
|
6523
|
-
}
|
|
6524
|
-
try {
|
|
6525
|
-
const cacheNames = await caches.keys();
|
|
6526
|
-
const deletedCaches = [];
|
|
6527
|
-
const patterns = [
|
|
6528
|
-
"transformers",
|
|
6529
|
-
"huggingface",
|
|
6530
|
-
"onnx",
|
|
6531
|
-
...additionalPatterns
|
|
6532
|
-
];
|
|
6533
|
-
for (const cacheName of cacheNames) {
|
|
6534
|
-
const shouldDelete = patterns.some(
|
|
6535
|
-
(pattern) => cacheName.toLowerCase().includes(pattern.toLowerCase())
|
|
6536
|
-
);
|
|
6537
|
-
if (shouldDelete) {
|
|
6538
|
-
if (verbose) {
|
|
6539
|
-
logger12.info("Deleting cache", { cacheName });
|
|
6540
|
-
}
|
|
6541
|
-
const deleted = await caches.delete(cacheName);
|
|
6542
|
-
if (deleted) {
|
|
6543
|
-
deletedCaches.push(cacheName);
|
|
6544
|
-
} else if (verbose) {
|
|
6545
|
-
logger12.warn("Failed to delete cache", { cacheName });
|
|
6546
|
-
}
|
|
6547
|
-
}
|
|
6548
|
-
}
|
|
6549
|
-
if (verbose) {
|
|
6550
|
-
logger12.info("Cache clearing complete", {
|
|
6551
|
-
totalCaches: cacheNames.length,
|
|
6552
|
-
deletedCount: deletedCaches.length,
|
|
6553
|
-
deletedCaches
|
|
6554
|
-
});
|
|
6555
|
-
}
|
|
6556
|
-
return deletedCaches;
|
|
6557
|
-
} catch (error) {
|
|
6558
|
-
logger12.error("Error clearing caches", { error });
|
|
6559
|
-
throw error;
|
|
6560
|
-
}
|
|
6561
|
-
}
|
|
6562
|
-
async function clearSpecificCache(cacheName) {
|
|
6563
|
-
if (!("caches" in window)) {
|
|
6564
|
-
logger12.warn("Cache API not available in this environment");
|
|
6565
|
-
return false;
|
|
6566
|
-
}
|
|
6567
|
-
try {
|
|
6568
|
-
const deleted = await caches.delete(cacheName);
|
|
6569
|
-
logger12.info("Cache deletion attempt", { cacheName, deleted });
|
|
6570
|
-
return deleted;
|
|
6571
|
-
} catch (error) {
|
|
6572
|
-
logger12.error("Error deleting cache", { cacheName, error });
|
|
6573
|
-
return false;
|
|
6574
|
-
}
|
|
6575
|
-
}
|
|
6576
|
-
async function listCaches() {
|
|
6577
|
-
if (!("caches" in window)) {
|
|
6578
|
-
logger12.warn("Cache API not available in this environment");
|
|
6579
|
-
return [];
|
|
6580
|
-
}
|
|
6581
|
-
try {
|
|
6582
|
-
const cacheNames = await caches.keys();
|
|
6583
|
-
logger12.debug("Available caches", { cacheNames });
|
|
6584
|
-
return cacheNames;
|
|
6585
|
-
} catch (error) {
|
|
6586
|
-
logger12.error("Error listing caches", { error });
|
|
6587
|
-
return [];
|
|
6588
|
-
}
|
|
6589
|
-
}
|
|
6590
|
-
/**
 * Inspect one cached response and decide whether it looks like a usable asset
 * rather than an HTML/text page stored under the asset's URL.
 *
 * An entry is reported as valid only when all of these hold:
 *   - its status is 200,
 *   - its content-type is not `text/html` or `text/plain`,
 *   - its body neither starts with `<` (after trimming) nor contains `<!DOCTYPE`,
 *   - its content-type mentions `application/json`, `application/octet-stream` or `binary`.
 *
 * Never throws: errors are logged and reported through the `reason` field.
 *
 * @param {string} cacheName - Name of the cache to open.
 * @param {string} requestUrl - Request/URL to look up in that cache.
 * @returns {Promise<{exists: boolean, valid: boolean, contentType: (string|null), isHtml: boolean, reason: string}>}
 *   Validation summary for the entry.
 */
async function validateCachedResponse(cacheName, requestUrl) {
  // `typeof` is safe on undeclared globals; `"caches" in window` raised a
  // ReferenceError wherever `window` does not exist (workers, Node/SSR), and
  // because the check sits outside the try block that error escaped to the caller.
  if (typeof caches === "undefined") {
    return {
      exists: false,
      valid: false,
      contentType: null,
      isHtml: false,
      reason: "Cache API not available"
    };
  }
  try {
    const cache = await caches.open(cacheName);
    const response = await cache.match(requestUrl);
    if (!response) {
      return {
        exists: false,
        valid: false,
        contentType: null,
        isHtml: false,
        reason: "Not in cache"
      };
    }
    const contentType = response.headers.get("content-type");
    // text/plain is treated the same as HTML here.
    const isHtml = contentType?.includes("text/html") || contentType?.includes("text/plain");
    // Read the body from a clone so the matched response itself is not consumed.
    const text = await response.clone().text();
    const looksLikeHtml = text.trim().startsWith("<") || text.includes("<!DOCTYPE");
    const hasAcceptedType = Boolean(contentType) && (
      contentType.includes("application/json") ||
      contentType.includes("application/octet-stream") ||
      contentType.includes("binary")
    );
    const valid = response.status === 200 && !isHtml && !looksLikeHtml && hasAcceptedType;
    return {
      exists: true,
      valid,
      contentType,
      isHtml: isHtml || looksLikeHtml,
      reason: valid ? "Valid response" : `Invalid: status=${response.status}, contentType=${contentType}, isHtml=${isHtml || looksLikeHtml}`
    };
  } catch (error) {
    logger12.error("Error validating cached response", { cacheName, requestUrl, error });
    return {
      exists: false,
      valid: false,
      contentType: null,
      isHtml: false,
      reason: `Error: ${error}`
    };
  }
}
|
|
6638
|
-
/**
 * Walk every cache whose name contains "transformers" (case-insensitive) and collect
 * the entries that `validateCachedResponse` reports as present but invalid.
 *
 * `totalCaches` counts all caches in Cache Storage, not only the ones scanned.
 *
 * @returns {Promise<{totalCaches: number, scannedEntries: number, invalidEntries: Array<{cacheName: string, url: string, reason: string}>}>}
 *   Scan summary; all-zero/empty when the Cache API is unavailable.
 * @throws Rethrows (after logging) any error raised while enumerating caches or entries.
 */
async function scanForInvalidCaches() {
  // `typeof` is safe on undeclared globals; `"caches" in window` raised a
  // ReferenceError wherever `window` does not exist (workers, Node/SSR).
  if (typeof caches === "undefined") {
    return { totalCaches: 0, scannedEntries: 0, invalidEntries: [] };
  }
  const invalidEntries = [];
  let scannedEntries = 0;
  try {
    const cacheNames = await caches.keys();
    for (const cacheName of cacheNames) {
      if (!cacheName.toLowerCase().includes("transformers")) {
        continue;
      }
      const cache = await caches.open(cacheName);
      const requests = await cache.keys();
      for (const { url } of requests) {
        scannedEntries++;
        const validation = await validateCachedResponse(cacheName, url);
        if (validation.exists && !validation.valid) {
          invalidEntries.push({
            cacheName,
            url,
            reason: validation.reason || "Unknown"
          });
        }
      }
    }
    logger12.info("Cache scan complete", {
      totalCaches: cacheNames.length,
      scannedEntries,
      invalidCount: invalidEntries.length
    });
    return {
      totalCaches: cacheNames.length,
      scannedEntries,
      invalidEntries
    };
  } catch (error) {
    logger12.error("Error scanning caches", { error });
    throw error;
  }
}
|
|
6680
|
-
/**
 * Delete every cache in Cache Storage, regardless of name.
 *
 * @param {boolean} [preventRecreation=false] - When true, additionally loads the bundled
 *   transformers chunk and sets `env.useBrowserCache = false` on it.
 * @returns {Promise<number>} Number of caches for which `caches.delete` returned true,
 *   or 0 when the Cache API is unavailable.
 * @throws Rethrows (after logging) any error raised while listing/deleting caches or
 *   importing the transformers chunk.
 */
async function nukeBrowserCaches(preventRecreation = false) {
  // `typeof` is safe on undeclared globals; `"caches" in window` raised a
  // ReferenceError wherever `window` does not exist (workers, Node/SSR).
  if (typeof caches === "undefined") {
    logger12.warn("Cache API not available in this environment");
    return 0;
  }
  try {
    const cacheNames = await caches.keys();
    let deletedCount = 0;
    for (const cacheName of cacheNames) {
      if (await caches.delete(cacheName)) {
        deletedCount++;
      }
    }
    logger12.info("All browser caches cleared", {
      totalDeleted: deletedCount
    });
    if (preventRecreation) {
      const { env } = await import("./transformers.web-T5LWC34T.mjs");
      env.useBrowserCache = false;
      logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
    }
    return deletedCount;
  } catch (error) {
    logger12.error("Error nuking caches", { error });
    throw error;
  }
}
|
|
6708
|
-
|
|
6709
6491
|
// src/animation/types.ts
|
|
6710
6492
|
var DEFAULT_ANIMATION_CONFIG = {
|
|
6711
6493
|
initialState: "idle",
|
|
@@ -7245,7 +7027,6 @@ export {
|
|
|
7245
7027
|
EmotionPresets,
|
|
7246
7028
|
EmphasisDetector,
|
|
7247
7029
|
EventEmitter,
|
|
7248
|
-
HF_CDN_TEST_URL,
|
|
7249
7030
|
INFERENCE_LATENCY_BUCKETS,
|
|
7250
7031
|
InterruptionHandler,
|
|
7251
7032
|
LAMPipeline,
|
|
@@ -7259,6 +7040,7 @@ export {
|
|
|
7259
7040
|
OmoteTelemetry,
|
|
7260
7041
|
RingBuffer,
|
|
7261
7042
|
SafariSpeechRecognition,
|
|
7043
|
+
SenseVoiceInference,
|
|
7262
7044
|
SileroVADInference,
|
|
7263
7045
|
SileroVADWorker,
|
|
7264
7046
|
SyncedAudioPipeline,
|
|
@@ -7266,12 +7048,12 @@ export {
|
|
|
7266
7048
|
WAV2ARKIT_BLENDSHAPES,
|
|
7267
7049
|
Wav2ArkitCpuInference,
|
|
7268
7050
|
Wav2Vec2Inference,
|
|
7269
|
-
|
|
7051
|
+
applyCMVN,
|
|
7052
|
+
applyLFR,
|
|
7270
7053
|
blendEmotions,
|
|
7271
7054
|
calculatePeak,
|
|
7272
7055
|
calculateRMS,
|
|
7273
|
-
|
|
7274
|
-
clearTransformersCache,
|
|
7056
|
+
computeKaldiFbank,
|
|
7275
7057
|
configureCacheLimit,
|
|
7276
7058
|
configureLogging,
|
|
7277
7059
|
configureTelemetry,
|
|
@@ -7280,6 +7062,7 @@ export {
|
|
|
7280
7062
|
createLogger,
|
|
7281
7063
|
createSessionWithFallback,
|
|
7282
7064
|
createSileroVAD,
|
|
7065
|
+
ctcGreedyDecode,
|
|
7283
7066
|
fetchWithCache,
|
|
7284
7067
|
formatBytes,
|
|
7285
7068
|
getCacheConfig,
|
|
@@ -7296,7 +7079,6 @@ export {
|
|
|
7296
7079
|
getTelemetry,
|
|
7297
7080
|
hasWebGPUApi,
|
|
7298
7081
|
isAndroid,
|
|
7299
|
-
isHuggingFaceCDNReachable,
|
|
7300
7082
|
isIOS,
|
|
7301
7083
|
isIOSSafari,
|
|
7302
7084
|
isMobile,
|
|
@@ -7305,16 +7087,16 @@ export {
|
|
|
7305
7087
|
isSpeechRecognitionAvailable,
|
|
7306
7088
|
isWebGPUAvailable,
|
|
7307
7089
|
lerpEmotion,
|
|
7308
|
-
listCaches,
|
|
7309
7090
|
noopLogger,
|
|
7310
|
-
|
|
7311
|
-
|
|
7091
|
+
parseCMVNFromMetadata,
|
|
7092
|
+
parseTokensFile,
|
|
7312
7093
|
preloadModels,
|
|
7313
7094
|
preloadOnnxRuntime,
|
|
7314
7095
|
remapWav2ArkitToLam,
|
|
7315
7096
|
resetLoggingConfig,
|
|
7316
7097
|
resolveBackend,
|
|
7317
|
-
|
|
7098
|
+
resolveLanguageId,
|
|
7099
|
+
resolveTextNormId,
|
|
7318
7100
|
setLogLevel,
|
|
7319
7101
|
setLoggingEnabled,
|
|
7320
7102
|
shouldEnableWasmProxy,
|
|
@@ -7322,7 +7104,6 @@ export {
|
|
|
7322
7104
|
shouldUseNativeASR,
|
|
7323
7105
|
shouldUseServerLipSync,
|
|
7324
7106
|
supportsVADWorker,
|
|
7325
|
-
symmetrizeBlendshapes
|
|
7326
|
-
validateCachedResponse
|
|
7107
|
+
symmetrizeBlendshapes
|
|
7327
7108
|
};
|
|
7328
7109
|
//# sourceMappingURL=index.mjs.map
|