@omote/core 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-6W7G6WE7.mjs +13 -0
- package/dist/chunk-6W7G6WE7.mjs.map +1 -0
- package/dist/chunk-T465MTDX.mjs +38869 -0
- package/dist/chunk-T465MTDX.mjs.map +1 -0
- package/dist/events/index.mjs +1 -1
- package/dist/index.d.mts +32 -12
- package/dist/index.d.ts +32 -12
- package/dist/index.js +38205 -25596
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +168 -103
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/dist/transformers.web-MHLR33H6.mjs +1718 -0
- package/dist/transformers.web-MHLR33H6.mjs.map +1 -0
- package/package.json +3 -2
package/dist/index.mjs
CHANGED
|
@@ -13,10 +13,10 @@ import {
|
|
|
13
13
|
setLoggingEnabled
|
|
14
14
|
} from "./chunk-ESU52TDS.mjs";
|
|
15
15
|
import {
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
} from "./chunk-
|
|
19
|
-
import "./chunk-
|
|
16
|
+
__webpack_exports__env,
|
|
17
|
+
__webpack_exports__pipeline
|
|
18
|
+
} from "./chunk-T465MTDX.mjs";
|
|
19
|
+
import "./chunk-6W7G6WE7.mjs";
|
|
20
20
|
|
|
21
21
|
// src/audio/MicrophoneCapture.ts
|
|
22
22
|
var MicrophoneCapture = class {
|
|
@@ -263,7 +263,7 @@ var AudioScheduler = class {
|
|
|
263
263
|
const ctx = await this.ensureContext();
|
|
264
264
|
const channels = this.options.channels ?? 1;
|
|
265
265
|
if (!this.isPlaying) {
|
|
266
|
-
this.nextPlayTime = ctx.currentTime + 0.05;
|
|
266
|
+
this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
|
|
267
267
|
this.isPlaying = true;
|
|
268
268
|
}
|
|
269
269
|
const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
|
|
@@ -446,8 +446,8 @@ var AudioChunkCoalescer = class {
|
|
|
446
446
|
var LAMPipeline = class {
|
|
447
447
|
constructor(options = {}) {
|
|
448
448
|
this.options = options;
|
|
449
|
-
this.
|
|
450
|
-
// 1.0s at 16kHz (
|
|
449
|
+
this.DEFAULT_CHUNK_SAMPLES = 16e3;
|
|
450
|
+
// 1.0s at 16kHz (Wav2Vec2 requirement)
|
|
451
451
|
this.FRAME_RATE = 30;
|
|
452
452
|
// LAM outputs 30fps
|
|
453
453
|
this.buffer = new Float32Array(0);
|
|
@@ -477,19 +477,20 @@ var LAMPipeline = class {
|
|
|
477
477
|
newBuffer.set(this.buffer, 0);
|
|
478
478
|
newBuffer.set(samples, this.buffer.length);
|
|
479
479
|
this.buffer = newBuffer;
|
|
480
|
-
|
|
481
|
-
|
|
480
|
+
const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
|
|
481
|
+
while (this.buffer.length >= chunkSize) {
|
|
482
|
+
await this.processBuffer(lam, chunkSize);
|
|
482
483
|
}
|
|
483
484
|
}
|
|
484
485
|
/**
|
|
485
486
|
* Process accumulated buffer through LAM inference
|
|
486
487
|
*/
|
|
487
|
-
async processBuffer(lam) {
|
|
488
|
+
async processBuffer(lam, chunkSize) {
|
|
488
489
|
try {
|
|
489
|
-
const toProcess = this.buffer.slice(0,
|
|
490
|
+
const toProcess = this.buffer.slice(0, chunkSize);
|
|
490
491
|
const processedStartTime = this.bufferStartTime;
|
|
491
|
-
this.buffer = this.buffer.slice(
|
|
492
|
-
const processedDuration =
|
|
492
|
+
this.buffer = this.buffer.slice(chunkSize);
|
|
493
|
+
const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
|
|
493
494
|
this.bufferStartTime = processedStartTime + processedDuration;
|
|
494
495
|
const result = await lam.infer(toProcess);
|
|
495
496
|
const frameDuration = 1 / this.FRAME_RATE;
|
|
@@ -508,35 +509,22 @@ var LAMPipeline = class {
|
|
|
508
509
|
/**
|
|
509
510
|
* Get the frame that should be displayed at the current time
|
|
510
511
|
*
|
|
511
|
-
*
|
|
512
|
-
*
|
|
512
|
+
* Timestamp-synced playback for all backends. Audio playback is delayed
|
|
513
|
+
* for slow backends (WASM gets 1s head start via AudioScheduler) so
|
|
514
|
+
* frames are ready by the time their corresponding audio plays.
|
|
513
515
|
*
|
|
514
|
-
* Discard
|
|
515
|
-
*
|
|
516
|
-
*
|
|
517
|
-
*
|
|
518
|
-
* Last-Frame-Hold: Returns last valid frame instead of null to prevent
|
|
519
|
-
* avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
|
|
516
|
+
* Discard window is generous for WASM to handle inference jitter.
|
|
517
|
+
* Late frames play at RAF rate (~60fps) until caught up, then settle
|
|
518
|
+
* to natural 30fps pacing via timestamp gating.
|
|
520
519
|
*
|
|
521
520
|
* @param currentTime - Current AudioContext time
|
|
522
521
|
* @param lam - LAM inference engine (optional, for backend detection)
|
|
523
522
|
* @returns Current frame, or last frame as fallback, or null if no frames yet
|
|
524
523
|
*/
|
|
525
524
|
getFrameForTime(currentTime, lam) {
|
|
526
|
-
const discardWindow = lam?.backend === "wasm" ?
|
|
527
|
-
let discardedCount = 0;
|
|
525
|
+
const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
|
|
528
526
|
while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
|
|
529
|
-
|
|
530
|
-
discardedCount++;
|
|
531
|
-
if (discardedCount === 1) {
|
|
532
|
-
const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
|
|
533
|
-
console.warn("[LAM] Frame(s) discarded as too old", {
|
|
534
|
-
ageMs,
|
|
535
|
-
discardWindowMs: discardWindow * 1e3,
|
|
536
|
-
queueLength: this.frameQueue.length,
|
|
537
|
-
backend: lam?.backend ?? "unknown"
|
|
538
|
-
});
|
|
539
|
-
}
|
|
527
|
+
this.frameQueue.shift();
|
|
540
528
|
}
|
|
541
529
|
if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
|
|
542
530
|
const { frame } = this.frameQueue.shift();
|
|
@@ -555,7 +543,7 @@ var LAMPipeline = class {
|
|
|
555
543
|
* Get current buffer fill level (0-1)
|
|
556
544
|
*/
|
|
557
545
|
get fillLevel() {
|
|
558
|
-
return Math.min(1, this.buffer.length / this.
|
|
546
|
+
return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
|
|
559
547
|
}
|
|
560
548
|
/**
|
|
561
549
|
* Get number of frames queued
|
|
@@ -572,7 +560,7 @@ var LAMPipeline = class {
|
|
|
572
560
|
/**
|
|
573
561
|
* Flush remaining buffered audio
|
|
574
562
|
*
|
|
575
|
-
* Processes any remaining audio in the buffer, even if less than
|
|
563
|
+
* Processes any remaining audio in the buffer, even if less than the chunk size.
|
|
576
564
|
* This ensures the final audio chunk generates blendshape frames.
|
|
577
565
|
*
|
|
578
566
|
* Should be called when audio stream ends to prevent losing the last 0-1 seconds.
|
|
@@ -583,12 +571,17 @@ var LAMPipeline = class {
|
|
|
583
571
|
if (this.buffer.length === 0) {
|
|
584
572
|
return;
|
|
585
573
|
}
|
|
586
|
-
const padded = new Float32Array(this.REQUIRED_SAMPLES);
|
|
587
|
-
padded.set(this.buffer, 0);
|
|
588
574
|
const processedStartTime = this.bufferStartTime;
|
|
575
|
+
const sampleRate = this.options.sampleRate ?? 16e3;
|
|
576
|
+
const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
|
|
577
|
+
const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
|
|
578
|
+
const padded = new Float32Array(minSize);
|
|
579
|
+
padded.set(this.buffer, 0);
|
|
580
|
+
return padded;
|
|
581
|
+
})();
|
|
589
582
|
try {
|
|
590
|
-
const result = await lam.infer(
|
|
591
|
-
const actualDuration = this.buffer.length /
|
|
583
|
+
const result = await lam.infer(audioToInfer);
|
|
584
|
+
const actualDuration = this.buffer.length / sampleRate;
|
|
592
585
|
const frameDuration = 1 / this.FRAME_RATE;
|
|
593
586
|
const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
|
|
594
587
|
for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
|
|
@@ -647,7 +640,13 @@ var SyncedAudioPipeline = class extends EventEmitter {
|
|
|
647
640
|
this.monitorInterval = null;
|
|
648
641
|
this.frameAnimationId = null;
|
|
649
642
|
const sampleRate = options.sampleRate ?? 16e3;
|
|
650
|
-
|
|
643
|
+
if (!options.lam.isLoaded) {
|
|
644
|
+
throw new Error(
|
|
645
|
+
"LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
|
|
646
|
+
);
|
|
647
|
+
}
|
|
648
|
+
const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
|
|
649
|
+
this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
|
|
651
650
|
this.coalescer = new AudioChunkCoalescer({
|
|
652
651
|
sampleRate,
|
|
653
652
|
targetDurationMs: options.chunkTargetMs ?? 200
|
|
@@ -2132,6 +2131,22 @@ async function isWebGPUAvailable() {
|
|
|
2132
2131
|
return false;
|
|
2133
2132
|
}
|
|
2134
2133
|
}
|
|
2134
|
+
var iosWasmPatched = false;
|
|
2135
|
+
function applyIOSWasmMemoryPatch() {
|
|
2136
|
+
if (iosWasmPatched || !isIOS()) return;
|
|
2137
|
+
iosWasmPatched = true;
|
|
2138
|
+
const OrigMemory = WebAssembly.Memory;
|
|
2139
|
+
const MAX_IOS_PAGES = 16384;
|
|
2140
|
+
logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
|
|
2141
|
+
WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
|
|
2142
|
+
const patched = { ...descriptor };
|
|
2143
|
+
if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
|
|
2144
|
+
patched.maximum = MAX_IOS_PAGES;
|
|
2145
|
+
}
|
|
2146
|
+
return new OrigMemory(patched);
|
|
2147
|
+
};
|
|
2148
|
+
WebAssembly.Memory.prototype = OrigMemory.prototype;
|
|
2149
|
+
}
|
|
2135
2150
|
function configureWasm(ort) {
|
|
2136
2151
|
ort.env.wasm.wasmPaths = WASM_CDN_PATH;
|
|
2137
2152
|
const numThreads = getOptimalWasmThreads();
|
|
@@ -2157,6 +2172,7 @@ async function getOnnxRuntime(backend) {
|
|
|
2157
2172
|
return ortInstance;
|
|
2158
2173
|
}
|
|
2159
2174
|
logger.info(`Loading ONNX Runtime with ${backend} backend...`);
|
|
2175
|
+
applyIOSWasmMemoryPatch();
|
|
2160
2176
|
try {
|
|
2161
2177
|
if (backend === "wasm") {
|
|
2162
2178
|
const module = await import("onnxruntime-web");
|
|
@@ -2781,19 +2797,19 @@ var WhisperInference = class _WhisperInference {
|
|
|
2781
2797
|
const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
|
|
2782
2798
|
const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
|
|
2783
2799
|
logger4.info("Creating pipeline", { device, hasWebGPU });
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
|
|
2788
|
-
|
|
2789
|
-
if (
|
|
2790
|
-
|
|
2791
|
-
|
|
2800
|
+
__webpack_exports__env.allowLocalModels = false;
|
|
2801
|
+
__webpack_exports__env.allowRemoteModels = true;
|
|
2802
|
+
__webpack_exports__env.useBrowserCache = false;
|
|
2803
|
+
__webpack_exports__env.useCustomCache = false;
|
|
2804
|
+
__webpack_exports__env.useWasmCache = false;
|
|
2805
|
+
if (__webpack_exports__env.backends.onnx.wasm) {
|
|
2806
|
+
__webpack_exports__env.backends.onnx.wasm.proxy = false;
|
|
2807
|
+
__webpack_exports__env.backends.onnx.wasm.numThreads = 1;
|
|
2792
2808
|
}
|
|
2793
2809
|
logger4.info("Configured transformers.js env", {
|
|
2794
|
-
allowLocalModels:
|
|
2795
|
-
useBrowserCache:
|
|
2796
|
-
useWasmCache:
|
|
2810
|
+
allowLocalModels: __webpack_exports__env.allowLocalModels,
|
|
2811
|
+
useBrowserCache: __webpack_exports__env.useBrowserCache,
|
|
2812
|
+
useWasmCache: __webpack_exports__env.useWasmCache
|
|
2797
2813
|
});
|
|
2798
2814
|
const pipelineOptions = {
|
|
2799
2815
|
dtype: this.config.dtype,
|
|
@@ -2810,7 +2826,7 @@ var WhisperInference = class _WhisperInference {
|
|
|
2810
2826
|
};
|
|
2811
2827
|
logger4.info("Forcing WebGPU execution providers");
|
|
2812
2828
|
}
|
|
2813
|
-
this.pipeline = await
|
|
2829
|
+
this.pipeline = await __webpack_exports__pipeline(
|
|
2814
2830
|
"automatic-speech-recognition",
|
|
2815
2831
|
modelName,
|
|
2816
2832
|
pipelineOptions
|
|
@@ -3041,6 +3057,12 @@ var Wav2ArkitCpuInference = class {
|
|
|
3041
3057
|
this.isLoading = false;
|
|
3042
3058
|
// Inference queue for handling concurrent calls
|
|
3043
3059
|
this.inferenceQueue = Promise.resolve();
|
|
3060
|
+
/**
|
|
3061
|
+
* Preferred chunk size: 4000 samples (250ms at 16kHz).
|
|
3062
|
+
* wav2arkit_cpu accepts variable-length input, so we use smaller chunks
|
|
3063
|
+
* for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
|
|
3064
|
+
*/
|
|
3065
|
+
this.chunkSamples = 4e3;
|
|
3044
3066
|
this.config = config;
|
|
3045
3067
|
}
|
|
3046
3068
|
get backend() {
|
|
@@ -3073,32 +3095,78 @@ var Wav2ArkitCpuInference = class {
|
|
|
3073
3095
|
this.ort = ort;
|
|
3074
3096
|
this._backend = backend;
|
|
3075
3097
|
logger5.info("ONNX Runtime loaded", { backend: this._backend });
|
|
3076
|
-
const cache = getModelCache();
|
|
3077
3098
|
const modelUrl = this.config.modelUrl;
|
|
3078
|
-
const
|
|
3079
|
-
let
|
|
3080
|
-
if (
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3099
|
+
const sessionOptions = { ...getSessionOptions(this._backend) };
|
|
3100
|
+
let isCached = false;
|
|
3101
|
+
if (isIOS() && this.config.modelDataUrl) {
|
|
3102
|
+
const dataFilename = this.config.modelDataUrl.split("/").pop();
|
|
3103
|
+
sessionOptions.externalData = [{
|
|
3104
|
+
path: dataFilename,
|
|
3105
|
+
data: this.config.modelDataUrl
|
|
3106
|
+
}];
|
|
3107
|
+
logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
|
|
3108
|
+
modelUrl,
|
|
3109
|
+
dataFile: dataFilename,
|
|
3110
|
+
dataUrl: this.config.modelDataUrl
|
|
3111
|
+
});
|
|
3112
|
+
this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
|
|
3113
|
+
} else {
|
|
3114
|
+
const cache = getModelCache();
|
|
3115
|
+
isCached = await cache.has(modelUrl);
|
|
3116
|
+
let modelBuffer;
|
|
3117
|
+
if (isCached) {
|
|
3118
|
+
logger5.debug("Loading model from cache", { modelUrl });
|
|
3119
|
+
modelBuffer = await cache.get(modelUrl);
|
|
3120
|
+
if (!modelBuffer) {
|
|
3121
|
+
logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
|
|
3122
|
+
await cache.delete(modelUrl);
|
|
3123
|
+
modelBuffer = await fetchWithCache(modelUrl);
|
|
3124
|
+
}
|
|
3125
|
+
} else {
|
|
3126
|
+
logger5.debug("Fetching and caching model", { modelUrl });
|
|
3086
3127
|
modelBuffer = await fetchWithCache(modelUrl);
|
|
3087
3128
|
}
|
|
3088
|
-
|
|
3089
|
-
|
|
3090
|
-
|
|
3091
|
-
|
|
3092
|
-
|
|
3093
|
-
|
|
3129
|
+
if (!modelBuffer) {
|
|
3130
|
+
throw new Error(`Failed to load model: ${modelUrl}`);
|
|
3131
|
+
}
|
|
3132
|
+
let externalDataBuffer;
|
|
3133
|
+
if (this.config.modelDataUrl) {
|
|
3134
|
+
const dataUrl = this.config.modelDataUrl;
|
|
3135
|
+
const isDataCached = await cache.has(dataUrl);
|
|
3136
|
+
if (isDataCached) {
|
|
3137
|
+
logger5.debug("Loading external data from cache", { dataUrl });
|
|
3138
|
+
externalDataBuffer = await cache.get(dataUrl);
|
|
3139
|
+
if (!externalDataBuffer) {
|
|
3140
|
+
logger5.warn("External data cache corruption, re-fetching", { dataUrl });
|
|
3141
|
+
await cache.delete(dataUrl);
|
|
3142
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3143
|
+
}
|
|
3144
|
+
} else {
|
|
3145
|
+
logger5.info("Fetching external data (this may take a while on first load)", {
|
|
3146
|
+
dataUrl
|
|
3147
|
+
});
|
|
3148
|
+
externalDataBuffer = await fetchWithCache(dataUrl);
|
|
3149
|
+
}
|
|
3150
|
+
logger5.debug("External data loaded", {
|
|
3151
|
+
size: formatBytes(externalDataBuffer.byteLength)
|
|
3152
|
+
});
|
|
3153
|
+
}
|
|
3154
|
+
logger5.debug("Creating ONNX session", {
|
|
3155
|
+
size: formatBytes(modelBuffer.byteLength),
|
|
3156
|
+
hasExternalData: !!externalDataBuffer,
|
|
3157
|
+
externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
|
|
3158
|
+
backend: this._backend
|
|
3159
|
+
});
|
|
3160
|
+
if (externalDataBuffer) {
|
|
3161
|
+
const dataFilename = this.config.modelDataUrl.split("/").pop();
|
|
3162
|
+
sessionOptions.externalData = [{
|
|
3163
|
+
path: dataFilename,
|
|
3164
|
+
data: new Uint8Array(externalDataBuffer)
|
|
3165
|
+
}];
|
|
3166
|
+
}
|
|
3167
|
+
const modelData = new Uint8Array(modelBuffer);
|
|
3168
|
+
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
3094
3169
|
}
|
|
3095
|
-
logger5.debug("Creating ONNX session", {
|
|
3096
|
-
size: formatBytes(modelBuffer.byteLength),
|
|
3097
|
-
backend: this._backend
|
|
3098
|
-
});
|
|
3099
|
-
const sessionOptions = getSessionOptions(this._backend);
|
|
3100
|
-
const modelData = new Uint8Array(modelBuffer);
|
|
3101
|
-
this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
|
|
3102
3170
|
const loadTimeMs = performance.now() - startTime;
|
|
3103
3171
|
logger5.info("Model loaded successfully", {
|
|
3104
3172
|
backend: this._backend,
|
|
@@ -3194,7 +3262,7 @@ var Wav2ArkitCpuInference = class {
|
|
|
3194
3262
|
blendshapes.push(symmetrizeBlendshapes(remapped));
|
|
3195
3263
|
}
|
|
3196
3264
|
logger5.trace("Inference completed", {
|
|
3197
|
-
inferenceTimeMs: Math.round(inferenceTimeMs
|
|
3265
|
+
inferenceTimeMs: Math.round(inferenceTimeMs),
|
|
3198
3266
|
numFrames,
|
|
3199
3267
|
inputSamples
|
|
3200
3268
|
});
|
|
@@ -3260,9 +3328,10 @@ function createLipSync(config) {
|
|
|
3260
3328
|
});
|
|
3261
3329
|
}
|
|
3262
3330
|
if (useCpu) {
|
|
3263
|
-
logger6.info("Creating Wav2ArkitCpuInference (
|
|
3331
|
+
logger6.info("Creating Wav2ArkitCpuInference (WASM)");
|
|
3264
3332
|
return new Wav2ArkitCpuInference({
|
|
3265
|
-
modelUrl: config.cpuModelUrl
|
|
3333
|
+
modelUrl: config.cpuModelUrl,
|
|
3334
|
+
modelDataUrl: config.cpuModelDataUrl
|
|
3266
3335
|
});
|
|
3267
3336
|
}
|
|
3268
3337
|
const gpuInstance = new Wav2Vec2Inference({
|
|
@@ -3289,6 +3358,9 @@ var LipSyncWithFallback = class {
|
|
|
3289
3358
|
get isLoaded() {
|
|
3290
3359
|
return this.implementation.isLoaded;
|
|
3291
3360
|
}
|
|
3361
|
+
get chunkSamples() {
|
|
3362
|
+
return this.implementation.chunkSamples;
|
|
3363
|
+
}
|
|
3292
3364
|
async load() {
|
|
3293
3365
|
try {
|
|
3294
3366
|
return await this.implementation.load();
|
|
@@ -3301,7 +3373,8 @@ var LipSyncWithFallback = class {
|
|
|
3301
3373
|
} catch {
|
|
3302
3374
|
}
|
|
3303
3375
|
this.implementation = new Wav2ArkitCpuInference({
|
|
3304
|
-
modelUrl: this.config.cpuModelUrl
|
|
3376
|
+
modelUrl: this.config.cpuModelUrl,
|
|
3377
|
+
modelDataUrl: this.config.cpuModelDataUrl
|
|
3305
3378
|
});
|
|
3306
3379
|
this.hasFallenBack = true;
|
|
3307
3380
|
logger6.info("Fallback to Wav2ArkitCpuInference successful");
|
|
@@ -3331,8 +3404,6 @@ var SileroVADInference = class {
|
|
|
3331
3404
|
// Pre-speech buffer for capturing beginning of speech
|
|
3332
3405
|
this.preSpeechBuffer = [];
|
|
3333
3406
|
this.wasSpeaking = false;
|
|
3334
|
-
// Cached sample rate tensor (int64 scalar, never changes per instance)
|
|
3335
|
-
this.srTensor = null;
|
|
3336
3407
|
const sampleRate = config.sampleRate ?? 16e3;
|
|
3337
3408
|
if (sampleRate !== 8e3 && sampleRate !== 16e3) {
|
|
3338
3409
|
throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
|
|
@@ -3463,24 +3534,6 @@ var SileroVADInference = class {
|
|
|
3463
3534
|
this.context = new Float32Array(this.contextSize);
|
|
3464
3535
|
this.preSpeechBuffer = [];
|
|
3465
3536
|
this.wasSpeaking = false;
|
|
3466
|
-
if (!this.srTensor) {
|
|
3467
|
-
try {
|
|
3468
|
-
this.srTensor = new this.ort.Tensor(
|
|
3469
|
-
"int64",
|
|
3470
|
-
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
3471
|
-
[]
|
|
3472
|
-
);
|
|
3473
|
-
} catch (e) {
|
|
3474
|
-
logger7.warn("BigInt64Array not available, using bigint array fallback", {
|
|
3475
|
-
error: e instanceof Error ? e.message : String(e)
|
|
3476
|
-
});
|
|
3477
|
-
this.srTensor = new this.ort.Tensor(
|
|
3478
|
-
"int64",
|
|
3479
|
-
[BigInt(this.config.sampleRate)],
|
|
3480
|
-
[]
|
|
3481
|
-
);
|
|
3482
|
-
}
|
|
3483
|
-
}
|
|
3484
3537
|
}
|
|
3485
3538
|
/**
|
|
3486
3539
|
* Process a single audio chunk
|
|
@@ -3612,7 +3665,20 @@ var SileroVADInference = class {
|
|
|
3612
3665
|
inputBuffer.set(audioChunkCopy, this.contextSize);
|
|
3613
3666
|
const inputBufferCopy = new Float32Array(inputBuffer);
|
|
3614
3667
|
const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
|
|
3615
|
-
|
|
3668
|
+
let srTensor;
|
|
3669
|
+
try {
|
|
3670
|
+
srTensor = new this.ort.Tensor(
|
|
3671
|
+
"int64",
|
|
3672
|
+
new BigInt64Array([BigInt(this.config.sampleRate)]),
|
|
3673
|
+
[]
|
|
3674
|
+
);
|
|
3675
|
+
} catch {
|
|
3676
|
+
srTensor = new this.ort.Tensor(
|
|
3677
|
+
"int64",
|
|
3678
|
+
[BigInt(this.config.sampleRate)],
|
|
3679
|
+
[]
|
|
3680
|
+
);
|
|
3681
|
+
}
|
|
3616
3682
|
const stateCopy = new Float32Array(this.state.data);
|
|
3617
3683
|
const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
|
|
3618
3684
|
const feeds = {
|
|
@@ -3701,7 +3767,6 @@ var SileroVADInference = class {
|
|
|
3701
3767
|
this.session = null;
|
|
3702
3768
|
}
|
|
3703
3769
|
this.state = null;
|
|
3704
|
-
this.srTensor = null;
|
|
3705
3770
|
}
|
|
3706
3771
|
};
|
|
3707
3772
|
/**
|
|
@@ -6514,8 +6579,8 @@ async function nukeBrowserCaches(preventRecreation = false) {
|
|
|
6514
6579
|
totalDeleted: deletedCount
|
|
6515
6580
|
});
|
|
6516
6581
|
if (preventRecreation) {
|
|
6517
|
-
const { env
|
|
6518
|
-
|
|
6582
|
+
const { env } = await import("./transformers.web-MHLR33H6.mjs");
|
|
6583
|
+
env.useBrowserCache = false;
|
|
6519
6584
|
logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
|
|
6520
6585
|
}
|
|
6521
6586
|
return deletedCount;
|