@omote/core 0.2.3 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/chunk-6W7G6WE7.mjs +13 -0
- package/dist/chunk-6W7G6WE7.mjs.map +1 -0
- package/dist/chunk-T465MTDX.mjs +38869 -0
- package/dist/chunk-T465MTDX.mjs.map +1 -0
- package/dist/events/index.mjs +1 -1
- package/dist/index.d.mts +32 -12
- package/dist/index.d.ts +32 -12
- package/dist/index.js +38189 -25600
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +153 -108
- package/dist/index.mjs.map +1 -1
- package/dist/logging/index.mjs +1 -1
- package/dist/transformers.web-MHLR33H6.mjs +1718 -0
- package/dist/transformers.web-MHLR33H6.mjs.map +1 -0
- package/package.json +3 -2
package/dist/index.mjs
CHANGED
@@ -13,10 +13,10 @@ import {
   setLoggingEnabled
 } from "./chunk-ESU52TDS.mjs";
 import {
-
-
-} from "./chunk-
-import "./chunk-
+  __webpack_exports__env,
+  __webpack_exports__pipeline
+} from "./chunk-T465MTDX.mjs";
+import "./chunk-6W7G6WE7.mjs";

 // src/audio/MicrophoneCapture.ts
 var MicrophoneCapture = class {
@@ -263,7 +263,7 @@ var AudioScheduler = class {
     const ctx = await this.ensureContext();
     const channels = this.options.channels ?? 1;
     if (!this.isPlaying) {
-      this.nextPlayTime = ctx.currentTime + 0.05;
+      this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
       this.isPlaying = true;
     }
     const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
@@ -446,8 +446,8 @@ var AudioChunkCoalescer = class {
 var LAMPipeline = class {
   constructor(options = {}) {
     this.options = options;
-    this.
-    // 1.0s at 16kHz (
+    this.DEFAULT_CHUNK_SAMPLES = 16e3;
+    // 1.0s at 16kHz (Wav2Vec2 requirement)
     this.FRAME_RATE = 30;
     // LAM outputs 30fps
     this.buffer = new Float32Array(0);
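
The renamed DEFAULT_CHUNK_SAMPLES (previously a fixed requirement) becomes a fallback that backends can override, which the processAudio hunk below relies on. A minimal TypeScript sketch of the resolution rule; the LipSyncBackend shape here is assumed for illustration:

// Chunk-size resolution as introduced in 0.3.1 (sketch; assumed types).
interface LipSyncBackend {
  chunkSamples?: number; // 4000 on wav2arkit_cpu, unset on Wav2Vec2
}

const DEFAULT_CHUNK_SAMPLES = 16000; // 1.0 s at 16 kHz (Wav2Vec2 requirement)

function resolveChunkSize(lam: LipSyncBackend): number {
  return lam.chunkSamples ?? DEFAULT_CHUNK_SAMPLES;
}

// 4000 samples / 16000 Hz = 0.25 s of audio per inference call
console.log(resolveChunkSize({ chunkSamples: 4000 }) / 16000); // 0.25
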
@@ -477,19 +477,20 @@ var LAMPipeline = class {
     newBuffer.set(this.buffer, 0);
     newBuffer.set(samples, this.buffer.length);
     this.buffer = newBuffer;
-
-
+    const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
+    while (this.buffer.length >= chunkSize) {
+      await this.processBuffer(lam, chunkSize);
     }
   }
   /**
    * Process accumulated buffer through LAM inference
    */
-  async processBuffer(lam) {
+  async processBuffer(lam, chunkSize) {
     try {
-      const toProcess = this.buffer.slice(0,
+      const toProcess = this.buffer.slice(0, chunkSize);
       const processedStartTime = this.bufferStartTime;
-      this.buffer = this.buffer.slice(
-      const processedDuration =
+      this.buffer = this.buffer.slice(chunkSize);
+      const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
       this.bufferStartTime = processedStartTime + processedDuration;
       const result = await lam.infer(toProcess);
       const frameDuration = 1 / this.FRAME_RATE;
@@ -508,35 +509,22 @@ var LAMPipeline = class {
   /**
    * Get the frame that should be displayed at the current time
    *
-   *
-   *
+   * Timestamp-synced playback for all backends. Audio playback is delayed
+   * for slow backends (WASM gets 1s head start via AudioScheduler) so
+   * frames are ready by the time their corresponding audio plays.
    *
-   * Discard
-   *
-   *
-   *
-   * Last-Frame-Hold: Returns last valid frame instead of null to prevent
-   * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
+   * Discard window is generous for WASM to handle inference jitter.
+   * Late frames play at RAF rate (~60fps) until caught up, then settle
+   * to natural 30fps pacing via timestamp gating.
    *
    * @param currentTime - Current AudioContext time
    * @param lam - LAM inference engine (optional, for backend detection)
    * @returns Current frame, or last frame as fallback, or null if no frames yet
    */
   getFrameForTime(currentTime, lam) {
-    const discardWindow = lam?.backend === "wasm" ?
-    let discardedCount = 0;
+    const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
     while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
-
-      discardedCount++;
-      if (discardedCount === 1) {
-        const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
-        console.warn("[LAM] Frame(s) discarded as too old", {
-          ageMs,
-          discardWindowMs: discardWindow * 1e3,
-          queueLength: this.frameQueue.length,
-          backend: lam?.backend ?? "unknown"
-        });
-      }
+      this.frameQueue.shift();
     }
     if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
       const { frame } = this.frameQueue.shift();
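
The rewritten getFrameForTime drops the discard diagnostics in favor of silently shifting stale frames. A condensed sketch of the gating logic (assumed types; the constants come from the diff):

interface TimedFrame { timestamp: number; frame: Float32Array; }

function pickFrame(
  queue: TimedFrame[],
  now: number,
  backend: "wasm" | "webgpu",
  lastFrame: Float32Array | null
): Float32Array | null {
  const discardWindow = backend === "wasm" ? 10 : 0.5; // seconds
  while (queue.length > 0 && queue[0].timestamp < now - discardWindow) {
    queue.shift(); // stale frame: drop silently (0.2.3 logged a console.warn)
  }
  if (queue.length > 0 && queue[0].timestamp <= now) {
    return queue.shift()!.frame; // due or late frame: play immediately
  }
  return lastFrame; // Last-Frame-Hold: 60 fps RAF between 30 fps LAM frames
}
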
@@ -555,7 +543,7 @@ var LAMPipeline = class {
    * Get current buffer fill level (0-1)
    */
   get fillLevel() {
-    return Math.min(1, this.buffer.length / this.
+    return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
   }
   /**
    * Get number of frames queued
@@ -572,7 +560,7 @@ var LAMPipeline = class {
   /**
    * Flush remaining buffered audio
    *
-   * Processes any remaining audio in the buffer, even if less than
+   * Processes any remaining audio in the buffer, even if less than the chunk size.
    * This ensures the final audio chunk generates blendshape frames.
    *
    * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -583,12 +571,17 @@ var LAMPipeline = class {
     if (this.buffer.length === 0) {
       return;
     }
-    const padded = new Float32Array(this.REQUIRED_SAMPLES);
-    padded.set(this.buffer, 0);
     const processedStartTime = this.bufferStartTime;
+    const sampleRate = this.options.sampleRate ?? 16e3;
+    const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
+    const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
+      const padded = new Float32Array(minSize);
+      padded.set(this.buffer, 0);
+      return padded;
+    })();
     try {
-      const result = await lam.infer(
-      const actualDuration = this.buffer.length /
+      const result = await lam.infer(audioToInfer);
+      const actualDuration = this.buffer.length / sampleRate;
       const frameDuration = 1 / this.FRAME_RATE;
       const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
       for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
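
flush() now pads only when the remainder is shorter than the backend's minimum, rather than always padding to a fixed 16000 samples. The rule in isolation (a sketch):

// Pad the tail chunk with silence only if the backend demands more samples.
function padForFlush(buffer: Float32Array, minSize: number): Float32Array {
  if (buffer.length >= minSize) return buffer; // variable-length backends pass through
  const padded = new Float32Array(minSize);    // zero-filled = silence
  padded.set(buffer, 0);
  return padded;
}
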
@@ -647,7 +640,13 @@ var SyncedAudioPipeline = class extends EventEmitter {
     this.monitorInterval = null;
     this.frameAnimationId = null;
     const sampleRate = options.sampleRate ?? 16e3;
-
+    if (!options.lam.isLoaded) {
+      throw new Error(
+        "LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
+      );
+    }
+    const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
+    this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
     this.coalescer = new AudioChunkCoalescer({
       sampleRate,
       targetDurationMs: options.chunkTargetMs ?? 200
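
The constructor now fails fast instead of guessing timing before the backend is known. Implied call order (a sketch; it assumes createLipSync and SyncedAudioPipeline are exported from the package root, and trims the config to the options visible in this diff):

import { createLipSync, SyncedAudioPipeline } from "@omote/core"; // assumed exports

const lam = createLipSync({
  cpuModelUrl: "/models/wav2arkit_cpu.onnx",          // placeholder path
  cpuModelDataUrl: "/models/wav2arkit_cpu.onnx.data", // placeholder path
});
await lam.load(); // must complete first: lam.backend drives the delay below
const pipeline = new SyncedAudioPipeline({ lam, sampleRate: 16000 });
// wasm backend  -> initialDelayS = 1    (1 s head start for slow inference)
// other backend -> initialDelayS = 0.05 (the old fixed delay)
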
@@ -2138,12 +2137,9 @@ function applyIOSWasmMemoryPatch() {
   iosWasmPatched = true;
   const OrigMemory = WebAssembly.Memory;
   const MAX_IOS_PAGES = 16384;
-  logger.info("Applying iOS WASM memory patch (
+  logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
   WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
     const patched = { ...descriptor };
-    if (patched.shared) {
-      patched.shared = false;
-    }
     if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
       patched.maximum = MAX_IOS_PAGES;
     }
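
The removed block is the behavioral change: shared memory is no longer forced off, only the maximum is clamped. WebAssembly memory is sized in 64 KiB pages, so the 16384-page cap is exactly 1 GiB, matching the new log message. A sketch of the patched descriptor logic:

const PAGE_BYTES = 64 * 1024; // wasm page size
const MAX_IOS_PAGES = 16384;  // 16384 * 64 KiB = 1 GiB

function patchDescriptor(d: WebAssembly.MemoryDescriptor): WebAssembly.MemoryDescriptor {
  const patched = { ...d };
  if (patched.maximum !== undefined && patched.maximum > MAX_IOS_PAGES) {
    patched.maximum = MAX_IOS_PAGES; // clamp to 1 GiB
  }
  return patched; // `shared` passes through untouched in 0.3.1
}

console.log(MAX_IOS_PAGES * PAGE_BYTES); // 1073741824 (1 GiB)
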
@@ -2760,7 +2756,7 @@ var WhisperInference = class _WhisperInference {
    * Check if WebGPU is available in this browser
    */
   static async isWebGPUAvailable() {
-    return
+    return isWebGPUAvailable();
   }
   /**
    * Load the Whisper model pipeline
@@ -2801,19 +2797,19 @@ var WhisperInference = class _WhisperInference {
     const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
     const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
     logger4.info("Creating pipeline", { device, hasWebGPU });
-
-
-
-
-
-    if (
-
-
+    __webpack_exports__env.allowLocalModels = false;
+    __webpack_exports__env.allowRemoteModels = true;
+    __webpack_exports__env.useBrowserCache = false;
+    __webpack_exports__env.useCustomCache = false;
+    __webpack_exports__env.useWasmCache = false;
+    if (__webpack_exports__env.backends.onnx.wasm) {
+      __webpack_exports__env.backends.onnx.wasm.proxy = false;
+      __webpack_exports__env.backends.onnx.wasm.numThreads = 1;
     }
     logger4.info("Configured transformers.js env", {
-      allowLocalModels:
-      useBrowserCache:
-      useWasmCache:
+      allowLocalModels: __webpack_exports__env.allowLocalModels,
+      useBrowserCache: __webpack_exports__env.useBrowserCache,
+      useWasmCache: __webpack_exports__env.useWasmCache
     });
     const pipelineOptions = {
       dtype: this.config.dtype,
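
The __webpack_exports__env symbol is the vendored transformers.js env re-exported from chunk-T465MTDX.mjs. Against the public package the same setup would read roughly as follows (a sketch assuming the @huggingface/transformers package; the model name is an arbitrary example, not the one the bundle uses):

import { env, pipeline } from "@huggingface/transformers";

env.allowLocalModels = false; // resolve model files remotely only
env.useBrowserCache = false;  // bypass the browser Cache API
if (env.backends.onnx.wasm) {
  env.backends.onnx.wasm.proxy = false;  // keep ORT on the calling thread
  env.backends.onnx.wasm.numThreads = 1; // no SharedArrayBuffer requirement
}

const asr = await pipeline("automatic-speech-recognition", "onnx-community/whisper-tiny.en");
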
@@ -2830,7 +2826,7 @@ var WhisperInference = class _WhisperInference {
       };
       logger4.info("Forcing WebGPU execution providers");
     }
-    this.pipeline = await
+    this.pipeline = await __webpack_exports__pipeline(
       "automatic-speech-recognition",
       modelName,
       pipelineOptions
@@ -3061,6 +3057,12 @@ var Wav2ArkitCpuInference = class {
     this.isLoading = false;
     // Inference queue for handling concurrent calls
    this.inferenceQueue = Promise.resolve();
+    /**
+     * Preferred chunk size: 4000 samples (250ms at 16kHz).
+     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
+     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
+     */
+    this.chunkSamples = 4e3;
     this.config = config;
   }
   get backend() {
@@ -3093,32 +3095,78 @@ var Wav2ArkitCpuInference = class {
     this.ort = ort;
     this._backend = backend;
     logger5.info("ONNX Runtime loaded", { backend: this._backend });
-    const cache = getModelCache();
     const modelUrl = this.config.modelUrl;
-    const
-    let
-    if (
-
-
-
-
-
+    const sessionOptions = { ...getSessionOptions(this._backend) };
+    let isCached = false;
+    if (isIOS() && this.config.modelDataUrl) {
+      const dataFilename = this.config.modelDataUrl.split("/").pop();
+      sessionOptions.externalData = [{
+        path: dataFilename,
+        data: this.config.modelDataUrl
+      }];
+      logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
+        modelUrl,
+        dataFile: dataFilename,
+        dataUrl: this.config.modelDataUrl
+      });
+      this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
+    } else {
+      const cache = getModelCache();
+      isCached = await cache.has(modelUrl);
+      let modelBuffer;
+      if (isCached) {
+        logger5.debug("Loading model from cache", { modelUrl });
+        modelBuffer = await cache.get(modelUrl);
+        if (!modelBuffer) {
+          logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
+          await cache.delete(modelUrl);
+          modelBuffer = await fetchWithCache(modelUrl);
+        }
+      } else {
+        logger5.debug("Fetching and caching model", { modelUrl });
         modelBuffer = await fetchWithCache(modelUrl);
       }
-
-
-
-
-
-
+      if (!modelBuffer) {
+        throw new Error(`Failed to load model: ${modelUrl}`);
+      }
+      let externalDataBuffer;
+      if (this.config.modelDataUrl) {
+        const dataUrl = this.config.modelDataUrl;
+        const isDataCached = await cache.has(dataUrl);
+        if (isDataCached) {
+          logger5.debug("Loading external data from cache", { dataUrl });
+          externalDataBuffer = await cache.get(dataUrl);
+          if (!externalDataBuffer) {
+            logger5.warn("External data cache corruption, re-fetching", { dataUrl });
+            await cache.delete(dataUrl);
+            externalDataBuffer = await fetchWithCache(dataUrl);
+          }
+        } else {
+          logger5.info("Fetching external data (this may take a while on first load)", {
+            dataUrl
+          });
+          externalDataBuffer = await fetchWithCache(dataUrl);
+        }
+        logger5.debug("External data loaded", {
+          size: formatBytes(externalDataBuffer.byteLength)
+        });
+      }
+      logger5.debug("Creating ONNX session", {
+        size: formatBytes(modelBuffer.byteLength),
+        hasExternalData: !!externalDataBuffer,
+        externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
+        backend: this._backend
+      });
+      if (externalDataBuffer) {
+        const dataFilename = this.config.modelDataUrl.split("/").pop();
+        sessionOptions.externalData = [{
+          path: dataFilename,
+          data: new Uint8Array(externalDataBuffer)
+        }];
+      }
+      const modelData = new Uint8Array(modelBuffer);
+      this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
     }
-    logger5.debug("Creating ONNX session", {
-      size: formatBytes(modelBuffer.byteLength),
-      backend: this._backend
-    });
-    const sessionOptions = getSessionOptions(this._backend);
-    const modelData = new Uint8Array(modelBuffer);
-    this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
     const loadTimeMs = performance.now() - startTime;
     logger5.info("Model loaded successfully", {
       backend: this._backend,
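
The hunk splits session creation into two paths: on iOS, ORT is handed URLs and streams the files itself (no large ArrayBuffers held in JS); elsewhere, both files are fetched into the model cache first. Mirrored against onnxruntime-web directly (a sketch; URLs are placeholders):

import * as ort from "onnxruntime-web";

const modelUrl = "https://example.com/wav2arkit_cpu.onnx";     // placeholder
const dataUrl = "https://example.com/wav2arkit_cpu.onnx.data"; // placeholder
const dataFilename = dataUrl.split("/").pop()!;

// iOS path: URL-valued externalData, ORT fetches internally.
const iosSession = await ort.InferenceSession.create(modelUrl, {
  externalData: [{ path: dataFilename, data: dataUrl }],
});

// Default path: pre-fetched buffers (cacheable before session creation).
const [modelBuf, dataBuf] = await Promise.all(
  [modelUrl, dataUrl].map((u) => fetch(u).then((r) => r.arrayBuffer()))
);
const session = await ort.InferenceSession.create(new Uint8Array(modelBuf), {
  externalData: [{ path: dataFilename, data: new Uint8Array(dataBuf) }],
});
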
@@ -3214,7 +3262,7 @@ var Wav2ArkitCpuInference = class {
       blendshapes.push(symmetrizeBlendshapes(remapped));
     }
     logger5.trace("Inference completed", {
-      inferenceTimeMs: Math.round(inferenceTimeMs
+      inferenceTimeMs: Math.round(inferenceTimeMs),
       numFrames,
       inputSamples
     });
@@ -3280,9 +3328,10 @@ function createLipSync(config) {
     });
   }
   if (useCpu) {
-    logger6.info("Creating Wav2ArkitCpuInference (
+    logger6.info("Creating Wav2ArkitCpuInference (WASM)");
     return new Wav2ArkitCpuInference({
-      modelUrl: config.cpuModelUrl
+      modelUrl: config.cpuModelUrl,
+      modelDataUrl: config.cpuModelDataUrl
     });
   }
   const gpuInstance = new Wav2Vec2Inference({
@@ -3309,6 +3358,9 @@ var LipSyncWithFallback = class {
   get isLoaded() {
     return this.implementation.isLoaded;
   }
+  get chunkSamples() {
+    return this.implementation.chunkSamples;
+  }
   async load() {
     try {
       return await this.implementation.load();
@@ -3321,7 +3373,8 @@ var LipSyncWithFallback = class {
     } catch {
     }
     this.implementation = new Wav2ArkitCpuInference({
-      modelUrl: this.config.cpuModelUrl
+      modelUrl: this.config.cpuModelUrl,
+      modelDataUrl: this.config.cpuModelDataUrl
     });
     this.hasFallenBack = true;
     logger6.info("Fallback to Wav2ArkitCpuInference successful");
@@ -3351,8 +3404,6 @@ var SileroVADInference = class {
     // Pre-speech buffer for capturing beginning of speech
     this.preSpeechBuffer = [];
     this.wasSpeaking = false;
-    // Cached sample rate tensor (int64 scalar, never changes per instance)
-    this.srTensor = null;
     const sampleRate = config.sampleRate ?? 16e3;
     if (sampleRate !== 8e3 && sampleRate !== 16e3) {
       throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
@@ -3483,24 +3534,6 @@ var SileroVADInference = class {
     this.context = new Float32Array(this.contextSize);
     this.preSpeechBuffer = [];
     this.wasSpeaking = false;
-    if (!this.srTensor) {
-      try {
-        this.srTensor = new this.ort.Tensor(
-          "int64",
-          new BigInt64Array([BigInt(this.config.sampleRate)]),
-          []
-        );
-      } catch (e) {
-        logger7.warn("BigInt64Array not available, using bigint array fallback", {
-          error: e instanceof Error ? e.message : String(e)
-        });
-        this.srTensor = new this.ort.Tensor(
-          "int64",
-          [BigInt(this.config.sampleRate)],
-          []
-        );
-      }
-    }
   }
   /**
    * Process a single audio chunk
@@ -3632,7 +3665,20 @@ var SileroVADInference = class {
     inputBuffer.set(audioChunkCopy, this.contextSize);
     const inputBufferCopy = new Float32Array(inputBuffer);
     const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
-
+    let srTensor;
+    try {
+      srTensor = new this.ort.Tensor(
+        "int64",
+        new BigInt64Array([BigInt(this.config.sampleRate)]),
+        []
+      );
+    } catch {
+      srTensor = new this.ort.Tensor(
+        "int64",
+        [BigInt(this.config.sampleRate)],
+        []
+      );
+    }
     const stateCopy = new Float32Array(this.state.data);
     const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
     const feeds = {
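
Creating the sample-rate tensor per call replaces the instance-cached srTensor removed in the hunks above. Standalone (a sketch against onnxruntime-web; the try/catch mirrors the diff's fallback for engines without BigInt64Array):

import * as ort from "onnxruntime-web";

// Silero VAD's `sr` input is an int64 scalar (dims: []).
function makeSrTensor(sampleRate: 8000 | 16000): ort.Tensor {
  try {
    return new ort.Tensor("int64", new BigInt64Array([BigInt(sampleRate)]), []);
  } catch {
    return new ort.Tensor("int64", [BigInt(sampleRate)], []); // bigint[] fallback
  }
}
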
@@ -3721,7 +3767,6 @@ var SileroVADInference = class {
       this.session = null;
     }
     this.state = null;
-    this.srTensor = null;
   }
 };
 /**
@@ -6534,8 +6579,8 @@ async function nukeBrowserCaches(preventRecreation = false) {
     totalDeleted: deletedCount
   });
   if (preventRecreation) {
-    const { env
-
+    const { env } = await import("./transformers.web-MHLR33H6.mjs");
+    env.useBrowserCache = false;
     logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
   }
   return deletedCount;