@omote/core 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -13,10 +13,10 @@ import {
13
13
  setLoggingEnabled
14
14
  } from "./chunk-ESU52TDS.mjs";
15
15
  import {
16
- env,
17
- pipeline3
18
- } from "./chunk-RI6UQ7WF.mjs";
19
- import "./chunk-NSSMTXJJ.mjs";
16
+ __webpack_exports__env,
17
+ __webpack_exports__pipeline
18
+ } from "./chunk-T465MTDX.mjs";
19
+ import "./chunk-6W7G6WE7.mjs";
20
20
 
21
21
  // src/audio/MicrophoneCapture.ts
22
22
  var MicrophoneCapture = class {
@@ -263,7 +263,7 @@ var AudioScheduler = class {
263
263
  const ctx = await this.ensureContext();
264
264
  const channels = this.options.channels ?? 1;
265
265
  if (!this.isPlaying) {
266
- this.nextPlayTime = ctx.currentTime + 0.05;
266
+ this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
267
267
  this.isPlaying = true;
268
268
  }
269
269
  const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
@@ -446,8 +446,8 @@ var AudioChunkCoalescer = class {
446
446
  var LAMPipeline = class {
447
447
  constructor(options = {}) {
448
448
  this.options = options;
449
- this.REQUIRED_SAMPLES = 16e3;
450
- // 1.0s at 16kHz (LAM requirement)
449
+ this.DEFAULT_CHUNK_SAMPLES = 16e3;
450
+ // 1.0s at 16kHz (Wav2Vec2 requirement)
451
451
  this.FRAME_RATE = 30;
452
452
  // LAM outputs 30fps
453
453
  this.buffer = new Float32Array(0);
@@ -477,19 +477,20 @@ var LAMPipeline = class {
477
477
  newBuffer.set(this.buffer, 0);
478
478
  newBuffer.set(samples, this.buffer.length);
479
479
  this.buffer = newBuffer;
480
- while (this.buffer.length >= this.REQUIRED_SAMPLES) {
481
- await this.processBuffer(lam);
480
+ const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
481
+ while (this.buffer.length >= chunkSize) {
482
+ await this.processBuffer(lam, chunkSize);
482
483
  }
483
484
  }
484
485
  /**
485
486
  * Process accumulated buffer through LAM inference
486
487
  */
487
- async processBuffer(lam) {
488
+ async processBuffer(lam, chunkSize) {
488
489
  try {
489
- const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
490
+ const toProcess = this.buffer.slice(0, chunkSize);
490
491
  const processedStartTime = this.bufferStartTime;
491
- this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
492
- const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
492
+ this.buffer = this.buffer.slice(chunkSize);
493
+ const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
493
494
  this.bufferStartTime = processedStartTime + processedDuration;
494
495
  const result = await lam.infer(toProcess);
495
496
  const frameDuration = 1 / this.FRAME_RATE;
@@ -508,35 +509,22 @@ var LAMPipeline = class {
508
509
  /**
509
510
  * Get the frame that should be displayed at the current time
510
511
  *
511
- * Automatically removes frames that have already been displayed.
512
- * This prevents memory leaks from accumulating old frames.
512
+ * Timestamp-synced playback for all backends. Audio playback is delayed
513
+ * for slow backends (WASM gets 1s head start via AudioScheduler) so
514
+ * frames are ready by the time their corresponding audio plays.
513
515
  *
514
- * Discard Window (prevents premature frame discarding):
515
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
516
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
517
- *
518
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
519
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
516
+ * Discard window is generous for WASM to handle inference jitter.
517
+ * Late frames play at RAF rate (~60fps) until caught up, then settle
518
+ * to natural 30fps pacing via timestamp gating.
520
519
  *
521
520
  * @param currentTime - Current AudioContext time
522
521
  * @param lam - LAM inference engine (optional, for backend detection)
523
522
  * @returns Current frame, or last frame as fallback, or null if no frames yet
524
523
  */
525
524
  getFrameForTime(currentTime, lam) {
526
- const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
527
- let discardedCount = 0;
525
+ const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
528
526
  while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
529
- const discarded = this.frameQueue.shift();
530
- discardedCount++;
531
- if (discardedCount === 1) {
532
- const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
533
- console.warn("[LAM] Frame(s) discarded as too old", {
534
- ageMs,
535
- discardWindowMs: discardWindow * 1e3,
536
- queueLength: this.frameQueue.length,
537
- backend: lam?.backend ?? "unknown"
538
- });
539
- }
527
+ this.frameQueue.shift();
540
528
  }
541
529
  if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
542
530
  const { frame } = this.frameQueue.shift();
@@ -555,7 +543,7 @@ var LAMPipeline = class {
555
543
  * Get current buffer fill level (0-1)
556
544
  */
557
545
  get fillLevel() {
558
- return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
546
+ return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
559
547
  }
560
548
  /**
561
549
  * Get number of frames queued
@@ -572,7 +560,7 @@ var LAMPipeline = class {
572
560
  /**
573
561
  * Flush remaining buffered audio
574
562
  *
575
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
563
+ * Processes any remaining audio in the buffer, even if less than the chunk size.
576
564
  * This ensures the final audio chunk generates blendshape frames.
577
565
  *
578
566
  * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -583,12 +571,17 @@ var LAMPipeline = class {
583
571
  if (this.buffer.length === 0) {
584
572
  return;
585
573
  }
586
- const padded = new Float32Array(this.REQUIRED_SAMPLES);
587
- padded.set(this.buffer, 0);
588
574
  const processedStartTime = this.bufferStartTime;
575
+ const sampleRate = this.options.sampleRate ?? 16e3;
576
+ const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
577
+ const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
578
+ const padded = new Float32Array(minSize);
579
+ padded.set(this.buffer, 0);
580
+ return padded;
581
+ })();
589
582
  try {
590
- const result = await lam.infer(padded);
591
- const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
583
+ const result = await lam.infer(audioToInfer);
584
+ const actualDuration = this.buffer.length / sampleRate;
592
585
  const frameDuration = 1 / this.FRAME_RATE;
593
586
  const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
594
587
  for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
@@ -647,7 +640,13 @@ var SyncedAudioPipeline = class extends EventEmitter {
647
640
  this.monitorInterval = null;
648
641
  this.frameAnimationId = null;
649
642
  const sampleRate = options.sampleRate ?? 16e3;
650
- this.scheduler = new AudioScheduler({ sampleRate });
643
+ if (!options.lam.isLoaded) {
644
+ throw new Error(
645
+ "LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
646
+ );
647
+ }
648
+ const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
649
+ this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
651
650
  this.coalescer = new AudioChunkCoalescer({
652
651
  sampleRate,
653
652
  targetDurationMs: options.chunkTargetMs ?? 200
@@ -2132,6 +2131,22 @@ async function isWebGPUAvailable() {
2132
2131
  return false;
2133
2132
  }
2134
2133
  }
2134
+ var iosWasmPatched = false;
2135
+ function applyIOSWasmMemoryPatch() {
2136
+ if (iosWasmPatched || !isIOS()) return;
2137
+ iosWasmPatched = true;
2138
+ const OrigMemory = WebAssembly.Memory;
2139
+ const MAX_IOS_PAGES = 16384;
2140
+ logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
2141
+ WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
2142
+ const patched = { ...descriptor };
2143
+ if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
2144
+ patched.maximum = MAX_IOS_PAGES;
2145
+ }
2146
+ return new OrigMemory(patched);
2147
+ };
2148
+ WebAssembly.Memory.prototype = OrigMemory.prototype;
2149
+ }
2135
2150
  function configureWasm(ort) {
2136
2151
  ort.env.wasm.wasmPaths = WASM_CDN_PATH;
2137
2152
  const numThreads = getOptimalWasmThreads();
@@ -2157,6 +2172,7 @@ async function getOnnxRuntime(backend) {
2157
2172
  return ortInstance;
2158
2173
  }
2159
2174
  logger.info(`Loading ONNX Runtime with ${backend} backend...`);
2175
+ applyIOSWasmMemoryPatch();
2160
2176
  try {
2161
2177
  if (backend === "wasm") {
2162
2178
  const module = await import("onnxruntime-web");
@@ -2781,19 +2797,19 @@ var WhisperInference = class _WhisperInference {
2781
2797
  const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
2782
2798
  const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
2783
2799
  logger4.info("Creating pipeline", { device, hasWebGPU });
2784
- env.allowLocalModels = false;
2785
- env.allowRemoteModels = true;
2786
- env.useBrowserCache = false;
2787
- env.useCustomCache = false;
2788
- env.useWasmCache = false;
2789
- if (env.backends.onnx.wasm) {
2790
- env.backends.onnx.wasm.proxy = false;
2791
- env.backends.onnx.wasm.numThreads = 1;
2800
+ __webpack_exports__env.allowLocalModels = false;
2801
+ __webpack_exports__env.allowRemoteModels = true;
2802
+ __webpack_exports__env.useBrowserCache = false;
2803
+ __webpack_exports__env.useCustomCache = false;
2804
+ __webpack_exports__env.useWasmCache = false;
2805
+ if (__webpack_exports__env.backends.onnx.wasm) {
2806
+ __webpack_exports__env.backends.onnx.wasm.proxy = false;
2807
+ __webpack_exports__env.backends.onnx.wasm.numThreads = 1;
2792
2808
  }
2793
2809
  logger4.info("Configured transformers.js env", {
2794
- allowLocalModels: env.allowLocalModels,
2795
- useBrowserCache: env.useBrowserCache,
2796
- useWasmCache: env.useWasmCache
2810
+ allowLocalModels: __webpack_exports__env.allowLocalModels,
2811
+ useBrowserCache: __webpack_exports__env.useBrowserCache,
2812
+ useWasmCache: __webpack_exports__env.useWasmCache
2797
2813
  });
2798
2814
  const pipelineOptions = {
2799
2815
  dtype: this.config.dtype,
@@ -2810,7 +2826,7 @@ var WhisperInference = class _WhisperInference {
2810
2826
  };
2811
2827
  logger4.info("Forcing WebGPU execution providers");
2812
2828
  }
2813
- this.pipeline = await pipeline3(
2829
+ this.pipeline = await __webpack_exports__pipeline(
2814
2830
  "automatic-speech-recognition",
2815
2831
  modelName,
2816
2832
  pipelineOptions
@@ -3041,6 +3057,12 @@ var Wav2ArkitCpuInference = class {
3041
3057
  this.isLoading = false;
3042
3058
  // Inference queue for handling concurrent calls
3043
3059
  this.inferenceQueue = Promise.resolve();
3060
+ /**
3061
+ * Preferred chunk size: 4000 samples (250ms at 16kHz).
3062
+ * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
3063
+ * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
3064
+ */
3065
+ this.chunkSamples = 4e3;
3044
3066
  this.config = config;
3045
3067
  }
3046
3068
  get backend() {
@@ -3073,32 +3095,78 @@ var Wav2ArkitCpuInference = class {
3073
3095
  this.ort = ort;
3074
3096
  this._backend = backend;
3075
3097
  logger5.info("ONNX Runtime loaded", { backend: this._backend });
3076
- const cache = getModelCache();
3077
3098
  const modelUrl = this.config.modelUrl;
3078
- const isCached = await cache.has(modelUrl);
3079
- let modelBuffer;
3080
- if (isCached) {
3081
- logger5.debug("Loading model from cache", { modelUrl });
3082
- modelBuffer = await cache.get(modelUrl);
3083
- if (!modelBuffer) {
3084
- logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
3085
- await cache.delete(modelUrl);
3099
+ const sessionOptions = { ...getSessionOptions(this._backend) };
3100
+ let isCached = false;
3101
+ if (isIOS() && this.config.modelDataUrl) {
3102
+ const dataFilename = this.config.modelDataUrl.split("/").pop();
3103
+ sessionOptions.externalData = [{
3104
+ path: dataFilename,
3105
+ data: this.config.modelDataUrl
3106
+ }];
3107
+ logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
3108
+ modelUrl,
3109
+ dataFile: dataFilename,
3110
+ dataUrl: this.config.modelDataUrl
3111
+ });
3112
+ this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
3113
+ } else {
3114
+ const cache = getModelCache();
3115
+ isCached = await cache.has(modelUrl);
3116
+ let modelBuffer;
3117
+ if (isCached) {
3118
+ logger5.debug("Loading model from cache", { modelUrl });
3119
+ modelBuffer = await cache.get(modelUrl);
3120
+ if (!modelBuffer) {
3121
+ logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
3122
+ await cache.delete(modelUrl);
3123
+ modelBuffer = await fetchWithCache(modelUrl);
3124
+ }
3125
+ } else {
3126
+ logger5.debug("Fetching and caching model", { modelUrl });
3086
3127
  modelBuffer = await fetchWithCache(modelUrl);
3087
3128
  }
3088
- } else {
3089
- logger5.debug("Fetching and caching model", { modelUrl });
3090
- modelBuffer = await fetchWithCache(modelUrl);
3091
- }
3092
- if (!modelBuffer) {
3093
- throw new Error(`Failed to load model: ${modelUrl}`);
3129
+ if (!modelBuffer) {
3130
+ throw new Error(`Failed to load model: ${modelUrl}`);
3131
+ }
3132
+ let externalDataBuffer;
3133
+ if (this.config.modelDataUrl) {
3134
+ const dataUrl = this.config.modelDataUrl;
3135
+ const isDataCached = await cache.has(dataUrl);
3136
+ if (isDataCached) {
3137
+ logger5.debug("Loading external data from cache", { dataUrl });
3138
+ externalDataBuffer = await cache.get(dataUrl);
3139
+ if (!externalDataBuffer) {
3140
+ logger5.warn("External data cache corruption, re-fetching", { dataUrl });
3141
+ await cache.delete(dataUrl);
3142
+ externalDataBuffer = await fetchWithCache(dataUrl);
3143
+ }
3144
+ } else {
3145
+ logger5.info("Fetching external data (this may take a while on first load)", {
3146
+ dataUrl
3147
+ });
3148
+ externalDataBuffer = await fetchWithCache(dataUrl);
3149
+ }
3150
+ logger5.debug("External data loaded", {
3151
+ size: formatBytes(externalDataBuffer.byteLength)
3152
+ });
3153
+ }
3154
+ logger5.debug("Creating ONNX session", {
3155
+ size: formatBytes(modelBuffer.byteLength),
3156
+ hasExternalData: !!externalDataBuffer,
3157
+ externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
3158
+ backend: this._backend
3159
+ });
3160
+ if (externalDataBuffer) {
3161
+ const dataFilename = this.config.modelDataUrl.split("/").pop();
3162
+ sessionOptions.externalData = [{
3163
+ path: dataFilename,
3164
+ data: new Uint8Array(externalDataBuffer)
3165
+ }];
3166
+ }
3167
+ const modelData = new Uint8Array(modelBuffer);
3168
+ this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
3094
3169
  }
3095
- logger5.debug("Creating ONNX session", {
3096
- size: formatBytes(modelBuffer.byteLength),
3097
- backend: this._backend
3098
- });
3099
- const sessionOptions = getSessionOptions(this._backend);
3100
- const modelData = new Uint8Array(modelBuffer);
3101
- this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
3102
3170
  const loadTimeMs = performance.now() - startTime;
3103
3171
  logger5.info("Model loaded successfully", {
3104
3172
  backend: this._backend,
@@ -3194,7 +3262,7 @@ var Wav2ArkitCpuInference = class {
3194
3262
  blendshapes.push(symmetrizeBlendshapes(remapped));
3195
3263
  }
3196
3264
  logger5.trace("Inference completed", {
3197
- inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
3265
+ inferenceTimeMs: Math.round(inferenceTimeMs),
3198
3266
  numFrames,
3199
3267
  inputSamples
3200
3268
  });
@@ -3260,9 +3328,10 @@ function createLipSync(config) {
3260
3328
  });
3261
3329
  }
3262
3330
  if (useCpu) {
3263
- logger6.info("Creating Wav2ArkitCpuInference (1.8MB, WASM)");
3331
+ logger6.info("Creating Wav2ArkitCpuInference (WASM)");
3264
3332
  return new Wav2ArkitCpuInference({
3265
- modelUrl: config.cpuModelUrl
3333
+ modelUrl: config.cpuModelUrl,
3334
+ modelDataUrl: config.cpuModelDataUrl
3266
3335
  });
3267
3336
  }
3268
3337
  const gpuInstance = new Wav2Vec2Inference({
@@ -3289,6 +3358,9 @@ var LipSyncWithFallback = class {
3289
3358
  get isLoaded() {
3290
3359
  return this.implementation.isLoaded;
3291
3360
  }
3361
+ get chunkSamples() {
3362
+ return this.implementation.chunkSamples;
3363
+ }
3292
3364
  async load() {
3293
3365
  try {
3294
3366
  return await this.implementation.load();
@@ -3301,7 +3373,8 @@ var LipSyncWithFallback = class {
3301
3373
  } catch {
3302
3374
  }
3303
3375
  this.implementation = new Wav2ArkitCpuInference({
3304
- modelUrl: this.config.cpuModelUrl
3376
+ modelUrl: this.config.cpuModelUrl,
3377
+ modelDataUrl: this.config.cpuModelDataUrl
3305
3378
  });
3306
3379
  this.hasFallenBack = true;
3307
3380
  logger6.info("Fallback to Wav2ArkitCpuInference successful");
@@ -3331,8 +3404,6 @@ var SileroVADInference = class {
3331
3404
  // Pre-speech buffer for capturing beginning of speech
3332
3405
  this.preSpeechBuffer = [];
3333
3406
  this.wasSpeaking = false;
3334
- // Cached sample rate tensor (int64 scalar, never changes per instance)
3335
- this.srTensor = null;
3336
3407
  const sampleRate = config.sampleRate ?? 16e3;
3337
3408
  if (sampleRate !== 8e3 && sampleRate !== 16e3) {
3338
3409
  throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
@@ -3463,24 +3534,6 @@ var SileroVADInference = class {
3463
3534
  this.context = new Float32Array(this.contextSize);
3464
3535
  this.preSpeechBuffer = [];
3465
3536
  this.wasSpeaking = false;
3466
- if (!this.srTensor) {
3467
- try {
3468
- this.srTensor = new this.ort.Tensor(
3469
- "int64",
3470
- new BigInt64Array([BigInt(this.config.sampleRate)]),
3471
- []
3472
- );
3473
- } catch (e) {
3474
- logger7.warn("BigInt64Array not available, using bigint array fallback", {
3475
- error: e instanceof Error ? e.message : String(e)
3476
- });
3477
- this.srTensor = new this.ort.Tensor(
3478
- "int64",
3479
- [BigInt(this.config.sampleRate)],
3480
- []
3481
- );
3482
- }
3483
- }
3484
3537
  }
3485
3538
  /**
3486
3539
  * Process a single audio chunk
@@ -3612,7 +3665,20 @@ var SileroVADInference = class {
3612
3665
  inputBuffer.set(audioChunkCopy, this.contextSize);
3613
3666
  const inputBufferCopy = new Float32Array(inputBuffer);
3614
3667
  const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
3615
- const srTensor = this.srTensor;
3668
+ let srTensor;
3669
+ try {
3670
+ srTensor = new this.ort.Tensor(
3671
+ "int64",
3672
+ new BigInt64Array([BigInt(this.config.sampleRate)]),
3673
+ []
3674
+ );
3675
+ } catch {
3676
+ srTensor = new this.ort.Tensor(
3677
+ "int64",
3678
+ [BigInt(this.config.sampleRate)],
3679
+ []
3680
+ );
3681
+ }
3616
3682
  const stateCopy = new Float32Array(this.state.data);
3617
3683
  const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
3618
3684
  const feeds = {
@@ -3701,7 +3767,6 @@ var SileroVADInference = class {
3701
3767
  this.session = null;
3702
3768
  }
3703
3769
  this.state = null;
3704
- this.srTensor = null;
3705
3770
  }
3706
3771
  };
3707
3772
  /**
@@ -6514,8 +6579,8 @@ async function nukeBrowserCaches(preventRecreation = false) {
6514
6579
  totalDeleted: deletedCount
6515
6580
  });
6516
6581
  if (preventRecreation) {
6517
- const { env: env2 } = await import("./transformers.web-ALDLCPHT.mjs");
6518
- env2.useBrowserCache = false;
6582
+ const { env } = await import("./transformers.web-MHLR33H6.mjs");
6583
+ env.useBrowserCache = false;
6519
6584
  logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
6520
6585
  }
6521
6586
  return deletedCount;