@omote/core 0.2.3 → 0.3.1

package/dist/index.mjs CHANGED
@@ -13,10 +13,10 @@ import {
   setLoggingEnabled
 } from "./chunk-ESU52TDS.mjs";
 import {
-  env,
-  pipeline3
-} from "./chunk-RI6UQ7WF.mjs";
-import "./chunk-NSSMTXJJ.mjs";
+  __webpack_exports__env,
+  __webpack_exports__pipeline
+} from "./chunk-T465MTDX.mjs";
+import "./chunk-6W7G6WE7.mjs";
 
 // src/audio/MicrophoneCapture.ts
 var MicrophoneCapture = class {
@@ -263,7 +263,7 @@ var AudioScheduler = class {
     const ctx = await this.ensureContext();
     const channels = this.options.channels ?? 1;
     if (!this.isPlaying) {
-      this.nextPlayTime = ctx.currentTime + 0.05;
+      this.nextPlayTime = ctx.currentTime + (this.options.initialDelayS ?? 0.05);
       this.isPlaying = true;
     }
     const audioBuffer = ctx.createBuffer(channels, audioData.length, ctx.sampleRate);
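The fixed 50 ms priming offset on first playback becomes a configurable `initialDelayS`, added to the AudioContext clock before the first buffer is queued. A minimal standalone sketch of the pattern; the class and most option names here are illustrative, only `initialDelayS` comes from the diff:

```ts
// Sketch: delay-primed, gapless scheduling against the Web Audio clock.
interface SchedulerOptions {
  sampleRate: number;
  initialDelayS?: number; // head start before the first chunk becomes audible
}

class MiniScheduler {
  private nextPlayTime = 0;
  private isPlaying = false;

  constructor(private ctx: AudioContext, private options: SchedulerOptions) {}

  enqueue(samples: Float32Array): void {
    if (!this.isPlaying) {
      // A larger delay gives slow inference backends more time to produce
      // frames before their corresponding audio plays.
      this.nextPlayTime = this.ctx.currentTime + (this.options.initialDelayS ?? 0.05);
      this.isPlaying = true;
    }
    const buffer = this.ctx.createBuffer(1, samples.length, this.options.sampleRate);
    buffer.copyToChannel(samples, 0);
    const source = this.ctx.createBufferSource();
    source.buffer = buffer;
    source.connect(this.ctx.destination);
    source.start(this.nextPlayTime);
    this.nextPlayTime += buffer.duration; // back-to-back, no gaps
  }
}
```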
@@ -446,8 +446,8 @@ var AudioChunkCoalescer = class {
 var LAMPipeline = class {
   constructor(options = {}) {
     this.options = options;
-    this.REQUIRED_SAMPLES = 16e3;
-    // 1.0s at 16kHz (LAM requirement)
+    this.DEFAULT_CHUNK_SAMPLES = 16e3;
+    // 1.0s at 16kHz (Wav2Vec2 requirement)
     this.FRAME_RATE = 30;
     // LAM outputs 30fps
     this.buffer = new Float32Array(0);
@@ -477,19 +477,20 @@ var LAMPipeline = class {
     newBuffer.set(this.buffer, 0);
     newBuffer.set(samples, this.buffer.length);
     this.buffer = newBuffer;
-    while (this.buffer.length >= this.REQUIRED_SAMPLES) {
-      await this.processBuffer(lam);
+    const chunkSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
+    while (this.buffer.length >= chunkSize) {
+      await this.processBuffer(lam, chunkSize);
     }
   }
   /**
    * Process accumulated buffer through LAM inference
    */
-  async processBuffer(lam) {
+  async processBuffer(lam, chunkSize) {
     try {
-      const toProcess = this.buffer.slice(0, this.REQUIRED_SAMPLES);
+      const toProcess = this.buffer.slice(0, chunkSize);
       const processedStartTime = this.bufferStartTime;
-      this.buffer = this.buffer.slice(this.REQUIRED_SAMPLES);
-      const processedDuration = this.REQUIRED_SAMPLES / (this.options.sampleRate ?? 16e3);
+      this.buffer = this.buffer.slice(chunkSize);
+      const processedDuration = chunkSize / (this.options.sampleRate ?? 16e3);
       this.bufferStartTime = processedStartTime + processedDuration;
       const result = await lam.infer(toProcess);
       const frameDuration = 1 / this.FRAME_RATE;
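The drain loop now asks the backend for its preferred chunk size via `lam.chunkSamples` (4000 samples for wav2arkit_cpu, per the constructor change further down) and falls back to the 16000-sample default. A sketch of the slice-and-advance pattern under those assumptions, with a stubbed backend interface:

```ts
// Sketch: backend-negotiated chunking. `chunkSamples` and `infer` mirror the
// interface implied by the diff; names are local to this example.
interface LipSyncBackend {
  chunkSamples?: number; // preferred input size; undefined = use the default
  infer(samples: Float32Array): Promise<{ blendshapes: number[][] }>;
}

const DEFAULT_CHUNK_SAMPLES = 16_000; // 1.0 s at 16 kHz

async function drain(buffer: Float32Array, lam: LipSyncBackend): Promise<Float32Array> {
  const chunkSize = lam.chunkSamples ?? DEFAULT_CHUNK_SAMPLES;
  while (buffer.length >= chunkSize) {
    const chunk = buffer.slice(0, chunkSize);
    buffer = buffer.slice(chunkSize); // advance past the consumed samples
    await lam.infer(chunk);
  }
  return buffer; // remainder stays buffered until more audio arrives
}
```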
@@ -508,35 +509,22 @@ var LAMPipeline = class {
   /**
    * Get the frame that should be displayed at the current time
    *
-   * Automatically removes frames that have already been displayed.
-   * This prevents memory leaks from accumulating old frames.
+   * Timestamp-synced playback for all backends. Audio playback is delayed
+   * for slow backends (WASM gets 1s head start via AudioScheduler) so
+   * frames are ready by the time their corresponding audio plays.
    *
-   * Discard Window (prevents premature frame discarding):
-   * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
-   * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
-   *
-   * Last-Frame-Hold: Returns last valid frame instead of null to prevent
-   * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
+   * Discard window is generous for WASM to handle inference jitter.
+   * Late frames play at RAF rate (~60fps) until caught up, then settle
+   * to natural 30fps pacing via timestamp gating.
    *
    * @param currentTime - Current AudioContext time
    * @param lam - LAM inference engine (optional, for backend detection)
    * @returns Current frame, or last frame as fallback, or null if no frames yet
    */
   getFrameForTime(currentTime, lam) {
-    const discardWindow = lam?.backend === "wasm" ? 1 : 0.5;
-    let discardedCount = 0;
+    const discardWindow = lam?.backend === "wasm" ? 10 : 0.5;
     while (this.frameQueue.length > 0 && this.frameQueue[0].timestamp < currentTime - discardWindow) {
-      const discarded = this.frameQueue.shift();
-      discardedCount++;
-      if (discardedCount === 1) {
-        const ageMs = ((currentTime - discarded.timestamp) * 1e3).toFixed(0);
-        console.warn("[LAM] Frame(s) discarded as too old", {
-          ageMs,
-          discardWindowMs: discardWindow * 1e3,
-          queueLength: this.frameQueue.length,
-          backend: lam?.backend ?? "unknown"
-        });
-      }
+      this.frameQueue.shift();
     }
     if (this.frameQueue.length > 0 && this.frameQueue[0].timestamp <= currentTime) {
       const { frame } = this.frameQueue.shift();
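The WASM discard window widens from 1 s to 10 s and the per-discard `console.warn` instrumentation is dropped; stale frames now leave the queue silently. The gating itself is unchanged: a frame plays only once its timestamp is due, otherwise the caller holds the last shown frame. A compact sketch of that logic, with local names:

```ts
// Sketch: timestamp-gated frame retrieval with last-frame-hold.
// The window values (10 s WASM / 0.5 s WebGPU) come from this diff.
interface TimedFrame {
  timestamp: number;   // AudioContext time the frame should display at
  frame: Float32Array; // blendshape weights
}

function frameForTime(
  queue: TimedFrame[],
  currentTime: number,
  backend: "wasm" | "webgpu" | undefined,
  lastFrame: Float32Array | null
): Float32Array | null {
  const discardWindow = backend === "wasm" ? 10 : 0.5;
  // Drop frames so stale they fall outside the discard window.
  while (queue.length > 0 && queue[0].timestamp < currentTime - discardWindow) {
    queue.shift();
  }
  // Release the head frame only once its timestamp is due; otherwise hold
  // the last shown frame so the avatar never snaps to neutral (RAF runs at
  // ~60 fps against the model's 30 fps output).
  if (queue.length > 0 && queue[0].timestamp <= currentTime) {
    return queue.shift()!.frame;
  }
  return lastFrame;
}
```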
@@ -555,7 +543,7 @@ var LAMPipeline = class {
    * Get current buffer fill level (0-1)
    */
   get fillLevel() {
-    return Math.min(1, this.buffer.length / this.REQUIRED_SAMPLES);
+    return Math.min(1, this.buffer.length / this.DEFAULT_CHUNK_SAMPLES);
   }
   /**
    * Get number of frames queued
@@ -572,7 +560,7 @@ var LAMPipeline = class {
   /**
    * Flush remaining buffered audio
    *
-   * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
+   * Processes any remaining audio in the buffer, even if less than the chunk size.
    * This ensures the final audio chunk generates blendshape frames.
    *
    * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -583,12 +571,17 @@ var LAMPipeline = class {
     if (this.buffer.length === 0) {
       return;
     }
-    const padded = new Float32Array(this.REQUIRED_SAMPLES);
-    padded.set(this.buffer, 0);
     const processedStartTime = this.bufferStartTime;
+    const sampleRate = this.options.sampleRate ?? 16e3;
+    const minSize = lam.chunkSamples ?? this.DEFAULT_CHUNK_SAMPLES;
+    const audioToInfer = this.buffer.length >= minSize ? this.buffer : (() => {
+      const padded = new Float32Array(minSize);
+      padded.set(this.buffer, 0);
+      return padded;
+    })();
     try {
-      const result = await lam.infer(padded);
-      const actualDuration = this.buffer.length / (this.options.sampleRate ?? 16e3);
+      const result = await lam.infer(audioToInfer);
+      const actualDuration = this.buffer.length / sampleRate;
       const frameDuration = 1 / this.FRAME_RATE;
       const actualFrameCount = Math.ceil(actualDuration * this.FRAME_RATE);
       for (let i = 0; i < Math.min(actualFrameCount, result.blendshapes.length); i++) {
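Flush now zero-pads only when the leftover audio is shorter than the backend's minimum chunk, and frame emission is clamped to the real (unpadded) duration so the silent tail does not animate. A sketch, reusing the hypothetical `LipSyncBackend` interface and `DEFAULT_CHUNK_SAMPLES` constant from the earlier sketch:

```ts
// Sketch: flush with conditional zero-padding; emit frames only for the
// real audio, not the padded silence. The 30 fps rate matches the diff.
const FRAME_RATE = 30;

async function flush(
  buffer: Float32Array,
  lam: LipSyncBackend,
  sampleRate = 16_000
): Promise<number[][]> {
  if (buffer.length === 0) return [];
  const minSize = lam.chunkSamples ?? DEFAULT_CHUNK_SAMPLES;
  let audioToInfer = buffer;
  if (buffer.length < minSize) {
    audioToInfer = new Float32Array(minSize); // zero-padded tail
    audioToInfer.set(buffer, 0);
  }
  const result = await lam.infer(audioToInfer);
  // Clamp to frames covering the real audio duration.
  const actualFrames = Math.ceil((buffer.length / sampleRate) * FRAME_RATE);
  return result.blendshapes.slice(0, actualFrames);
}
```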
@@ -647,7 +640,13 @@ var SyncedAudioPipeline = class extends EventEmitter {
     this.monitorInterval = null;
     this.frameAnimationId = null;
     const sampleRate = options.sampleRate ?? 16e3;
-    this.scheduler = new AudioScheduler({ sampleRate });
+    if (!options.lam.isLoaded) {
+      throw new Error(
+        "LipSyncBackend must be loaded before constructing SyncedAudioPipeline. Call lam.load() first so backend type is known for timing configuration."
+      );
+    }
+    const initialDelayS = options.lam.backend === "wasm" ? 1 : 0.05;
+    this.scheduler = new AudioScheduler({ sampleRate, initialDelayS });
     this.coalescer = new AudioChunkCoalescer({
       sampleRate,
       targetDurationMs: options.chunkTargetMs ?? 200
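Because the scheduler's head start now depends on which backend actually resolved, the constructor refuses an unloaded backend instead of guessing. Call order therefore matters; a hedged usage sketch, assuming `createLipSync` and `SyncedAudioPipeline` are exported from the package entry point and using only option names visible in this diff:

```ts
// Sketch: resolve the backend before constructing the pipeline, so the
// WASM-vs-WebGPU timing choice is known up front. URLs are placeholders.
import { createLipSync, SyncedAudioPipeline } from "@omote/core";

const lam = createLipSync({
  cpuModelUrl: "/models/wav2arkit_cpu.onnx",          // placeholder
  cpuModelDataUrl: "/models/wav2arkit_cpu.onnx.data", // placeholder
});
await lam.load(); // resolves lam.backend to "wasm" or "webgpu"

// Safe now: the constructor throws if lam.isLoaded is false.
const pipeline = new SyncedAudioPipeline({ lam, sampleRate: 16000 });
```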
@@ -2138,12 +2137,9 @@ function applyIOSWasmMemoryPatch() {
   iosWasmPatched = true;
   const OrigMemory = WebAssembly.Memory;
   const MAX_IOS_PAGES = 16384;
-  logger.info("Applying iOS WASM memory patch (shared\u2192false, max\u21921GB)");
+  logger.info("Applying iOS WASM memory patch (max capped to 1GB, shared preserved)");
   WebAssembly.Memory = function IOSPatchedMemory(descriptor) {
     const patched = { ...descriptor };
-    if (patched.shared) {
-      patched.shared = false;
-    }
     if (patched.maximum !== void 0 && patched.maximum > MAX_IOS_PAGES) {
       patched.maximum = MAX_IOS_PAGES;
     }
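The patch previously downgraded `shared: true` memories to unshared, which silently breaks code expecting a SharedArrayBuffer; now it only caps `maximum`. One WASM page is 64 KiB, so 16384 pages is exactly 1 GiB. A standalone sketch of the wrapper shape (a minimal reconstruction of the patched behavior, not the full bundle code):

```ts
// Sketch: cap WebAssembly.Memory's maximum at 1 GiB (16384 * 64 KiB)
// while letting the `shared` flag pass through untouched.
const MAX_IOS_PAGES = 16384;
const OrigMemory = WebAssembly.Memory;

function IOSPatchedMemory(descriptor: WebAssembly.MemoryDescriptor): WebAssembly.Memory {
  const patched = { ...descriptor }; // `shared` is preserved as-is
  if (patched.maximum !== undefined && patched.maximum > MAX_IOS_PAGES) {
    patched.maximum = MAX_IOS_PAGES;
  }
  // Returning an object from a constructor-style function makes
  // `new IOSPatchedMemory(...)` yield the real Memory instance.
  return new OrigMemory(patched);
}
IOSPatchedMemory.prototype = OrigMemory.prototype; // keep instanceof working
(WebAssembly as { Memory: typeof WebAssembly.Memory }).Memory =
  IOSPatchedMemory as unknown as typeof WebAssembly.Memory;
```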
@@ -2760,7 +2756,7 @@ var WhisperInference = class _WhisperInference {
    * Check if WebGPU is available in this browser
    */
   static async isWebGPUAvailable() {
-    return "gpu" in navigator;
+    return isWebGPUAvailable();
   }
   /**
    * Load the Whisper model pipeline
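`"gpu" in navigator` only proves the API surface exists; it can hold on browsers where no adapter is actually obtainable. The class method now delegates to a shared `isWebGPUAvailable()` helper whose body is outside this hunk. One common, stricter shape for such a check looks like the following; this is an assumption, not the bundled implementation:

```ts
// Sketch: adapter-probing WebGPU detection, typed loosely so it compiles
// without @webgpu/types.
async function isWebGPUAvailable(): Promise<boolean> {
  const gpu = (navigator as unknown as {
    gpu?: { requestAdapter(): Promise<object | null> };
  }).gpu;
  if (!gpu) return false; // API surface not present at all
  try {
    // requestAdapter() resolves to null when the API exists but no usable
    // adapter does (software-only contexts, blocklisted drivers).
    return (await gpu.requestAdapter()) !== null;
  } catch {
    return false;
  }
}
```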
@@ -2801,19 +2797,19 @@ var WhisperInference = class _WhisperInference {
     const hasWebGPU = await _WhisperInference.isWebGPUAvailable();
     const device = this.config.device === "auto" ? hasWebGPU ? "webgpu" : "wasm" : this.config.device;
     logger4.info("Creating pipeline", { device, hasWebGPU });
-    env.allowLocalModels = false;
-    env.allowRemoteModels = true;
-    env.useBrowserCache = false;
-    env.useCustomCache = false;
-    env.useWasmCache = false;
-    if (env.backends.onnx.wasm) {
-      env.backends.onnx.wasm.proxy = false;
-      env.backends.onnx.wasm.numThreads = 1;
+    __webpack_exports__env.allowLocalModels = false;
+    __webpack_exports__env.allowRemoteModels = true;
+    __webpack_exports__env.useBrowserCache = false;
+    __webpack_exports__env.useCustomCache = false;
+    __webpack_exports__env.useWasmCache = false;
+    if (__webpack_exports__env.backends.onnx.wasm) {
+      __webpack_exports__env.backends.onnx.wasm.proxy = false;
+      __webpack_exports__env.backends.onnx.wasm.numThreads = 1;
     }
     logger4.info("Configured transformers.js env", {
-      allowLocalModels: env.allowLocalModels,
-      useBrowserCache: env.useBrowserCache,
-      useWasmCache: env.useWasmCache
+      allowLocalModels: __webpack_exports__env.allowLocalModels,
+      useBrowserCache: __webpack_exports__env.useBrowserCache,
+      useWasmCache: __webpack_exports__env.useWasmCache
     });
     const pipelineOptions = {
       dtype: this.config.dtype,
@@ -2830,7 +2826,7 @@ var WhisperInference = class _WhisperInference {
       };
       logger4.info("Forcing WebGPU execution providers");
     }
-    this.pipeline = await pipeline3(
+    this.pipeline = await __webpack_exports__pipeline(
       "automatic-speech-recognition",
       modelName,
       pipelineOptions
@@ -3061,6 +3057,12 @@ var Wav2ArkitCpuInference = class {
     this.isLoading = false;
     // Inference queue for handling concurrent calls
     this.inferenceQueue = Promise.resolve();
+    /**
+     * Preferred chunk size: 4000 samples (250ms at 16kHz).
+     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
+     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
+     */
+    this.chunkSamples = 4e3;
     this.config = config;
   }
   get backend() {
@@ -3093,32 +3095,78 @@ var Wav2ArkitCpuInference = class {
     this.ort = ort;
     this._backend = backend;
     logger5.info("ONNX Runtime loaded", { backend: this._backend });
-    const cache = getModelCache();
     const modelUrl = this.config.modelUrl;
-    const isCached = await cache.has(modelUrl);
-    let modelBuffer;
-    if (isCached) {
-      logger5.debug("Loading model from cache", { modelUrl });
-      modelBuffer = await cache.get(modelUrl);
-      if (!modelBuffer) {
-        logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
-        await cache.delete(modelUrl);
+    const sessionOptions = { ...getSessionOptions(this._backend) };
+    let isCached = false;
+    if (isIOS() && this.config.modelDataUrl) {
+      const dataFilename = this.config.modelDataUrl.split("/").pop();
+      sessionOptions.externalData = [{
+        path: dataFilename,
+        data: this.config.modelDataUrl
+      }];
+      logger5.info("iOS: URL-based session creation (ORT handles fetch internally)", {
+        modelUrl,
+        dataFile: dataFilename,
+        dataUrl: this.config.modelDataUrl
+      });
+      this.session = await this.ort.InferenceSession.create(modelUrl, sessionOptions);
+    } else {
+      const cache = getModelCache();
+      isCached = await cache.has(modelUrl);
+      let modelBuffer;
+      if (isCached) {
+        logger5.debug("Loading model from cache", { modelUrl });
+        modelBuffer = await cache.get(modelUrl);
+        if (!modelBuffer) {
+          logger5.warn("Cache corruption detected, clearing and retrying", { modelUrl });
+          await cache.delete(modelUrl);
+          modelBuffer = await fetchWithCache(modelUrl);
+        }
+      } else {
+        logger5.debug("Fetching and caching model", { modelUrl });
         modelBuffer = await fetchWithCache(modelUrl);
       }
-    } else {
-      logger5.debug("Fetching and caching model", { modelUrl });
-      modelBuffer = await fetchWithCache(modelUrl);
-    }
-    if (!modelBuffer) {
-      throw new Error(`Failed to load model: ${modelUrl}`);
+      if (!modelBuffer) {
+        throw new Error(`Failed to load model: ${modelUrl}`);
+      }
+      let externalDataBuffer;
+      if (this.config.modelDataUrl) {
+        const dataUrl = this.config.modelDataUrl;
+        const isDataCached = await cache.has(dataUrl);
+        if (isDataCached) {
+          logger5.debug("Loading external data from cache", { dataUrl });
+          externalDataBuffer = await cache.get(dataUrl);
+          if (!externalDataBuffer) {
+            logger5.warn("External data cache corruption, re-fetching", { dataUrl });
+            await cache.delete(dataUrl);
+            externalDataBuffer = await fetchWithCache(dataUrl);
+          }
+        } else {
+          logger5.info("Fetching external data (this may take a while on first load)", {
+            dataUrl
+          });
+          externalDataBuffer = await fetchWithCache(dataUrl);
+        }
+        logger5.debug("External data loaded", {
+          size: formatBytes(externalDataBuffer.byteLength)
+        });
+      }
+      logger5.debug("Creating ONNX session", {
+        size: formatBytes(modelBuffer.byteLength),
+        hasExternalData: !!externalDataBuffer,
+        externalDataSize: externalDataBuffer ? formatBytes(externalDataBuffer.byteLength) : void 0,
+        backend: this._backend
+      });
+      if (externalDataBuffer) {
+        const dataFilename = this.config.modelDataUrl.split("/").pop();
+        sessionOptions.externalData = [{
+          path: dataFilename,
+          data: new Uint8Array(externalDataBuffer)
+        }];
+      }
+      const modelData = new Uint8Array(modelBuffer);
+      this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
     }
-    logger5.debug("Creating ONNX session", {
-      size: formatBytes(modelBuffer.byteLength),
-      backend: this._backend
-    });
-    const sessionOptions = getSessionOptions(this._backend);
-    const modelData = new Uint8Array(modelBuffer);
-    this.session = await this.ort.InferenceSession.create(modelData, sessionOptions);
     const loadTimeMs = performance.now() - startTime;
     logger5.info("Model loaded successfully", {
       backend: this._backend,
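The new `modelDataUrl` config supports ONNX models whose weights live in a separate external-data file. onnxruntime-web takes these through the `externalData` session option, where `path` must match the filename recorded inside the graph and `data` may be either pre-fetched bytes or a URL for ORT to fetch itself (the iOS branch above uses the URL form to avoid buffering the file in JS). A hedged sketch with placeholder URLs:

```ts
// Sketch: ort-web session creation with external weight data.
// The externalData option shape is onnxruntime-web's; URLs are placeholders.
import * as ort from "onnxruntime-web";

async function createSession(modelUrl: string, dataUrl?: string) {
  const options: ort.InferenceSession.SessionOptions = {};
  if (dataUrl) {
    options.externalData = [{
      // Must match the path recorded inside the .onnx graph, typically the
      // data file's basename.
      path: dataUrl.split("/").pop()!,
      // A URL string lets ORT fetch the bytes itself (the iOS branch above);
      // a Uint8Array hands over pre-fetched, cacheable bytes instead.
      data: dataUrl,
    }];
  }
  return ort.InferenceSession.create(modelUrl, options);
}
```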
@@ -3214,7 +3262,7 @@ var Wav2ArkitCpuInference = class {
       blendshapes.push(symmetrizeBlendshapes(remapped));
     }
     logger5.trace("Inference completed", {
-      inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
+      inferenceTimeMs: Math.round(inferenceTimeMs),
       numFrames,
       inputSamples
     });
@@ -3280,9 +3328,10 @@ function createLipSync(config) {
     });
   }
   if (useCpu) {
-    logger6.info("Creating Wav2ArkitCpuInference (1.8MB, WASM)");
+    logger6.info("Creating Wav2ArkitCpuInference (WASM)");
     return new Wav2ArkitCpuInference({
-      modelUrl: config.cpuModelUrl
+      modelUrl: config.cpuModelUrl,
+      modelDataUrl: config.cpuModelDataUrl
     });
   }
   const gpuInstance = new Wav2Vec2Inference({
@@ -3309,6 +3358,9 @@ var LipSyncWithFallback = class {
   get isLoaded() {
     return this.implementation.isLoaded;
   }
+  get chunkSamples() {
+    return this.implementation.chunkSamples;
+  }
   async load() {
     try {
       return await this.implementation.load();
@@ -3321,7 +3373,8 @@
     } catch {
     }
     this.implementation = new Wav2ArkitCpuInference({
-      modelUrl: this.config.cpuModelUrl
+      modelUrl: this.config.cpuModelUrl,
+      modelDataUrl: this.config.cpuModelDataUrl
    });
    this.hasFallenBack = true;
    logger6.info("Fallback to Wav2ArkitCpuInference successful");
@@ -3351,8 +3404,6 @@ var SileroVADInference = class {
     // Pre-speech buffer for capturing beginning of speech
     this.preSpeechBuffer = [];
     this.wasSpeaking = false;
-    // Cached sample rate tensor (int64 scalar, never changes per instance)
-    this.srTensor = null;
     const sampleRate = config.sampleRate ?? 16e3;
     if (sampleRate !== 8e3 && sampleRate !== 16e3) {
       throw new Error("Silero VAD only supports 8000 or 16000 Hz sample rates");
@@ -3483,24 +3534,6 @@ var SileroVADInference = class {
     this.context = new Float32Array(this.contextSize);
     this.preSpeechBuffer = [];
     this.wasSpeaking = false;
-    if (!this.srTensor) {
-      try {
-        this.srTensor = new this.ort.Tensor(
-          "int64",
-          new BigInt64Array([BigInt(this.config.sampleRate)]),
-          []
-        );
-      } catch (e) {
-        logger7.warn("BigInt64Array not available, using bigint array fallback", {
-          error: e instanceof Error ? e.message : String(e)
-        });
-        this.srTensor = new this.ort.Tensor(
-          "int64",
-          [BigInt(this.config.sampleRate)],
-          []
-        );
-      }
-    }
   }
   /**
    * Process a single audio chunk
@@ -3632,7 +3665,20 @@ var SileroVADInference = class {
     inputBuffer.set(audioChunkCopy, this.contextSize);
     const inputBufferCopy = new Float32Array(inputBuffer);
     const inputTensor = new this.ort.Tensor("float32", inputBufferCopy, [1, inputSize]);
-    const srTensor = this.srTensor;
+    let srTensor;
+    try {
+      srTensor = new this.ort.Tensor(
+        "int64",
+        new BigInt64Array([BigInt(this.config.sampleRate)]),
+        []
+      );
+    } catch {
+      srTensor = new this.ort.Tensor(
+        "int64",
+        [BigInt(this.config.sampleRate)],
+        []
+      );
+    }
     const stateCopy = new Float32Array(this.state.data);
     const stateTensor = new this.ort.Tensor("float32", stateCopy, this.state.dims);
     const feeds = {
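This reverts the cached-srTensor optimization from 0.2.x: the int64 scalar is rebuilt on every call. The diff doesn't state the motivation; one plausible reason is that reusing a tensor across `session.run()` calls can be unsafe if the runtime detaches or takes ownership of its buffer. The construction itself, with the same BigInt64Array fallback the bundle ships, distilled:

```ts
// Sketch: per-call int64 scalar tensor for the VAD's sample-rate input.
// Shape [] marks a scalar; the plain bigint[] fallback covers engines
// that lack BigInt64Array (older WebKit builds).
import { Tensor } from "onnxruntime-web";

function makeSampleRateTensor(sampleRate: number): Tensor {
  try {
    return new Tensor("int64", new BigInt64Array([BigInt(sampleRate)]), []);
  } catch {
    return new Tensor("int64", [BigInt(sampleRate)], []);
  }
}
```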
@@ -3721,7 +3767,6 @@ var SileroVADInference = class {
       this.session = null;
     }
     this.state = null;
-    this.srTensor = null;
   }
 };
 /**
@@ -6534,8 +6579,8 @@ async function nukeBrowserCaches(preventRecreation = false) {
     totalDeleted: deletedCount
   });
   if (preventRecreation) {
-    const { env: env2 } = await import("./transformers.web-ALDLCPHT.mjs");
-    env2.useBrowserCache = false;
+    const { env } = await import("./transformers.web-MHLR33H6.mjs");
+    env.useBrowserCache = false;
     logger12.warn("Browser cache creation disabled (env.useBrowserCache = false)");
   }
   return deletedCount;