npm - @omote/core - Versions diffs - 0.3.1 → 0.3.25 - Mend

@omote/core 0.3.1 → 0.3.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/dist/{chunk-T465MTDX.mjs → chunk-B6TIE56N.mjs} +63 -1153
package/dist/chunk-B6TIE56N.mjs.map +1 -0
package/dist/events/index.mjs +1 -1
package/dist/index.d.mts +86 -45
package/dist/index.d.ts +86 -45
package/dist/index.js +313 -1428
package/dist/index.js.map +1 -1
package/dist/index.mjs +241 -124
package/dist/index.mjs.map +1 -1
package/dist/logging/index.mjs +1 -1
package/dist/{transformers.web-MHLR33H6.mjs → transformers.web-T5LWC34T.mjs} +3 -3
package/package.json +2 -3
package/dist/chunk-6W7G6WE7.mjs +0 -13
package/dist/chunk-C3Y37HKD.mjs +0 -26378
package/dist/chunk-C3Y37HKD.mjs.map +0 -1
package/dist/chunk-RI6UQ7WF.mjs +0 -26378
package/dist/chunk-RI6UQ7WF.mjs.map +0 -1
package/dist/chunk-T465MTDX.mjs.map +0 -1
package/dist/transformers.web-4C62MDO6.mjs +0 -1724
package/dist/transformers.web-4C62MDO6.mjs.map +0 -1
package/dist/transformers.web-ALDLCPHT.mjs +0 -1725
package/dist/transformers.web-ALDLCPHT.mjs.map +0 -1
package/dist/transformers.web-MHLR33H6.mjs.map +0 -1
package/dist/{chunk-6W7G6WE7.mjs.map → transformers.web-T5LWC34T.mjs.map} +0 -0

package/dist/events/index.mjs CHANGED

@@ -1,7 +1,7 @@
 import {
   EventEmitter
 } from "../chunk-XK22BRG4.mjs";
-import "../chunk-6W7G6WE7.mjs";
+import "../chunk-NSSMTXJJ.mjs";
 export {
   EventEmitter
 };

package/dist/index.d.mts CHANGED

@@ -27,11 +27,19 @@ declare class MicrophoneCapture {
     private buffer;
     private _isRecording;
     private _loggedFirstChunk;
+    /** Actual AudioContext sample rate (may differ from target on Firefox) */
+    private _nativeSampleRate;
     constructor(events: EventEmitter<OmoteEvents>, config?: MicrophoneCaptureConfig);
     get isRecording(): boolean;
     get isSupported(): boolean;
     start(): Promise<void>;
     stop(): void;
+    /**
+     * Resample audio using linear interpolation.
+     * Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
+     * and we need to downsample to the target rate (e.g. 16kHz).
+     */
+    private resample;
     private floatToPCM16;
 }
@@ -98,12 +106,11 @@ interface AudioSchedulerOptions {
     /** Number of audio channels (default: 1 for mono) */
     channels?: number;
     /**
-     * Delay before first audio chunk plays (seconds).
-     * Gives slow inference backends (WASM) a head start so lip sync
-     * frames are ready by the time audio reaches the listener.
-     * Default: 0.05 (50ms — just enough to enqueue the first node)
+     * Initial lookahead delay in seconds before first audio plays.
+     * Gives LAM inference time to compute blendshapes before audio starts.
+     * Default: 0.05 (50ms) for WebGPU, increase to 0.3-0.5 for WASM on iOS.
      */
-    initialDelayS?: number;
+    initialLookaheadSec?: number;
 }
 declare class AudioScheduler {
     private readonly options;
@@ -373,13 +380,14 @@ declare function isSafari(): boolean;
 /**
  * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
  *
- * All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
- * have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
- * 384MB LAM model stack-overflows in WASM mode.
- * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
- * output at 22x real-time on CPU/WASM.
+ * All iOS browsers use WebKit and have tight memory limits — the 384MB
+ * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
+ * (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
+ *
+ * macOS Safari also needs this due to ONNX Runtime JSEP/ASYNCIFY bugs
+ * that crash WebKit's JIT compiler.
  *
- * @returns true if on Safari or any iOS browser (should use CPU lip sync model)
+ * @returns true if iOS (any browser) or Safari (any platform)
  */
 declare function shouldUseCpuLipSync(): boolean;
 /**
@@ -400,7 +408,7 @@ declare function isSpeechRecognitionAvailable(): boolean;
  * - Battery-efficient (no WASM overhead)
  * - No model download needed (saves 30-150MB)
  *
- * @returns true if on iOS with Speech API available
+ * @returns true if on iOS or Safari with Speech API available
  */
 declare function shouldUseNativeASR(): boolean;
 /**
@@ -419,7 +427,7 @@ declare function shouldUseServerLipSync(): boolean;
 /**
  * Common interface for lip sync inference backends
  *
- * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
  * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
  * work with either model transparently.
  *
@@ -454,19 +462,15 @@ interface LipSyncResult {
  *
  * Implemented by:
  * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
- * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ * - Wav2ArkitCpuInference (WASM-only, 404MB, lip sync only)
  */
 interface LipSyncBackend {
+    /** Model identifier for backend-specific tuning (e.g. audio delay) */
+    readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
     /** Current backend type (webgpu, wasm, or null if not loaded) */
     readonly backend: RuntimeBackend | null;
     /** Whether the model is loaded and ready for inference */
     readonly isLoaded: boolean;
-    /**
-     * Preferred number of audio samples per inference chunk.
-     * Models with variable-length input can use smaller values for lower latency.
-     * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
-     */
-    readonly chunkSamples?: number;
     /**
      * Load the ONNX model
      * @returns Model loading information
@@ -529,7 +533,7 @@ interface LAMPipelineOptions {
 }
 declare class LAMPipeline {
     private readonly options;
-    private readonly DEFAULT_CHUNK_SAMPLES;
+    private readonly REQUIRED_SAMPLES;
     private readonly FRAME_RATE;
     private buffer;
     private bufferStartTime;
@@ -558,13 +562,15 @@ declare class LAMPipeline {
     /**
      * Get the frame that should be displayed at the current time
      *
-     * Timestamp-synced playback for all backends. Audio playback is delayed
-     * for slow backends (WASM gets 1s head start via AudioScheduler) so
-     * frames are ready by the time their corresponding audio plays.
+     * Automatically removes frames that have already been displayed.
+     * This prevents memory leaks from accumulating old frames.
+     *
+     * Discard Window (prevents premature frame discarding):
+     * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
+     * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
      *
-     * Discard window is generous for WASM to handle inference jitter.
-     * Late frames play at RAF rate (~60fps) until caught up, then settle
-     * to natural 30fps pacing via timestamp gating.
+     * Last-Frame-Hold: Returns last valid frame instead of null to prevent
+     * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
      *
      * @param currentTime - Current AudioContext time
      * @param lam - LAM inference engine (optional, for backend detection)
@@ -592,7 +598,7 @@ declare class LAMPipeline {
     /**
      * Flush remaining buffered audio
      *
-     * Processes any remaining audio in the buffer, even if less than the chunk size.
+     * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
      * This ensures the final audio chunk generates blendshape frames.
      *
      * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -645,6 +651,12 @@ interface SyncedAudioPipelineOptions {
     chunkTargetMs?: number;
     /** LAM inference engine */
     lam: LipSyncBackend;
+    /**
+     * Audio playback delay in ms before first audio plays.
+     * Gives LAM inference time to pre-compute blendshapes.
+     * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
+     */
+    audioDelayMs?: number;
 }
 interface SyncedAudioPipelineEvents {
     /** New frame ready for display */
@@ -838,6 +850,17 @@ declare function getLoadedBackend(): RuntimeBackend | null;
  * Check if ONNX Runtime has been loaded
  */
 declare function isOnnxRuntimeLoaded(): boolean;
+/**
+ * Preload ONNX Runtime and compile the WASM binary early
+ *
+ * Call this before loading heavy resources (Three.js, VRM models) to ensure
+ * WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
+ * Uses the singleton pattern — subsequent model loading reuses this instance.
+ *
+ * @param preference Backend preference (default: 'auto')
+ * @returns The resolved backend that was loaded
+ */
+declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
 /**
  * Whisper Automatic Speech Recognition using transformers.js
@@ -1036,6 +1059,13 @@ type InferenceBackend = BackendPreference;
 interface Wav2Vec2InferenceConfig {
     /** Path or URL to the ONNX model */
     modelUrl: string;
+    /**
+     * Path or URL to external model data file (.onnx.data weights).
+     * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
+     *
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    externalDataUrl?: string | false;
     /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
     backend?: InferenceBackend;
     /** Number of identity classes (default: 12 for streaming model) */
@@ -1066,7 +1096,8 @@ interface Wav2Vec2Result {
     /** Inference time in ms */
     inferenceTimeMs: number;
 }
-declare class Wav2Vec2Inference {
+declare class Wav2Vec2Inference implements LipSyncBackend {
+    readonly modelId: "wav2vec2";
     private session;
     private ort;
     private config;
@@ -1116,12 +1147,16 @@ declare class Wav2Vec2Inference {
 /**
  * CPU-optimized lip sync inference using wav2arkit_cpu model
  *
- * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
- * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * The model uses ONNX external data format:
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
+ * - wav2arkit_cpu.onnx.data (402MB weights)
+ * Both files are fetched and cached automatically.
  *
  * Key differences from Wav2Vec2Inference:
- * - WASM-only backend (CPU-optimized, no WebGPU)
- * - 1.8MB model vs 384MB
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
  * - No identity input (baked to identity 11)
  * - No ASR output (lip sync only)
  * - Dynamic input length (not fixed to 16000 samples)
@@ -1146,12 +1181,18 @@ declare class Wav2Vec2Inference {
 interface Wav2ArkitCpuConfig {
     /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
     modelUrl: string;
-    /** Path or URL to the external data file (.onnx.data weights file) */
-    modelDataUrl?: string;
+    /**
+     * Path or URL to external model data file (.onnx.data weights).
+     * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
+     *
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    externalDataUrl?: string | false;
     /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
     backend?: BackendPreference;
 }
 declare class Wav2ArkitCpuInference implements LipSyncBackend {
+    readonly modelId: "wav2arkit_cpu";
     private session;
     private ort;
     private config;
@@ -1161,12 +1202,6 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
     constructor(config: Wav2ArkitCpuConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
-    /**
-     * Preferred chunk size: 4000 samples (250ms at 16kHz).
-     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
-     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
-     */
-    readonly chunkSamples = 4000;
     /**
      * Load the ONNX model
      */
@@ -1195,7 +1230,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
  * Factory function for lip sync with automatic GPU/CPU model selection
  *
  * Provides a unified API that automatically selects the optimal model:
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
  * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
  * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
  *
@@ -1230,10 +1265,15 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
 interface CreateLipSyncConfig {
     /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
     gpuModelUrl: string;
+    /**
+     * URL for GPU model external data file (.onnx.data weights).
+     * Default: `${gpuModelUrl}.data`
+     *
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    gpuExternalDataUrl?: string | false;
     /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
     cpuModelUrl: string;
-    /** URL for the CPU model's external data file (.onnx.data weights) */
-    cpuModelDataUrl?: string;
     /**
      * Model selection mode:
      * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1388,6 +1428,7 @@ declare class SileroVADInference {
     private inferenceQueue;
     private preSpeechBuffer;
     private wasSpeaking;
+    private srTensor;
     constructor(config: SileroVADConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
@@ -4086,4 +4127,4 @@ declare class EmphasisDetector {
     reset(): void;
 }
-export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type 
SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };
+export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type 
SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };

package/dist/index.d.ts CHANGED

@@ -27,11 +27,19 @@ declare class MicrophoneCapture {
     private buffer;
     private _isRecording;
     private _loggedFirstChunk;
+    /** Actual AudioContext sample rate (may differ from target on Firefox) */
+    private _nativeSampleRate;
     constructor(events: EventEmitter<OmoteEvents>, config?: MicrophoneCaptureConfig);
     get isRecording(): boolean;
     get isSupported(): boolean;
     start(): Promise<void>;
     stop(): void;
+    /**
+     * Resample audio using linear interpolation.
+     * Used when the AudioContext runs at the device's native rate (e.g. 48kHz)
+     * and we need to downsample to the target rate (e.g. 16kHz).
+     */
+    private resample;
     private floatToPCM16;
 }
@@ -98,12 +106,11 @@ interface AudioSchedulerOptions {
     /** Number of audio channels (default: 1 for mono) */
     channels?: number;
     /**
-     * Delay before first audio chunk plays (seconds).
-     * Gives slow inference backends (WASM) a head start so lip sync
-     * frames are ready by the time audio reaches the listener.
-     * Default: 0.05 (50ms — just enough to enqueue the first node)
+     * Initial lookahead delay in seconds before first audio plays.
+     * Gives LAM inference time to compute blendshapes before audio starts.
+     * Default: 0.05 (50ms) for WebGPU, increase to 0.3-0.5 for WASM on iOS.
      */
-    initialDelayS?: number;
+    initialLookaheadSec?: number;
 }
 declare class AudioScheduler {
     private readonly options;
@@ -373,13 +380,14 @@ declare function isSafari(): boolean;
 /**
  * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
  *
- * All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
- * have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
- * 384MB LAM model stack-overflows in WASM mode.
- * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
- * output at 22x real-time on CPU/WASM.
+ * All iOS browsers use WebKit and have tight memory limits — the 384MB
+ * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
+ * (ORT fetches the 402MB weights directly into WASM, no JS heap copy).
+ *
+ * macOS Safari also needs this due to ONNX Runtime JSEP/ASYNCIFY bugs
+ * that crash WebKit's JIT compiler.
  *
- * @returns true if on Safari or any iOS browser (should use CPU lip sync model)
+ * @returns true if iOS (any browser) or Safari (any platform)
  */
 declare function shouldUseCpuLipSync(): boolean;
 /**
@@ -400,7 +408,7 @@ declare function isSpeechRecognitionAvailable(): boolean;
  * - Battery-efficient (no WASM overhead)
  * - No model download needed (saves 30-150MB)
  *
- * @returns true if on iOS with Speech API available
+ * @returns true if on iOS or Safari with Speech API available
  */
 declare function shouldUseNativeASR(): boolean;
 /**
@@ -419,7 +427,7 @@ declare function shouldUseServerLipSync(): boolean;
 /**
  * Common interface for lip sync inference backends
  *
- * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
  * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
  * work with either model transparently.
  *
@@ -454,19 +462,15 @@ interface LipSyncResult {
  *
  * Implemented by:
  * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
- * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ * - Wav2ArkitCpuInference (WASM-only, 404MB, lip sync only)
  */
 interface LipSyncBackend {
+    /** Model identifier for backend-specific tuning (e.g. audio delay) */
+    readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
     /** Current backend type (webgpu, wasm, or null if not loaded) */
     readonly backend: RuntimeBackend | null;
     /** Whether the model is loaded and ready for inference */
     readonly isLoaded: boolean;
-    /**
-     * Preferred number of audio samples per inference chunk.
-     * Models with variable-length input can use smaller values for lower latency.
-     * Default (if undefined): 16000 (1.0s at 16kHz, required by Wav2Vec2).
-     */
-    readonly chunkSamples?: number;
     /**
      * Load the ONNX model
      * @returns Model loading information
@@ -529,7 +533,7 @@ interface LAMPipelineOptions {
 }
 declare class LAMPipeline {
     private readonly options;
-    private readonly DEFAULT_CHUNK_SAMPLES;
+    private readonly REQUIRED_SAMPLES;
     private readonly FRAME_RATE;
     private buffer;
     private bufferStartTime;
@@ -558,13 +562,15 @@ declare class LAMPipeline {
     /**
      * Get the frame that should be displayed at the current time
      *
-     * Timestamp-synced playback for all backends. Audio playback is delayed
-     * for slow backends (WASM gets 1s head start via AudioScheduler) so
-     * frames are ready by the time their corresponding audio plays.
+     * Automatically removes frames that have already been displayed.
+     * This prevents memory leaks from accumulating old frames.
+     *
+     * Discard Window (prevents premature frame discarding):
+     * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
+     * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
      *
-     * Discard window is generous for WASM to handle inference jitter.
-     * Late frames play at RAF rate (~60fps) until caught up, then settle
-     * to natural 30fps pacing via timestamp gating.
+     * Last-Frame-Hold: Returns last valid frame instead of null to prevent
+     * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
      *
      * @param currentTime - Current AudioContext time
      * @param lam - LAM inference engine (optional, for backend detection)
@@ -592,7 +598,7 @@ declare class LAMPipeline {
     /**
      * Flush remaining buffered audio
      *
-     * Processes any remaining audio in the buffer, even if less than the chunk size.
+     * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
      * This ensures the final audio chunk generates blendshape frames.
      *
      * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
@@ -645,6 +651,12 @@ interface SyncedAudioPipelineOptions {
     chunkTargetMs?: number;
     /** LAM inference engine */
     lam: LipSyncBackend;
+    /**
+     * Audio playback delay in ms before first audio plays.
+     * Gives LAM inference time to pre-compute blendshapes.
+     * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
+     */
+    audioDelayMs?: number;
 }
 interface SyncedAudioPipelineEvents {
     /** New frame ready for display */
@@ -838,6 +850,17 @@ declare function getLoadedBackend(): RuntimeBackend | null;
  * Check if ONNX Runtime has been loaded
  */
 declare function isOnnxRuntimeLoaded(): boolean;
+/**
+ * Preload ONNX Runtime and compile the WASM binary early
+ *
+ * Call this before loading heavy resources (Three.js, VRM models) to ensure
+ * WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
+ * Uses the singleton pattern — subsequent model loading reuses this instance.
+ *
+ * @param preference Backend preference (default: 'auto')
+ * @returns The resolved backend that was loaded
+ */
+declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
 /**
  * Whisper Automatic Speech Recognition using transformers.js
@@ -1036,6 +1059,13 @@ type InferenceBackend = BackendPreference;
 interface Wav2Vec2InferenceConfig {
     /** Path or URL to the ONNX model */
     modelUrl: string;
+    /**
+     * Path or URL to external model data file (.onnx.data weights).
+     * Default: `${modelUrl}.data` (e.g., /models/model.onnx.data)
+     *
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    externalDataUrl?: string | false;
     /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
     backend?: InferenceBackend;
     /** Number of identity classes (default: 12 for streaming model) */
@@ -1066,7 +1096,8 @@ interface Wav2Vec2Result {
     /** Inference time in ms */
     inferenceTimeMs: number;
 }
-declare class Wav2Vec2Inference {
+declare class Wav2Vec2Inference implements LipSyncBackend {
+    readonly modelId: "wav2vec2";
     private session;
     private ort;
     private config;
@@ -1116,12 +1147,16 @@ declare class Wav2Vec2Inference {
 /**
  * CPU-optimized lip sync inference using wav2arkit_cpu model
  *
- * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
- * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ * A Safari/iOS-compatible alternative to Wav2Vec2Inference (384MB) designed
+ * for platforms where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * The model uses ONNX external data format:
+ * - wav2arkit_cpu.onnx (1.86MB graph structure)
+ * - wav2arkit_cpu.onnx.data (402MB weights)
+ * Both files are fetched and cached automatically.
  *
  * Key differences from Wav2Vec2Inference:
- * - WASM-only backend (CPU-optimized, no WebGPU)
- * - 1.8MB model vs 384MB
+ * - WASM-only backend (CPU-optimized, single-threaded, no WebGPU)
  * - No identity input (baked to identity 11)
  * - No ASR output (lip sync only)
  * - Dynamic input length (not fixed to 16000 samples)
@@ -1146,12 +1181,18 @@ declare class Wav2Vec2Inference {
 interface Wav2ArkitCpuConfig {
     /** Path or URL to the wav2arkit_cpu ONNX model (.onnx graph file) */
     modelUrl: string;
-    /** Path or URL to the external data file (.onnx.data weights file) */
-    modelDataUrl?: string;
+    /**
+     * Path or URL to external model data file (.onnx.data weights).
+     * Default: `${modelUrl}.data` (e.g., /models/wav2arkit_cpu.onnx.data)
+     *
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    externalDataUrl?: string | false;
     /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
     backend?: BackendPreference;
 }
 declare class Wav2ArkitCpuInference implements LipSyncBackend {
+    readonly modelId: "wav2arkit_cpu";
     private session;
     private ort;
     private config;
@@ -1161,12 +1202,6 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
     constructor(config: Wav2ArkitCpuConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
-    /**
-     * Preferred chunk size: 4000 samples (250ms at 16kHz).
-     * wav2arkit_cpu accepts variable-length input, so we use smaller chunks
-     * for lower latency on WASM (vs 16000 for Wav2Vec2's fixed requirement).
-     */
-    readonly chunkSamples = 4000;
     /**
      * Load the ONNX model
      */
@@ -1195,7 +1230,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
  * Factory function for lip sync with automatic GPU/CPU model selection
  *
  * Provides a unified API that automatically selects the optimal model:
- * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
  * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
  * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
  *
@@ -1230,10 +1265,15 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
 interface CreateLipSyncConfig {
     /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
     gpuModelUrl: string;
+    /**
+     * URL for GPU model external data file (.onnx.data weights).
+     * Default: `${gpuModelUrl}.data`
+     *
+     * Set to `false` to skip external data loading (single-file models only).
+     */
+    gpuExternalDataUrl?: string | false;
     /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
     cpuModelUrl: string;
-    /** URL for the CPU model's external data file (.onnx.data weights) */
-    cpuModelDataUrl?: string;
     /**
      * Model selection mode:
      * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
@@ -1388,6 +1428,7 @@ declare class SileroVADInference {
     private inferenceQueue;
     private preSpeechBuffer;
     private wasSpeaking;
+    private srTensor;
     constructor(config: SileroVADConfig);
     get backend(): RuntimeBackend | null;
     get isLoaded(): boolean;
@@ -4086,4 +4127,4 @@ declare class EmphasisDetector {
     reset(): void;
 }
-export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type 
SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };
+export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type 
SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };