@omote/core 0.1.2 → 0.2.0

package/dist/index.d.mts CHANGED
@@ -112,6 +112,17 @@ declare class AudioScheduler {
  * to avoid browser autoplay policy issues (requires user gesture).
  */
  initialize(): Promise<void>;
+ /**
+ * Eagerly create and warm up the AudioContext
+ *
+ * Call this when a playback session starts (e.g., when AI response begins).
+ * The AudioContext needs time to initialize the audio hardware — on Windows
+ * this can take 50-100ms. By warming up early (before audio data arrives),
+ * the context is fully ready when schedule() is first called.
+ *
+ * Must be called after a user gesture (click/tap) for autoplay policy.
+ */
+ warmup(): Promise<void>;
  /**
  * Ensure AudioContext is created and ready
  * Called lazily on first schedule() - requires user gesture
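For reference, a minimal usage sketch of the new warmup() method. The scheduler instance and the `#play` button are assumptions for illustration, not part of this diff:

```typescript
import type { AudioScheduler } from '@omote/core';

// Sketch only: assumes an AudioScheduler instance created elsewhere in the app.
declare const scheduler: AudioScheduler;

document.querySelector('#play')?.addEventListener('click', async () => {
  await scheduler.warmup(); // runs inside a user gesture to satisfy autoplay policy
  // ...start streaming audio; the AudioContext is already warm when schedule() fires
});
```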
@@ -157,6 +168,7 @@ declare class AudioScheduler {
  cancelAll(fadeOutMs?: number): Promise<void>;
  /**
  * Reset scheduler state for new playback session
+ * Stops any orphaned sources that weren't cleaned up by cancelAll()
  */
  reset(): void;
  /**
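A small sketch of how cancelAll() and the updated reset() might be combined when tearing down a session; the scheduler instance and the 50 ms fade value are illustrative assumptions:

```typescript
import type { AudioScheduler } from '@omote/core';

// Sketch only: `scheduler` is an existing AudioScheduler instance.
declare const scheduler: AudioScheduler;

async function stopSession(): Promise<void> {
  await scheduler.cancelAll(50); // fade out current sources (~50ms, illustrative)
  scheduler.reset();             // also stops any orphaned sources left behind
}
```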
@@ -342,6 +354,26 @@ declare function getOptimalWasmThreads(): number;
  * @returns true if proxy mode is safe to enable
  */
  declare function shouldEnableWasmProxy(): boolean;
+ /**
+ * Detect Safari browser on any platform (macOS + iOS)
+ *
+ * Safari WebKit has bugs with ONNX Runtime's WebGPU multithreaded JSEP build
+ * that crash session creation. Both iOS and macOS Safari are affected.
+ *
+ * @returns true if running in Safari on any platform
+ */
+ declare function isSafari(): boolean;
+ /**
+ * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
+ *
+ * Safari (macOS + iOS) has WebGPU JSEP bugs that crash ONNX Runtime,
+ * and the 384MB LAM model stack-overflows in WASM mode.
+ * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
+ * output at 22x real-time on CPU/WASM.
+ *
+ * @returns true if on Safari (should use CPU lip sync model)
+ */
+ declare function shouldUseCpuLipSync(): boolean;
  /**
  * Check if Web Speech API is available in the browser
  *
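A hedged sketch of how the new detection helpers might be used to pick a model; the model paths are the ones used in the package's own examples:

```typescript
import { isSafari, shouldUseCpuLipSync } from '@omote/core';

// Pick the lip sync model based on platform capability.
const modelUrl = shouldUseCpuLipSync()
  ? '/models/wav2arkit_cpu.onnx'             // 1.8MB CPU model (Safari/iOS)
  : '/models/unified_wav2vec2_asr_a2e.onnx'; // 384MB WebGPU model (other browsers)

console.log(`Safari: ${isSafari()}, using model: ${modelUrl}`);
```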
@@ -377,200 +409,62 @@ declare function shouldUseNativeASR(): boolean;
  declare function shouldUseServerLipSync(): boolean;

  /**
- * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ * Common interface for lip sync inference backends
  *
- * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
- * based on the platform's capabilities. This is critical for iOS support because:
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
+ * work with either model transparently.
  *
- * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
- * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
- * 3. WASM-only bundle is smaller and more reliable on iOS
- *
- * Usage:
- * ```typescript
- * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
- * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
- * ```
- *
- * @module inference/onnxLoader
+ * @category Inference
  */

- type OrtModule = {
- InferenceSession: typeof InferenceSession;
- Tensor: typeof Tensor;
- env: Env;
- };
- type SessionOptions = InferenceSession.SessionOptions;
-
- /**
- * Check if WebGPU is available and likely to work
- *
- * This is more thorough than just checking navigator.gpu exists.
- * It actually requests an adapter to verify the GPU is accessible.
- *
- * @returns true if WebGPU is available and working
- */
- declare function isWebGPUAvailable(): Promise<boolean>;
- /**
- * Load ONNX Runtime with the specified backend
- *
- * This lazily loads the appropriate bundle:
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
- *
- * Once loaded, the same instance is reused for all subsequent calls.
- * If you need to switch backends, you must reload the page.
- *
- * @param backend The backend to load ('webgpu' or 'wasm')
- * @returns The ONNX Runtime module
- */
- declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
- /**
- * Get the appropriate ONNX Runtime based on user preference
- *
- * This resolves the user's preference against platform capabilities
- * and loads the appropriate bundle.
- *
- * @param preference User's backend preference
- * @returns The ONNX Runtime module and the resolved backend
- */
- declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
- ort: OrtModule;
- backend: RuntimeBackend;
- }>;
  /**
- * Get session options for creating an inference session
- *
- * This returns optimized session options based on the backend and platform.
- *
- * @param backend The backend being used
- * @returns Session options for InferenceSession.create()
+ * Model loading information returned by load()
  */
- declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
- /**
- * Create an inference session with automatic fallback
- *
- * If WebGPU session creation fails, automatically falls back to WASM.
- *
- * @param modelBuffer The model data as ArrayBuffer
- * @param preferredBackend The preferred backend
- * @returns The created session and the backend used
- */
- declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
- session: InferenceSession;
+ interface LipSyncModelInfo {
  backend: RuntimeBackend;
- }>;
- /**
- * Get the currently loaded backend (if any)
- */
- declare function getLoadedBackend(): RuntimeBackend | null;
- /**
- * Check if ONNX Runtime has been loaded
- */
- declare function isOnnxRuntimeLoaded(): boolean;
-
- /**
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
- *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs:
- * - 52 ARKit blendshapes (lip sync)
- * - 32-token CTC logits (speech recognition)
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { Wav2Vec2Inference } from '@omote/core';
- *
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
- * await wav2vec.load();
- *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await wav2vec.infer(audioSamples);
- *
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * console.log('ASR text:', result.text); // Decoded transcription
- * ```
- */
-
- type InferenceBackend = BackendPreference;
- interface Wav2Vec2InferenceConfig {
- /** Path or URL to the ONNX model */
- modelUrl: string;
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
- backend?: InferenceBackend;
- /** Number of identity classes (default: 12 for streaming model) */
- numIdentityClasses?: number;
- }
- interface ModelInfo {
- backend: 'webgpu' | 'wasm';
  loadTimeMs: number;
  inputNames: string[];
  outputNames: string[];
  }
  /**
- * LAM model blendshape names in order (52 total)
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ * Result from lip sync inference
+ *
+ * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
+ * Models with different native orderings must remap internally before returning.
  */
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
- /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
- declare const CTC_VOCAB: string[];
- interface Wav2Vec2Result {
- /** Blendshape weights [frames, 52] - 30fps */
+ interface LipSyncResult {
+ /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
  blendshapes: Float32Array[];
- /** Raw CTC logits [frames, 32] - 50fps */
- asrLogits: Float32Array[];
- /** Decoded text from CTC */
- text: string;
- /** Number of A2E frames (30fps) */
- numA2EFrames: number;
- /** Number of ASR frames (50fps) */
- numASRFrames: number;
+ /** Number of blendshape frames */
+ numFrames: number;
  /** Inference time in ms */
  inferenceTimeMs: number;
  }
- declare class Wav2Vec2Inference {
- private session;
- private ort;
- private config;
- private _backend;
- private isLoading;
- private numIdentityClasses;
- private inferenceQueue;
- constructor(config: Wav2Vec2InferenceConfig);
- /**
- * Check if WebGPU is available and working
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
- */
- static isWebGPUAvailable: typeof isWebGPUAvailable;
- get backend(): 'webgpu' | 'wasm' | null;
- get isLoaded(): boolean;
+ /**
+ * Common interface for lip sync inference engines
+ *
+ * Implemented by:
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
+ * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ */
+ interface LipSyncBackend {
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
+ readonly backend: RuntimeBackend | null;
+ /** Whether the model is loaded and ready for inference */
+ readonly isLoaded: boolean;
  /**
  * Load the ONNX model
+ * @returns Model loading information
  */
- load(): Promise<ModelInfo>;
+ load(): Promise<LipSyncModelInfo>;
  /**
  * Run inference on raw audio
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
- *
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
- * Audio will be zero-padded or truncated to 16000 samples.
+ * @param audioSamples - Float32Array of raw audio at 16kHz
+ * @param identityIndex - Optional identity index (ignored by CPU model)
+ * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
  */
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
- /**
- * Decode CTC logits to text using greedy decoding
- */
- private decodeCTC;
- /**
- * Queue inference to serialize ONNX session calls
- */
- private queueInference;
- /**
- * Get blendshape value by name for a specific frame
- */
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
  /**
  * Dispose of the model and free resources
  */
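A brief sketch of consuming the new LipSyncBackend interface generically; the `audio` buffer is an assumed input, and either inference class can be passed in:

```typescript
import type { LipSyncBackend } from '@omote/core';

// Works with Wav2Vec2Inference or Wav2ArkitCpuInference; both implement LipSyncBackend.
async function runLipSync(lam: LipSyncBackend, audio: Float32Array) {
  if (!lam.isLoaded) {
    const info = await lam.load();
    console.log(`model loaded on ${info.backend} in ${info.loadTimeMs}ms`);
  }
  const { blendshapes, numFrames, inferenceTimeMs } = await lam.infer(audio);
  console.log(`${numFrames} frames in ${inferenceTimeMs}ms`);
  return blendshapes; // Float32Array[] in LAM_BLENDSHAPES order, 30fps
}
```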
@@ -642,7 +536,7 @@ declare class LAMPipeline {
  * @param timestamp - AudioContext time when these samples start playing
  * @param lam - LAM inference engine
  */
- push(samples: Float32Array, timestamp: number, lam: Wav2Vec2Inference): Promise<void>;
+ push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
  /**
  * Process accumulated buffer through LAM inference
  */
@@ -693,7 +587,7 @@ declare class LAMPipeline {
  *
  * @param lam - LAM inference engine
  */
- flush(lam: Wav2Vec2Inference): Promise<void>;
+ flush(lam: LipSyncBackend): Promise<void>;
  /**
  * Adjust all queued frame timestamps by an offset
  *
@@ -710,25 +604,25 @@ declare class LAMPipeline {
  }

  /**
- * SyncedAudioPipeline - Enterprise-grade audio + LAM synchronization coordinator
+ * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
  *
  * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
  * 1. Network chunks → Coalescer → Optimized buffers
- * 2. Audio buffers → Scheduler → Gapless playback
- * 3. Audio buffers → LAM Pipeline → Blendshape frames
+ * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
+ * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
  * 4. Frames synchronized to AudioContext clock → Renderer
  *
- * Key Architecture Pattern: Wait-for-First-LAM
- * - Buffers incoming audio chunks without scheduling playback
- * - Waits for first LAM inference to complete (ensures LAM frames are ready)
- * - Then schedules all buffered audio + LAM frames together
- * - Result: Perfect synchronization from frame 1, no lag compensation needed
+ * Key Architecture Pattern: Audio-First, LAM-Background
+ * - Audio chunks are scheduled for playback immediately (never waits for LAM)
+ * - LAM inference runs in background without blocking the audio path
+ * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
+ * - Once LAM catches up, frames stay synchronized to AudioContext clock
  *
- * This is a deterministic, enterprise-grade solution suitable for production use.
- * No hacks, no lag detection, no frame skipping - just guaranteed synchronization.
+ * This decoupled design prevents LAM inference (50-300ms) from blocking audio
+ * scheduling, which caused audible stuttering when audio arrived as a continuous
+ * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
  *
  * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
  * @category Audio
  */

@@ -738,14 +632,14 @@ interface SyncedAudioPipelineOptions {
  /** Target chunk duration in ms for coalescing (default: 200) */
  chunkTargetMs?: number;
  /** LAM inference engine */
- lam: Wav2Vec2Inference;
+ lam: LipSyncBackend;
  }
  interface SyncedAudioPipelineEvents {
  /** New frame ready for display */
  frame_ready: Float32Array;
  /** Playback has completed */
  playback_complete: void;
- /** First LAM inference completed, playback starting */
+ /** First audio chunk scheduled, playback starting */
  playback_start: number;
  /** Error occurred */
  error: Error;
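A sketch of driving the audio-first pipeline. The pipeline instance, the WebSocket transport, the renderer hook, and the assumption that the EventEmitter base class exposes an on() method are all illustrative, not taken from this diff:

```typescript
import type { SyncedAudioPipeline } from '@omote/core';

// Sketch only: `pipeline` is constructed elsewhere with a LipSyncBackend,
// and `socket` delivers Int16 PCM chunks (hypothetical transport).
declare const pipeline: SyncedAudioPipeline;
declare const socket: WebSocket;
declare function applyBlendshapes(frame: Float32Array): void; // app-specific renderer hook

pipeline.on('frame_ready', (frame) => applyBlendshapes(frame)); // assumes EventEmitter#on
pipeline.start();
socket.addEventListener('message', (e) => {
  // Each chunk is scheduled immediately; LAM inference runs in the background.
  void pipeline.onAudioChunk(new Uint8Array(e.data as ArrayBuffer));
});
```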
@@ -757,8 +651,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  private scheduler;
  private coalescer;
  private lamPipeline;
- private waitingForFirstLAM;
- private bufferedChunks;
+ private playbackStarted;
  private monitorInterval;
  private frameAnimationId;
  constructor(options: SyncedAudioPipelineOptions);
@@ -770,31 +663,19 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  * Start a new playback session
  *
  * Resets all state and prepares for incoming audio chunks.
- * Enables wait-for-first-LAM synchronization.
+ * Audio will be scheduled immediately as chunks arrive (no buffering).
  */
  start(): void;
  /**
  * Receive audio chunk from network
  *
- * Implements wait-for-first-LAM pattern:
- * - Chunks are coalesced into optimal buffers
- * - Buffers are sent to LAM for processing
- * - Audio scheduling waits until first LAM completes
- * - Then all buffered audio is scheduled together with LAM frames
+ * Audio-first design: schedules audio immediately, LAM runs in background.
+ * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+ * which caused audible stuttering with continuous audio streams.
  *
  * @param chunk - Uint8Array containing Int16 PCM audio
  */
  onAudioChunk(chunk: Uint8Array): Promise<void>;
- /**
- * Handle first LAM inference completion
- *
- * This is the critical synchronization point:
- * - LAM frames are now ready in the queue
- * - Schedule all buffered audio chunks
- * - Adjust LAM frame timestamps to match actual schedule time
- * - Audio and LAM start playing together, perfectly synchronized
- */
- private onFirstLAMComplete;
  /**
  * End of audio stream
  *
@@ -840,8 +721,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  * Get current pipeline state (for debugging/monitoring)
  */
  getState(): {
- waitingForFirstLAM: boolean;
- bufferedChunks: number;
+ playbackStarted: boolean;
  coalescerFill: number;
  lamFill: number;
  queuedFrames: number;
@@ -854,6 +734,99 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  dispose(): void;
  }

+ /**
+ * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ *
+ * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
+ * based on the platform's capabilities. This is critical for iOS support because:
+ *
+ * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
+ * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
+ * 3. WASM-only bundle is smaller and more reliable on iOS
+ *
+ * Usage:
+ * ```typescript
+ * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
+ * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
+ * ```
+ *
+ * @module inference/onnxLoader
+ */
+
+ type OrtModule = {
+ InferenceSession: typeof InferenceSession;
+ Tensor: typeof Tensor;
+ env: Env;
+ };
+ type SessionOptions = InferenceSession.SessionOptions;
+
+ /**
+ * Check if WebGPU is available and likely to work
+ *
+ * This is more thorough than just checking navigator.gpu exists.
+ * It actually requests an adapter to verify the GPU is accessible.
+ *
+ * @returns true if WebGPU is available and working
+ */
+ declare function isWebGPUAvailable(): Promise<boolean>;
+ /**
+ * Load ONNX Runtime with the specified backend
+ *
+ * This lazily loads the appropriate bundle:
+ * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
+ * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
+ *
+ * Once loaded, the same instance is reused for all subsequent calls.
+ * If you need to switch backends, you must reload the page.
+ *
+ * @param backend The backend to load ('webgpu' or 'wasm')
+ * @returns The ONNX Runtime module
+ */
+ declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
+ /**
+ * Get the appropriate ONNX Runtime based on user preference
+ *
+ * This resolves the user's preference against platform capabilities
+ * and loads the appropriate bundle.
+ *
+ * @param preference User's backend preference
+ * @returns The ONNX Runtime module and the resolved backend
+ */
+ declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
+ ort: OrtModule;
+ backend: RuntimeBackend;
+ }>;
+ /**
+ * Get session options for creating an inference session
+ *
+ * This returns optimized session options based on the backend and platform.
+ *
+ * @param backend The backend being used
+ * @returns Session options for InferenceSession.create()
+ */
+ declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
+ /**
+ * Create an inference session with automatic fallback
+ *
+ * If WebGPU session creation fails, automatically falls back to WASM.
+ *
+ * @param modelBuffer The model data as ArrayBuffer
+ * @param preferredBackend The preferred backend
+ * @returns The created session and the backend used
+ */
+ declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
+ session: InferenceSession;
+ backend: RuntimeBackend;
+ }>;
+ /**
+ * Get the currently loaded backend (if any)
+ */
+ declare function getLoadedBackend(): RuntimeBackend | null;
+ /**
+ * Check if ONNX Runtime has been loaded
+ */
+ declare function isOnnxRuntimeLoaded(): boolean;
+
  /**
  * Whisper Automatic Speech Recognition using transformers.js
  * Uses Xenova's proven pipeline API for reliable transcription
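For the relocated loader helpers, a hedged sketch of creating a session with automatic WASM fallback; the fetched model path is illustrative:

```typescript
import { createSessionWithFallback, isWebGPUAvailable } from '@omote/core';

// Sketch: fetch a model (path is illustrative) and create a session,
// letting createSessionWithFallback drop to WASM if WebGPU fails.
const buffer = await fetch('/models/wav2arkit_cpu.onnx').then((r) => r.arrayBuffer());
const preferred = (await isWebGPUAvailable()) ? 'webgpu' : 'wasm';
const { session, backend } = await createSessionWithFallback(buffer, preferred);
console.log(`session ready on ${backend}`, session.inputNames);
```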
@@ -982,6 +955,288 @@ declare class WhisperInference {
  private removeNonSpeechTokens;
  }

+ /**
+ * Shared blendshape constants and utilities for lip sync inference
+ *
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
+ *
+ * This module is the single source of truth for blendshape ordering to
+ * avoid circular dependencies between inference classes.
+ *
+ * @category Inference
+ */
+ /**
+ * LAM model blendshape names in order (52 total)
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ */
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+ /** Alias for backwards compatibility */
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+ /**
+ * Symmetrize blendshapes by averaging left/right pairs
+ * From LAM official postprocessing (models/utils.py)
+ * This fixes asymmetric output from the raw model
+ */
+ declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
+ /**
+ * wav2arkit_cpu model blendshape ordering
+ *
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
+ */
+ declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
+ /**
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ *
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ */
+ declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+
+ /**
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ *
+ * Runs entirely in the browser using WebGPU or WASM.
+ * Takes raw 16kHz audio and outputs:
+ * - 52 ARKit blendshapes (lip sync)
+ * - 32-token CTC logits (speech recognition)
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * import { Wav2Vec2Inference } from '@omote/core';
+ *
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
+ *
+ * // Process 1 second of audio (16kHz = 16000 samples)
+ * const result = await wav2vec.infer(audioSamples);
+ *
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
+ * console.log('ASR text:', result.text); // Decoded transcription
+ * ```
+ */
+
+ type InferenceBackend = BackendPreference;
+ interface Wav2Vec2InferenceConfig {
+ /** Path or URL to the ONNX model */
+ modelUrl: string;
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
+ backend?: InferenceBackend;
+ /** Number of identity classes (default: 12 for streaming model) */
+ numIdentityClasses?: number;
+ }
+ interface ModelInfo {
+ backend: 'webgpu' | 'wasm';
+ loadTimeMs: number;
+ inputNames: string[];
+ outputNames: string[];
+ }
+
+ /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
+ declare const CTC_VOCAB: string[];
+ interface Wav2Vec2Result {
+ /** Blendshape weights [frames, 52] - 30fps */
+ blendshapes: Float32Array[];
+ /** Raw CTC logits [frames, 32] - 50fps */
+ asrLogits: Float32Array[];
+ /** Decoded text from CTC */
+ text: string;
+ /** Number of blendshape frames (30fps) — alias for numA2EFrames */
+ numFrames: number;
+ /** Number of A2E frames (30fps) */
+ numA2EFrames: number;
+ /** Number of ASR frames (50fps) */
+ numASRFrames: number;
+ /** Inference time in ms */
+ inferenceTimeMs: number;
+ }
+ declare class Wav2Vec2Inference {
+ private session;
+ private ort;
+ private config;
+ private _backend;
+ private isLoading;
+ private numIdentityClasses;
+ private inferenceQueue;
+ constructor(config: Wav2Vec2InferenceConfig);
+ /**
+ * Check if WebGPU is available and working
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+ */
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
+ get backend(): 'webgpu' | 'wasm' | null;
+ get isLoaded(): boolean;
+ /**
+ * Load the ONNX model
+ */
+ load(): Promise<ModelInfo>;
+ /**
+ * Run inference on raw audio
+ * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
+ *
+ * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
+ * Audio will be zero-padded or truncated to 16000 samples.
+ */
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
+ /**
+ * Decode CTC logits to text using greedy decoding
+ */
+ private decodeCTC;
+ /**
+ * Queue inference to serialize ONNX session calls
+ */
+ private queueInference;
+ /**
+ * Get blendshape value by name for a specific frame
+ */
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+ /**
+ * Dispose of the model and free resources
+ */
+ dispose(): Promise<void>;
+ }
+
+ /**
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
+ *
+ * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
+ * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * Key differences from Wav2Vec2Inference:
+ * - WASM-only backend (CPU-optimized, no WebGPU)
+ * - 1.8MB model vs 384MB
+ * - No identity input (baked to identity 11)
+ * - No ASR output (lip sync only)
+ * - Dynamic input length (not fixed to 16000 samples)
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
+ *
+ * @category Inference
+ *
+ * @example
+ * ```typescript
+ * import { Wav2ArkitCpuInference } from '@omote/core';
+ *
+ * const lam = new Wav2ArkitCpuInference({
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ * await lam.load();
+ *
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
+ * ```
+ */
+
+ interface Wav2ArkitCpuConfig {
+ /** Path or URL to the wav2arkit_cpu ONNX model */
+ modelUrl: string;
+ /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
+ backend?: BackendPreference;
+ }
+ declare class Wav2ArkitCpuInference implements LipSyncBackend {
+ private session;
+ private ort;
+ private config;
+ private _backend;
+ private isLoading;
+ private inferenceQueue;
+ constructor(config: Wav2ArkitCpuConfig);
+ get backend(): RuntimeBackend | null;
+ get isLoaded(): boolean;
+ /**
+ * Load the ONNX model
+ */
+ load(): Promise<LipSyncModelInfo>;
+ /**
+ * Run inference on raw audio
+ *
+ * Accepts variable-length audio (not fixed to 16000 samples).
+ * Output frames = ceil(30 * numSamples / 16000).
+ *
+ * @param audioSamples - Float32Array of raw audio at 16kHz
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
+ */
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+ /**
+ * Queue inference to serialize ONNX session calls
+ */
+ private queueInference;
+ /**
+ * Dispose of the model and free resources
+ */
+ dispose(): Promise<void>;
+ }
+
+ /**
+ * Factory function for lip sync with automatic GPU/CPU model selection
+ *
+ * Provides a unified API that automatically selects the optimal model:
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ *
+ * @category Inference
+ *
+ * @example Auto-detect (recommended)
+ * ```typescript
+ * import { createLipSync } from '@omote/core';
+ *
+ * const lam = createLipSync({
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ *
+ * await lam.load();
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * ```
+ *
+ * @example Force CPU model
+ * ```typescript
+ * const lam = createLipSync({
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * mode: 'cpu',
+ * });
+ * ```
+ */
+
+ /**
+ * Configuration for the lip sync factory
+ */
+ interface CreateLipSyncConfig {
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
+ gpuModelUrl: string;
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
+ cpuModelUrl: string;
+ /**
+ * Model selection mode:
+ * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+ * - 'gpu': Force GPU model (Wav2Vec2Inference)
+ * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
+ */
+ mode?: 'auto' | 'gpu' | 'cpu';
+ /** Backend preference for GPU model (default: 'auto') */
+ gpuBackend?: BackendPreference;
+ /** Number of identity classes for GPU model (default: 12) */
+ numIdentityClasses?: number;
+ /**
+ * Fall back to CPU model if GPU model fails to load (default: true)
+ * Only applies when mode is 'auto' or 'gpu'
+ */
+ fallbackOnError?: boolean;
+ }
+ /**
+ * Create a lip sync instance with automatic GPU/CPU model selection
+ *
+ * @param config - Factory configuration
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
+ */
+ declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+
  /**
  * Silero VAD (Voice Activity Detection) inference
  *
@@ -3809,4 +4064,4 @@ declare class EmphasisDetector {
  reset(): void;
  }

- export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };
+ export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };
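To close, a hedged sketch of the newly exported blendshape utilities; `frame` stands in for one Float32Array(52) taken from LipSyncResult.blendshapes:

```typescript
import { LAM_BLENDSHAPES, symmetrizeBlendshapes } from '@omote/core';

// Sketch only: `frame` is one Float32Array(52) from LipSyncResult.blendshapes,
// already in LAM_BLENDSHAPES (alphabetical) order.
declare const frame: Float32Array;

const symmetric = symmetrizeBlendshapes(frame);               // average left/right pairs
const jawOpen = symmetric[LAM_BLENDSHAPES.indexOf('jawOpen')]; // look up a value by name
console.log('jawOpen weight:', jawOpen);
```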