@omote/core 0.1.3 → 0.2.1

package/dist/index.d.mts CHANGED
@@ -112,6 +112,17 @@ declare class AudioScheduler {
  * to avoid browser autoplay policy issues (requires user gesture).
  */
  initialize(): Promise<void>;
+ /**
+ * Eagerly create and warm up the AudioContext
+ *
+ * Call this when a playback session starts (e.g., when AI response begins).
+ * The AudioContext needs time to initialize the audio hardware — on Windows
+ * this can take 50-100ms. By warming up early (before audio data arrives),
+ * the context is fully ready when schedule() is first called.
+ *
+ * Must be called after a user gesture (click/tap) for autoplay policy.
+ */
+ warmup(): Promise<void>;
  /**
  * Ensure AudioContext is created and ready
  * Called lazily on first schedule() - requires user gesture
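The new warmup() method is easiest to see in context. A minimal sketch, assuming an AudioScheduler instance is already constructed elsewhere and that `startButton` and `beginAiResponse` stand in for the host app's own UI element and streaming logic:

```typescript
import { AudioScheduler } from '@omote/core';

declare const scheduler: AudioScheduler;        // constructed elsewhere with AudioSchedulerOptions
declare const startButton: HTMLButtonElement;   // hypothetical UI element
declare function beginAiResponse(): void;       // hypothetical app function that starts streaming TTS

startButton.addEventListener('click', async () => {
  // Runs inside a user gesture, so autoplay policy is satisfied; the AudioContext
  // is warmed up before any audio data arrives, and schedule() never waits for it.
  await scheduler.warmup();
  beginAiResponse();
});
```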
@@ -157,6 +168,7 @@ declare class AudioScheduler {
  cancelAll(fadeOutMs?: number): Promise<void>;
  /**
  * Reset scheduler state for new playback session
+ * Stops any orphaned sources that weren't cleaned up by cancelAll()
  */
  reset(): void;
  /**
@@ -342,6 +354,27 @@ declare function getOptimalWasmThreads(): number;
  * @returns true if proxy mode is safe to enable
  */
  declare function shouldEnableWasmProxy(): boolean;
+ /**
+ * Detect Safari browser on any platform (macOS + iOS)
+ *
+ * Safari WebKit has bugs with ONNX Runtime's WebGPU multithreaded JSEP build
+ * that crash session creation. Both iOS and macOS Safari are affected.
+ *
+ * @returns true if running in Safari on any platform
+ */
+ declare function isSafari(): boolean;
+ /**
+ * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
+ *
+ * All WebKit browsers (Safari macOS, Safari iOS, Chrome iOS, Firefox iOS)
+ * have ONNX Runtime WebGPU JSEP bugs that crash session creation, and the
+ * 384MB LAM model stack-overflows in WASM mode.
+ * The wav2arkit_cpu model (1.8MB) provides identical 52 ARKit blendshape
+ * output at 22x real-time on CPU/WASM.
+ *
+ * @returns true if on Safari or any iOS browser (should use CPU lip sync model)
+ */
+ declare function shouldUseCpuLipSync(): boolean;
  /**
  * Check if Web Speech API is available in the browser
  *
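A short sketch of how the two new detection helpers can drive model selection; the model paths match the examples elsewhere in this diff but are otherwise placeholders:

```typescript
import { isSafari, shouldUseCpuLipSync } from '@omote/core';

// Pick the lip sync model: WebKit browsers get the 1.8MB CPU model,
// everything else gets the 384MB WebGPU model.
const lipSyncModelUrl = shouldUseCpuLipSync()
  ? '/models/wav2arkit_cpu.onnx'
  : '/models/unified_wav2vec2_asr_a2e.onnx';

if (isSafari()) {
  console.log('Safari detected; ONNX Runtime WebGPU JSEP crashes here, staying on WASM');
}
```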
@@ -377,200 +410,62 @@ declare function shouldUseNativeASR(): boolean;
  declare function shouldUseServerLipSync(): boolean;
 
  /**
- * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ * Common interface for lip sync inference backends
  *
- * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
- * based on the platform's capabilities. This is critical for iOS support because:
+ * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 1.8MB)
+ * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
+ * work with either model transparently.
  *
- * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
- * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
- * 3. WASM-only bundle is smaller and more reliable on iOS
- *
- * Usage:
- * ```typescript
- * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
- * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
- * ```
- *
- * @module inference/onnxLoader
+ * @category Inference
  */
 
- type OrtModule = {
- InferenceSession: typeof InferenceSession;
- Tensor: typeof Tensor;
- env: Env;
- };
- type SessionOptions = InferenceSession.SessionOptions;
-
- /**
- * Check if WebGPU is available and likely to work
- *
- * This is more thorough than just checking navigator.gpu exists.
- * It actually requests an adapter to verify the GPU is accessible.
- *
- * @returns true if WebGPU is available and working
- */
- declare function isWebGPUAvailable(): Promise<boolean>;
- /**
- * Load ONNX Runtime with the specified backend
- *
- * This lazily loads the appropriate bundle:
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
- *
- * Once loaded, the same instance is reused for all subsequent calls.
- * If you need to switch backends, you must reload the page.
- *
- * @param backend The backend to load ('webgpu' or 'wasm')
- * @returns The ONNX Runtime module
- */
- declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
- /**
- * Get the appropriate ONNX Runtime based on user preference
- *
- * This resolves the user's preference against platform capabilities
- * and loads the appropriate bundle.
- *
- * @param preference User's backend preference
- * @returns The ONNX Runtime module and the resolved backend
- */
- declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
- ort: OrtModule;
- backend: RuntimeBackend;
- }>;
  /**
- * Get session options for creating an inference session
- *
- * This returns optimized session options based on the backend and platform.
- *
- * @param backend The backend being used
- * @returns Session options for InferenceSession.create()
+ * Model loading information returned by load()
  */
- declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
- /**
- * Create an inference session with automatic fallback
- *
- * If WebGPU session creation fails, automatically falls back to WASM.
- *
- * @param modelBuffer The model data as ArrayBuffer
- * @param preferredBackend The preferred backend
- * @returns The created session and the backend used
- */
- declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
- session: InferenceSession;
+ interface LipSyncModelInfo {
  backend: RuntimeBackend;
- }>;
- /**
- * Get the currently loaded backend (if any)
- */
- declare function getLoadedBackend(): RuntimeBackend | null;
- /**
- * Check if ONNX Runtime has been loaded
- */
- declare function isOnnxRuntimeLoaded(): boolean;
-
- /**
- * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
- *
- * Runs entirely in the browser using WebGPU or WASM.
- * Takes raw 16kHz audio and outputs:
- * - 52 ARKit blendshapes (lip sync)
- * - 32-token CTC logits (speech recognition)
- *
- * @category Inference
- *
- * @example Basic usage
- * ```typescript
- * import { Wav2Vec2Inference } from '@omote/core';
- *
- * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
- * await wav2vec.load();
- *
- * // Process 1 second of audio (16kHz = 16000 samples)
- * const result = await wav2vec.infer(audioSamples);
- *
- * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
- * console.log('ASR text:', result.text); // Decoded transcription
- * ```
- */
-
- type InferenceBackend = BackendPreference;
- interface Wav2Vec2InferenceConfig {
- /** Path or URL to the ONNX model */
- modelUrl: string;
- /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
- backend?: InferenceBackend;
- /** Number of identity classes (default: 12 for streaming model) */
- numIdentityClasses?: number;
- }
- interface ModelInfo {
- backend: 'webgpu' | 'wasm';
  loadTimeMs: number;
  inputNames: string[];
  outputNames: string[];
  }
  /**
- * LAM model blendshape names in order (52 total)
- * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ * Result from lip sync inference
+ *
+ * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
+ * Models with different native orderings must remap internally before returning.
  */
- declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
- /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
- declare const CTC_VOCAB: string[];
- interface Wav2Vec2Result {
- /** Blendshape weights [frames, 52] - 30fps */
+ interface LipSyncResult {
+ /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
  blendshapes: Float32Array[];
- /** Raw CTC logits [frames, 32] - 50fps */
- asrLogits: Float32Array[];
- /** Decoded text from CTC */
- text: string;
- /** Number of A2E frames (30fps) */
- numA2EFrames: number;
- /** Number of ASR frames (50fps) */
- numASRFrames: number;
+ /** Number of blendshape frames */
+ numFrames: number;
  /** Inference time in ms */
  inferenceTimeMs: number;
  }
- declare class Wav2Vec2Inference {
- private session;
- private ort;
- private config;
- private _backend;
- private isLoading;
- private numIdentityClasses;
- private inferenceQueue;
- constructor(config: Wav2Vec2InferenceConfig);
- /**
- * Check if WebGPU is available and working
- * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
- */
- static isWebGPUAvailable: typeof isWebGPUAvailable;
- get backend(): 'webgpu' | 'wasm' | null;
- get isLoaded(): boolean;
+ /**
+ * Common interface for lip sync inference engines
+ *
+ * Implemented by:
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
+ * - Wav2ArkitCpuInference (WASM-only, 1.8MB, lip sync only)
+ */
+ interface LipSyncBackend {
+ /** Current backend type (webgpu, wasm, or null if not loaded) */
+ readonly backend: RuntimeBackend | null;
+ /** Whether the model is loaded and ready for inference */
+ readonly isLoaded: boolean;
  /**
  * Load the ONNX model
+ * @returns Model loading information
  */
- load(): Promise<ModelInfo>;
+ load(): Promise<LipSyncModelInfo>;
  /**
  * Run inference on raw audio
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
- * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
- *
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
- * Audio will be zero-padded or truncated to 16000 samples.
+ * @param audioSamples - Float32Array of raw audio at 16kHz
+ * @param identityIndex - Optional identity index (ignored by CPU model)
+ * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
  */
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
- /**
- * Decode CTC logits to text using greedy decoding
- */
- private decodeCTC;
- /**
- * Queue inference to serialize ONNX session calls
- */
- private queueInference;
- /**
- * Get blendshape value by name for a specific frame
- */
- getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
  /**
  * Dispose of the model and free resources
  */
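Because both engines now share the LipSyncBackend contract, calling code can stay model-agnostic. A sketch, relying on the statement above that the two classes work interchangeably behind this interface; model URLs mirror the examples later in this diff:

```typescript
import {
  Wav2ArkitCpuInference,
  Wav2Vec2Inference,
  shouldUseCpuLipSync,
  type LipSyncBackend,
  type LipSyncResult,
} from '@omote/core';

// Choose the engine once; the rest of the code only sees LipSyncBackend.
const lam: LipSyncBackend = shouldUseCpuLipSync()
  ? new Wav2ArkitCpuInference({ modelUrl: '/models/wav2arkit_cpu.onnx' })
  : new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });

async function runLipSync(backend: LipSyncBackend, audio: Float32Array): Promise<LipSyncResult> {
  if (!backend.isLoaded) {
    const info = await backend.load();
    console.log(`lip sync model ready on ${info.backend} in ${info.loadTimeMs}ms`);
  }
  // Blendshapes come back in LAM_BLENDSHAPES order regardless of the engine.
  return backend.infer(audio);
}

void runLipSync(lam, new Float32Array(16000)); // 1 second of silence, just to exercise the path
```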
@@ -642,7 +537,7 @@ declare class LAMPipeline {
  * @param timestamp - AudioContext time when these samples start playing
  * @param lam - LAM inference engine
  */
- push(samples: Float32Array, timestamp: number, lam: Wav2Vec2Inference): Promise<void>;
+ push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
  /**
  * Process accumulated buffer through LAM inference
  */
@@ -693,7 +588,7 @@ declare class LAMPipeline {
  *
  * @param lam - LAM inference engine
  */
- flush(lam: Wav2Vec2Inference): Promise<void>;
+ flush(lam: LipSyncBackend): Promise<void>;
  /**
  * Adjust all queued frame timestamps by an offset
  *
@@ -710,25 +605,25 @@ declare class LAMPipeline {
  }
 
  /**
- * SyncedAudioPipeline - Enterprise-grade audio + LAM synchronization coordinator
+ * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
  *
  * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
  * 1. Network chunks → Coalescer → Optimized buffers
- * 2. Audio buffers → Scheduler → Gapless playback
- * 3. Audio buffers → LAM Pipeline → Blendshape frames
+ * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
+ * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
  * 4. Frames synchronized to AudioContext clock → Renderer
  *
- * Key Architecture Pattern: Wait-for-First-LAM
- * - Buffers incoming audio chunks without scheduling playback
- * - Waits for first LAM inference to complete (ensures LAM frames are ready)
- * - Then schedules all buffered audio + LAM frames together
- * - Result: Perfect synchronization from frame 1, no lag compensation needed
+ * Key Architecture Pattern: Audio-First, LAM-Background
+ * - Audio chunks are scheduled for playback immediately (never waits for LAM)
+ * - LAM inference runs in background without blocking the audio path
+ * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
+ * - Once LAM catches up, frames stay synchronized to AudioContext clock
  *
- * This is a deterministic, enterprise-grade solution suitable for production use.
- * No hacks, no lag detection, no frame skipping - just guaranteed synchronization.
+ * This decoupled design prevents LAM inference (50-300ms) from blocking audio
+ * scheduling, which caused audible stuttering when audio arrived as a continuous
+ * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
  *
  * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern (Ring buffer patterns)
  * @category Audio
  */
 
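The audio-first flow reads most clearly as a wiring sketch. This assumes the package's EventEmitter exposes an `on(event, handler)` method and that `applyBlendshapes` and `ttsChunks` stand in for the host app's renderer and network source; everything else comes from declarations in this diff:

```typescript
import { SyncedAudioPipeline, createLipSync } from '@omote/core';

declare function applyBlendshapes(frame: Float32Array): void;  // hypothetical renderer hook
declare function ttsChunks(): AsyncIterable<Uint8Array>;       // hypothetical Int16 PCM source

const pipeline = new SyncedAudioPipeline({
  lam: createLipSync({
    gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
    cpuModelUrl: '/models/wav2arkit_cpu.onnx',
  }),
  chunkTargetMs: 200,
});

// Assumed EventEmitter API: on(event, handler).
pipeline.on('playback_start', (time: number) => console.log('first chunk scheduled at', time));
pipeline.on('frame_ready', (frame: Float32Array) => applyBlendshapes(frame));

async function play(): Promise<void> {
  pipeline.start();                       // reset state; chunks are scheduled immediately
  for await (const chunk of ttsChunks()) {
    await pipeline.onAudioChunk(chunk);   // audio goes out now, LAM inference trails in background
  }
}

void play();
```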
@@ -738,14 +633,14 @@ interface SyncedAudioPipelineOptions {
  /** Target chunk duration in ms for coalescing (default: 200) */
  chunkTargetMs?: number;
  /** LAM inference engine */
- lam: Wav2Vec2Inference;
+ lam: LipSyncBackend;
  }
  interface SyncedAudioPipelineEvents {
  /** New frame ready for display */
  frame_ready: Float32Array;
  /** Playback has completed */
  playback_complete: void;
- /** First LAM inference completed, playback starting */
+ /** First audio chunk scheduled, playback starting */
  playback_start: number;
  /** Error occurred */
  error: Error;
@@ -757,8 +652,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  private scheduler;
  private coalescer;
  private lamPipeline;
- private waitingForFirstLAM;
- private bufferedChunks;
+ private playbackStarted;
  private monitorInterval;
  private frameAnimationId;
  constructor(options: SyncedAudioPipelineOptions);
@@ -770,31 +664,19 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  * Start a new playback session
  *
  * Resets all state and prepares for incoming audio chunks.
- * Enables wait-for-first-LAM synchronization.
+ * Audio will be scheduled immediately as chunks arrive (no buffering).
  */
  start(): void;
  /**
  * Receive audio chunk from network
  *
- * Implements wait-for-first-LAM pattern:
- * - Chunks are coalesced into optimal buffers
- * - Buffers are sent to LAM for processing
- * - Audio scheduling waits until first LAM completes
- * - Then all buffered audio is scheduled together with LAM frames
+ * Audio-first design: schedules audio immediately, LAM runs in background.
+ * This prevents LAM inference (50-300ms) from blocking audio scheduling,
+ * which caused audible stuttering with continuous audio streams.
  *
  * @param chunk - Uint8Array containing Int16 PCM audio
  */
  onAudioChunk(chunk: Uint8Array): Promise<void>;
- /**
- * Handle first LAM inference completion
- *
- * This is the critical synchronization point:
- * - LAM frames are now ready in the queue
- * - Schedule all buffered audio chunks
- * - Adjust LAM frame timestamps to match actual schedule time
- * - Audio and LAM start playing together, perfectly synchronized
- */
- private onFirstLAMComplete;
  /**
  * End of audio stream
  *
@@ -840,8 +722,7 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  * Get current pipeline state (for debugging/monitoring)
  */
  getState(): {
- waitingForFirstLAM: boolean;
- bufferedChunks: number;
+ playbackStarted: boolean;
  coalescerFill: number;
  lamFill: number;
  queuedFrames: number;
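For debugging, the reshaped getState() can be polled. A small sketch, assuming a SyncedAudioPipeline instance created elsewhere and logging only the fields visible in this hunk (the one-second interval is an arbitrary choice):

```typescript
import type { SyncedAudioPipeline } from '@omote/core';

declare const pipeline: SyncedAudioPipeline;  // created elsewhere

const monitor = setInterval(() => {
  const s = pipeline.getState();
  console.log(
    `playbackStarted=${s.playbackStarted} coalescerFill=${s.coalescerFill}`,
    `lamFill=${s.lamFill} queuedFrames=${s.queuedFrames}`
  );
}, 1000);

// Remember to clearInterval(monitor) once the session is disposed.
```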
@@ -854,6 +735,99 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
  dispose(): void;
  }
 
+ /**
+ * Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
+ *
+ * This module provides a way to dynamically load the appropriate ONNX Runtime bundle
+ * based on the platform's capabilities. This is critical for iOS support because:
+ *
+ * 1. iOS Safari has WebGPU API but ONNX Runtime's WebGPU backend crashes
+ * 2. Loading the WebGPU bundle wastes bandwidth and can cause issues
+ * 3. WASM-only bundle is smaller and more reliable on iOS
+ *
+ * Usage:
+ * ```typescript
+ * const ort = await getOnnxRuntime('wasm'); // Load WASM-only bundle
+ * const ort = await getOnnxRuntime('webgpu'); // Load WebGPU bundle (includes WASM)
+ * ```
+ *
+ * @module inference/onnxLoader
+ */
+
+ type OrtModule = {
+ InferenceSession: typeof InferenceSession;
+ Tensor: typeof Tensor;
+ env: Env;
+ };
+ type SessionOptions = InferenceSession.SessionOptions;
+
+ /**
+ * Check if WebGPU is available and likely to work
+ *
+ * This is more thorough than just checking navigator.gpu exists.
+ * It actually requests an adapter to verify the GPU is accessible.
+ *
+ * @returns true if WebGPU is available and working
+ */
+ declare function isWebGPUAvailable(): Promise<boolean>;
+ /**
+ * Load ONNX Runtime with the specified backend
+ *
+ * This lazily loads the appropriate bundle:
+ * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
+ * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
+ *
+ * Once loaded, the same instance is reused for all subsequent calls.
+ * If you need to switch backends, you must reload the page.
+ *
+ * @param backend The backend to load ('webgpu' or 'wasm')
+ * @returns The ONNX Runtime module
+ */
+ declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
+ /**
+ * Get the appropriate ONNX Runtime based on user preference
+ *
+ * This resolves the user's preference against platform capabilities
+ * and loads the appropriate bundle.
+ *
+ * @param preference User's backend preference
+ * @returns The ONNX Runtime module and the resolved backend
+ */
+ declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
+ ort: OrtModule;
+ backend: RuntimeBackend;
+ }>;
+ /**
+ * Get session options for creating an inference session
+ *
+ * This returns optimized session options based on the backend and platform.
+ *
+ * @param backend The backend being used
+ * @returns Session options for InferenceSession.create()
+ */
+ declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
+ /**
+ * Create an inference session with automatic fallback
+ *
+ * If WebGPU session creation fails, automatically falls back to WASM.
+ *
+ * @param modelBuffer The model data as ArrayBuffer
+ * @param preferredBackend The preferred backend
+ * @returns The created session and the backend used
+ */
+ declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
+ session: InferenceSession;
+ backend: RuntimeBackend;
+ }>;
+ /**
+ * Get the currently loaded backend (if any)
+ */
+ declare function getLoadedBackend(): RuntimeBackend | null;
+ /**
+ * Check if ONNX Runtime has been loaded
+ */
+ declare function isOnnxRuntimeLoaded(): boolean;
+
  /**
  * Whisper Automatic Speech Recognition using transformers.js
  * Uses Xenova's proven pipeline API for reliable transcription
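The loader block above was moved within the file rather than removed, and its public functions still compose as before. A sketch of loading a custom ONNX model with automatic fallback; the model path is a placeholder, and 'auto' is assumed to be a valid BackendPreference value (as the createLipSync defaults later in this diff suggest):

```typescript
import {
  createSessionWithFallback,
  getLoadedBackend,
  getOnnxRuntimeForPreference,
} from '@omote/core';

async function loadCustomModel(url: string) {
  // Resolve the user preference against platform capabilities and load the right bundle.
  const { backend } = await getOnnxRuntimeForPreference('auto');

  const buffer = await (await fetch(url)).arrayBuffer();

  // WebGPU session creation failures fall back to WASM automatically.
  const { session, backend: used } = await createSessionWithFallback(buffer, backend);
  console.log(`session created on ${used}; loader reports ${getLoadedBackend()}`);
  return session;
}

void loadCustomModel('/models/custom.onnx');  // placeholder path
```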
@@ -982,6 +956,288 @@ declare class WhisperInference {
  private removeNonSpeechTokens;
  }
 
+ /**
+ * Shared blendshape constants and utilities for lip sync inference
+ *
+ * Contains LAM_BLENDSHAPES (canonical ordering), symmetrization, and
+ * index remapping used by both Wav2Vec2Inference and Wav2ArkitCpuInference.
+ *
+ * This module is the single source of truth for blendshape ordering to
+ * avoid circular dependencies between inference classes.
+ *
+ * @category Inference
+ */
+ /**
+ * LAM model blendshape names in order (52 total)
+ * NOTE: This is alphabetical ordering used by LAM, different from standard ARKit order
+ */
+ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+ /** Alias for backwards compatibility */
+ declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
+ /**
+ * Symmetrize blendshapes by averaging left/right pairs
+ * From LAM official postprocessing (models/utils.py)
+ * This fixes asymmetric output from the raw model
+ */
+ declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
+ /**
+ * wav2arkit_cpu model blendshape ordering
+ *
+ * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
+ * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
+ * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
+ */
+ declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
+ /**
+ * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
+ *
+ * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
+ * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
+ */
+ declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
+
+ /**
+ * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
+ *
+ * Runs entirely in the browser using WebGPU or WASM.
+ * Takes raw 16kHz audio and outputs:
+ * - 52 ARKit blendshapes (lip sync)
+ * - 32-token CTC logits (speech recognition)
+ *
+ * @category Inference
+ *
+ * @example Basic usage
+ * ```typescript
+ * import { Wav2Vec2Inference } from '@omote/core';
+ *
+ * const wav2vec = new Wav2Vec2Inference({ modelUrl: '/models/unified_wav2vec2_asr_a2e.onnx' });
+ * await wav2vec.load();
+ *
+ * // Process 1 second of audio (16kHz = 16000 samples)
+ * const result = await wav2vec.infer(audioSamples);
+ *
+ * console.log('Blendshapes:', result.blendshapes); // [30, 52] for 30fps
+ * console.log('ASR text:', result.text); // Decoded transcription
+ * ```
+ */
+
+ type InferenceBackend = BackendPreference;
+ interface Wav2Vec2InferenceConfig {
+ /** Path or URL to the ONNX model */
+ modelUrl: string;
+ /** Preferred backend (auto will try WebGPU first, fallback to WASM) */
+ backend?: InferenceBackend;
+ /** Number of identity classes (default: 12 for streaming model) */
+ numIdentityClasses?: number;
+ }
+ interface ModelInfo {
+ backend: 'webgpu' | 'wasm';
+ loadTimeMs: number;
+ inputNames: string[];
+ outputNames: string[];
+ }
+
+ /** CTC vocabulary (32 tokens from wav2vec2-base-960h) */
+ declare const CTC_VOCAB: string[];
+ interface Wav2Vec2Result {
+ /** Blendshape weights [frames, 52] - 30fps */
+ blendshapes: Float32Array[];
+ /** Raw CTC logits [frames, 32] - 50fps */
+ asrLogits: Float32Array[];
+ /** Decoded text from CTC */
+ text: string;
+ /** Number of blendshape frames (30fps) — alias for numA2EFrames */
+ numFrames: number;
+ /** Number of A2E frames (30fps) */
+ numA2EFrames: number;
+ /** Number of ASR frames (50fps) */
+ numASRFrames: number;
+ /** Inference time in ms */
+ inferenceTimeMs: number;
+ }
+ declare class Wav2Vec2Inference {
+ private session;
+ private ort;
+ private config;
+ private _backend;
+ private isLoading;
+ private numIdentityClasses;
+ private inferenceQueue;
+ constructor(config: Wav2Vec2InferenceConfig);
+ /**
+ * Check if WebGPU is available and working
+ * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
+ */
+ static isWebGPUAvailable: typeof isWebGPUAvailable;
+ get backend(): 'webgpu' | 'wasm' | null;
+ get isLoaded(): boolean;
+ /**
+ * Load the ONNX model
+ */
+ load(): Promise<ModelInfo>;
+ /**
+ * Run inference on raw audio
+ * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
+ * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
+ *
+ * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
+ * Audio will be zero-padded or truncated to 16000 samples.
+ */
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
+ /**
+ * Decode CTC logits to text using greedy decoding
+ */
+ private decodeCTC;
+ /**
+ * Queue inference to serialize ONNX session calls
+ */
+ private queueInference;
+ /**
+ * Get blendshape value by name for a specific frame
+ */
+ getBlendshape(blendshapes: Float32Array, name: typeof LAM_BLENDSHAPES[number]): number;
+ /**
+ * Dispose of the model and free resources
+ */
+ dispose(): Promise<void>;
+ }
+
+ /**
+ * CPU-optimized lip sync inference using wav2arkit_cpu model
+ *
+ * A lightweight (1.8MB) alternative to Wav2Vec2Inference (384MB) designed
+ * for Safari/iOS where WebGPU crashes due to ONNX Runtime JSEP bugs.
+ *
+ * Key differences from Wav2Vec2Inference:
+ * - WASM-only backend (CPU-optimized, no WebGPU)
+ * - 1.8MB model vs 384MB
+ * - No identity input (baked to identity 11)
+ * - No ASR output (lip sync only)
+ * - Dynamic input length (not fixed to 16000 samples)
+ * - Different native blendshape ordering (remapped to LAM_BLENDSHAPES)
+ *
+ * @category Inference
+ *
+ * @example
+ * ```typescript
+ * import { Wav2ArkitCpuInference } from '@omote/core';
+ *
+ * const lam = new Wav2ArkitCpuInference({
+ * modelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ * await lam.load();
+ *
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * // blendshapes: Float32Array[] in LAM_BLENDSHAPES order, 30fps
+ * ```
+ */
+
+ interface Wav2ArkitCpuConfig {
+ /** Path or URL to the wav2arkit_cpu ONNX model */
+ modelUrl: string;
+ /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
+ backend?: BackendPreference;
+ }
+ declare class Wav2ArkitCpuInference implements LipSyncBackend {
+ private session;
+ private ort;
+ private config;
+ private _backend;
+ private isLoading;
+ private inferenceQueue;
+ constructor(config: Wav2ArkitCpuConfig);
+ get backend(): RuntimeBackend | null;
+ get isLoaded(): boolean;
+ /**
+ * Load the ONNX model
+ */
+ load(): Promise<LipSyncModelInfo>;
+ /**
+ * Run inference on raw audio
+ *
+ * Accepts variable-length audio (not fixed to 16000 samples).
+ * Output frames = ceil(30 * numSamples / 16000).
+ *
+ * @param audioSamples - Float32Array of raw audio at 16kHz
+ * @param _identityIndex - Ignored (identity 11 is baked into the model)
+ */
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
+ /**
+ * Queue inference to serialize ONNX session calls
+ */
+ private queueInference;
+ /**
+ * Dispose of the model and free resources
+ */
+ dispose(): Promise<void>;
+ }
+
+ /**
+ * Factory function for lip sync with automatic GPU/CPU model selection
+ *
+ * Provides a unified API that automatically selects the optimal model:
+ * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (1.8MB, WASM)
+ * - Chrome/Firefox/Edge: Uses Wav2Vec2Inference (384MB, WebGPU)
+ * - Fallback: Gracefully falls back to CPU model if GPU model fails to load
+ *
+ * @category Inference
+ *
+ * @example Auto-detect (recommended)
+ * ```typescript
+ * import { createLipSync } from '@omote/core';
+ *
+ * const lam = createLipSync({
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * });
+ *
+ * await lam.load();
+ * const { blendshapes } = await lam.infer(audioSamples);
+ * ```
+ *
+ * @example Force CPU model
+ * ```typescript
+ * const lam = createLipSync({
+ * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
+ * mode: 'cpu',
+ * });
+ * ```
+ */
+
+ /**
+ * Configuration for the lip sync factory
+ */
+ interface CreateLipSyncConfig {
+ /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
+ gpuModelUrl: string;
+ /** URL for the CPU model (wav2arkit_cpu, used on Safari/iOS) */
+ cpuModelUrl: string;
+ /**
+ * Model selection mode:
+ * - 'auto': Safari/iOS → CPU, everything else → GPU (default)
+ * - 'gpu': Force GPU model (Wav2Vec2Inference)
+ * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
+ */
+ mode?: 'auto' | 'gpu' | 'cpu';
+ /** Backend preference for GPU model (default: 'auto') */
+ gpuBackend?: BackendPreference;
+ /** Number of identity classes for GPU model (default: 12) */
+ numIdentityClasses?: number;
+ /**
+ * Fall back to CPU model if GPU model fails to load (default: true)
+ * Only applies when mode is 'auto' or 'gpu'
+ */
+ fallbackOnError?: boolean;
+ }
+ /**
+ * Create a lip sync instance with automatic GPU/CPU model selection
+ *
+ * @param config - Factory configuration
+ * @returns A LipSyncBackend instance (either GPU or CPU model)
+ */
+ declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
+
  /**
  * Silero VAD (Voice Activity Detection) inference
  *
@@ -3809,4 +4065,4 @@ declare class EmphasisDetector {
  reset(): void;
  }
 
- export { type AIAdapter, type AIAdapterEvents, type AISessionState, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, validateCachedResponse };
+ export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, HF_CDN_TEST_URL, type HuggingFaceUrlInfo, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type TranscriptionResult, type Transition, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, type WhisperConfig, WhisperInference, type WhisperModel, blendEmotions, calculatePeak, calculateRMS, clearSpecificCache, clearTransformersCache, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isHuggingFaceCDNReachable, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, listCaches, nukeBrowserCaches, parseHuggingFaceUrl, preloadModels, remapWav2ArkitToLam, resolveBackend, scanForInvalidCaches, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes, validateCachedResponse };
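The newly exported blendshape constants make the CPU-to-LAM remap easy to reason about. An illustrative sketch only (the package already ships remapWav2ArkitToLam and symmetrizeBlendshapes, exported above), showing how an index map follows from the two orderings and how to read a channel by name:

```typescript
import { LAM_BLENDSHAPES, WAV2ARKIT_BLENDSHAPES } from '@omote/core';

// For each LAM slot, find where that blendshape lives in the wav2arkit_cpu output.
const toLamIndex = LAM_BLENDSHAPES.map((name) => WAV2ARKIT_BLENDSHAPES.indexOf(name));

// Illustrative reimplementation; prefer the exported remapWav2ArkitToLam.
function remapFrame(frame: Float32Array): Float32Array {
  const out = new Float32Array(LAM_BLENDSHAPES.length);
  toLamIndex.forEach((srcIdx, lamIdx) => {
    out[lamIdx] = frame[srcIdx];
  });
  return out;
}

// Reading a single channel from any LAM-ordered frame:
const jawOpen = (frame: Float32Array) => remapFrame(frame)[LAM_BLENDSHAPES.indexOf('jawOpen')];
```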