@omote/core 0.4.7 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,6 +1,5 @@
1
1
  import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.js';
2
2
  export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
3
- import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
4
3
  export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.js';
5
4
  export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
6
5
 
@@ -379,7 +378,7 @@ declare function shouldEnableWasmProxy(): boolean;
379
378
  */
380
379
  declare function isSafari(): boolean;
381
380
  /**
382
- * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
381
+ * Recommend using CPU-optimized A2E model (wav2arkit_cpu)
383
382
  *
384
383
  * All iOS browsers use WebKit and have tight memory limits — the 384MB
385
384
  * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
@@ -390,7 +389,7 @@ declare function isSafari(): boolean;
390
389
  *
391
390
  * @returns true if iOS (any browser) or Safari (any platform)
392
391
  */
393
- declare function shouldUseCpuLipSync(): boolean;
392
+ declare function shouldUseCpuA2E(): boolean;
394
393
  /**
395
394
  * Check if Web Speech API is available in the browser
396
395
  *
@@ -415,18 +414,18 @@ declare function shouldUseNativeASR(): boolean;
415
414
  /**
416
415
  * Recommend using server-side LAM over client-side on iOS
417
416
  *
418
- * On iOS, LAM lip sync via WASM takes ~332ms per second of audio (3.3x over target).
417
+ * On iOS, LAM A2E via WASM takes ~332ms per second of audio (3.3x over target).
419
418
  * Server-side inference with GPU can achieve ~50ms, providing:
420
- * - Real-time lip sync (under 100ms target)
419
+ * - Real-time A2E (under 100ms target)
421
420
  * - Reduced iOS device thermal/battery impact
422
421
  * - Better user experience
423
422
  *
424
- * @returns true if on iOS (should use server-side lip sync)
423
+ * @returns true if on iOS (should use server-side A2E)
425
424
  */
426
- declare function shouldUseServerLipSync(): boolean;
425
+ declare function shouldUseServerA2E(): boolean;
427
426
 
428
427
  /**
429
- * Common interface for lip sync inference backends
428
+ * Common interface for audio-to-expression (A2E) inference backends
430
429
  *
431
430
  * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
432
431
  * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
@@ -438,19 +437,19 @@ declare function shouldUseServerLipSync(): boolean;
438
437
  /**
439
438
  * Model loading information returned by load()
440
439
  */
441
- interface LipSyncModelInfo {
440
+ interface A2EModelInfo {
442
441
  backend: RuntimeBackend;
443
442
  loadTimeMs: number;
444
443
  inputNames: string[];
445
444
  outputNames: string[];
446
445
  }
447
446
  /**
448
- * Result from lip sync inference
447
+ * Result from A2E inference
449
448
  *
450
449
  * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
451
450
  * Models with different native orderings must remap internally before returning.
452
451
  */
453
- interface LipSyncResult {
452
+ interface A2EResult {
454
453
  /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
455
454
  blendshapes: Float32Array[];
456
455
  /** Number of blendshape frames */
@@ -459,31 +458,33 @@ interface LipSyncResult {
459
458
  inferenceTimeMs: number;
460
459
  }
461
460
  /**
462
- * Common interface for lip sync inference engines
461
+ * Common interface for A2E (audio-to-expression) inference engines
463
462
  *
464
463
  * Implemented by:
465
- * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
466
- * - Wav2ArkitCpuInference (WASM-only, 404MB, lip sync only)
464
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + A2E)
465
+ * - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
467
466
  */
468
- interface LipSyncBackend {
467
+ interface A2EBackend {
469
468
  /** Model identifier for backend-specific tuning (e.g. audio delay) */
470
469
  readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
471
470
  /** Current backend type (webgpu, wasm, or null if not loaded) */
472
471
  readonly backend: RuntimeBackend | null;
473
472
  /** Whether the model is loaded and ready for inference */
474
473
  readonly isLoaded: boolean;
474
+ /** Optimal number of audio samples per inference call (e.g. 16000 = 1s at 16kHz) */
475
+ readonly chunkSize: number;
475
476
  /**
476
477
  * Load the ONNX model
477
478
  * @returns Model loading information
478
479
  */
479
- load(): Promise<LipSyncModelInfo>;
480
+ load(): Promise<A2EModelInfo>;
480
481
  /**
481
482
  * Run inference on raw audio
482
483
  * @param audioSamples - Float32Array of raw audio at 16kHz
483
484
  * @param identityIndex - Optional identity index (ignored by CPU model)
484
- * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
485
+ * @returns A2E result with blendshapes in LAM_BLENDSHAPES order
485
486
  */
486
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
487
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
487
488
  /**
488
489
  * Dispose of the model and free resources
489
490
  */
@@ -491,542 +492,16 @@ interface LipSyncBackend {
491
492
  }
492
493
 
493
494
  /**
494
- * LAMPipeline - Coordinate LAM (Wav2Vec2) inference with frame synchronization
495
+ * FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
495
496
  *
496
- * Manages the buffering and processing pipeline for LAM lip sync:
497
- * 1. Accumulates audio samples in a ring buffer
498
- * 2. Triggers LAM inference when buffer reaches required size (16000 samples @ 16kHz = 1.0s)
499
- * 3. Queues resulting blendshape frames with precise timestamps
500
- * 4. Provides frames synchronized to AudioContext clock
497
+ * Orchestrates full-face animation by:
498
+ * 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
499
+ * 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
500
+ * 3. Applying per-character ExpressionProfile scaling to raw A2E output
501
501
  *
502
- * Key Design Decisions:
503
- * - Ring buffer pattern for efficient sample accumulation (no allocation churn)
504
- * - Frame queue with timestamps for deterministic playback
505
- * - Timestamp-based frame retrieval (not callback) for renderer flexibility
506
- *
507
- * Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
508
- *
509
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern
510
- * @category Audio
511
- */
512
-
513
- interface LAMFrame {
514
- /** 52 ARKit blendshape weights */
515
- frame: Float32Array;
516
- /** AudioContext time when this frame should be displayed */
517
- timestamp: number;
518
- }
519
- interface LAMPipelineOptions {
520
- /**
521
- * Sample rate in Hz (must match audio playback)
522
- * Default: 16000
523
- */
524
- sampleRate?: number;
525
- /**
526
- * LAM inference callback
527
- * Called each time LAM processes a buffer
528
- */
529
- onInference?: (frameCount: number) => void;
530
- /**
531
- * Error callback for inference failures
532
- */
533
- onError?: (error: Error) => void;
534
- }
535
- declare class LAMPipeline {
536
- private readonly options;
537
- private readonly REQUIRED_SAMPLES;
538
- private readonly FRAME_RATE;
539
- private buffer;
540
- private bufferStartTime;
541
- private frameQueue;
542
- /**
543
- * Last successfully retrieved frame
544
- * Used as fallback when no new frame is available to prevent avatar freezing
545
- */
546
- private lastFrame;
547
- constructor(options?: LAMPipelineOptions);
548
- /**
549
- * Push audio samples into the pipeline
550
- *
551
- * Accumulates samples and triggers LAM inference when buffer is full.
552
- * Multiple calls may be needed to accumulate enough samples.
553
- *
554
- * @param samples - Float32Array of audio samples
555
- * @param timestamp - AudioContext time when these samples start playing
556
- * @param lam - LAM inference engine
557
- */
558
- push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
559
- /**
560
- * Process accumulated buffer through LAM inference
561
- */
562
- private processBuffer;
563
- /**
564
- * Get the frame that should be displayed at the current time
565
- *
566
- * Automatically removes frames that have already been displayed.
567
- * This prevents memory leaks from accumulating old frames.
568
- *
569
- * Discard Window (prevents premature frame discarding):
570
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
571
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
572
- *
573
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
574
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
575
- *
576
- * @param currentTime - Current AudioContext time
577
- * @param lam - LAM inference engine (optional, for backend detection)
578
- * @returns Current frame, or last frame as fallback, or null if no frames yet
579
- */
580
- getFrameForTime(currentTime: number, lam?: {
581
- backend: 'webgpu' | 'wasm' | null;
582
- }): Float32Array | null;
583
- /**
584
- * Get all frames in the queue (for debugging/monitoring)
585
- */
586
- getQueuedFrames(): LAMFrame[];
587
- /**
588
- * Get current buffer fill level (0-1)
589
- */
590
- get fillLevel(): number;
591
- /**
592
- * Get number of frames queued
593
- */
594
- get queuedFrameCount(): number;
595
- /**
596
- * Get buffered audio duration in seconds
597
- */
598
- get bufferedDuration(): number;
599
- /**
600
- * Flush remaining buffered audio
601
- *
602
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
603
- * This ensures the final audio chunk generates blendshape frames.
604
- *
605
- * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
606
- *
607
- * @param lam - LAM inference engine
608
- */
609
- flush(lam: LipSyncBackend): Promise<void>;
610
- /**
611
- * Adjust all queued frame timestamps by an offset
612
- *
613
- * Used for synchronization when audio scheduling time differs from
614
- * the estimated time used during LAM processing.
615
- *
616
- * @param offset - Time offset in seconds to add to all timestamps
617
- */
618
- adjustTimestamps(offset: number): void;
619
- /**
620
- * Reset the pipeline
621
- */
622
- reset(): void;
623
- }
624
-
625
- /**
626
- * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
627
- *
628
- * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
629
- * 1. Network chunks → Coalescer → Optimized buffers
630
- * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
631
- * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
632
- * 4. Frames synchronized to AudioContext clock → Renderer
633
- *
634
- * Key Architecture Pattern: Audio-First, LAM-Background
635
- * - Audio chunks are scheduled for playback immediately (never waits for LAM)
636
- * - LAM inference runs in background without blocking the audio path
637
- * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
638
- * - Once LAM catches up, frames stay synchronized to AudioContext clock
639
- *
640
- * This decoupled design prevents LAM inference (50-300ms) from blocking audio
641
- * scheduling, which caused audible stuttering when audio arrived as a continuous
642
- * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
643
- *
644
- * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
645
- * @category Audio
646
- */
647
-
648
- interface SyncedAudioPipelineOptions {
649
- /** Sample rate in Hz (default: 16000) */
650
- sampleRate?: number;
651
- /** Target chunk duration in ms for coalescing (default: 200) */
652
- chunkTargetMs?: number;
653
- /** LAM inference engine */
654
- lam: LipSyncBackend;
655
- /**
656
- * Audio playback delay in ms before first audio plays.
657
- * Gives LAM inference time to pre-compute blendshapes.
658
- * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
659
- */
660
- audioDelayMs?: number;
661
- }
662
- interface SyncedAudioPipelineEvents {
663
- /** New frame ready for display */
664
- frame_ready: Float32Array;
665
- /** Playback has completed */
666
- playback_complete: void;
667
- /** First audio chunk scheduled, playback starting */
668
- playback_start: number;
669
- /** Error occurred */
670
- error: Error;
671
- /** Index signature for EventEmitter compatibility */
672
- [key: string]: unknown;
673
- }
674
- declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
675
- private readonly options;
676
- private scheduler;
677
- private coalescer;
678
- private lamPipeline;
679
- private playbackStarted;
680
- private monitorInterval;
681
- private frameAnimationId;
682
- constructor(options: SyncedAudioPipelineOptions);
683
- /**
684
- * Initialize the pipeline
685
- */
686
- initialize(): Promise<void>;
687
- /**
688
- * Start a new playback session
689
- *
690
- * Resets all state and prepares for incoming audio chunks.
691
- * Audio will be scheduled immediately as chunks arrive (no buffering).
692
- */
693
- start(): void;
694
- /**
695
- * Receive audio chunk from network
696
- *
697
- * Audio-first design: schedules audio immediately, LAM runs in background.
698
- * This prevents LAM inference (50-300ms) from blocking audio scheduling,
699
- * which caused audible stuttering with continuous audio streams.
700
- *
701
- * @param chunk - Uint8Array containing Int16 PCM audio
702
- */
703
- onAudioChunk(chunk: Uint8Array): Promise<void>;
704
- /**
705
- * End of audio stream
706
- *
707
- * Flushes any remaining buffered data.
708
- */
709
- end(): Promise<void>;
710
- /**
711
- * Stop playback immediately with smooth fade-out
712
- *
713
- * Gracefully cancels all audio playback and LAM processing:
714
- * - Fades out audio over specified duration (default: 50ms)
715
- * - Cancels pending LAM inferences
716
- * - Clears all buffers and queues
717
- * - Emits 'playback_complete' event
718
- *
719
- * Use this for interruptions (e.g., user barge-in during AI speech).
720
- *
721
- * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
722
- * @returns Promise that resolves when fade-out completes
723
- */
724
- stop(fadeOutMs?: number): Promise<void>;
725
- /**
726
- * Start frame animation loop
727
- *
728
- * Uses requestAnimationFrame to check for new LAM frames.
729
- * Synchronized to AudioContext clock (not visual refresh rate).
730
- *
731
- * Frame Emission Strategy:
732
- * - LAMPipeline uses last-frame-hold to prevent null returns
733
- * - Always emit frames (even repeated frames) to maintain smooth animation
734
- * - Renderer is responsible for detecting duplicate frames if needed
735
- */
736
- private startFrameLoop;
737
- /**
738
- * Start monitoring for playback completion
739
- */
740
- private startMonitoring;
741
- /**
742
- * Stop monitoring
743
- */
744
- private stopMonitoring;
745
- /**
746
- * Get current pipeline state (for debugging/monitoring)
747
- */
748
- getState(): {
749
- playbackStarted: boolean;
750
- coalescerFill: number;
751
- lamFill: number;
752
- queuedFrames: number;
753
- currentTime: number;
754
- playbackEndTime: number;
755
- };
756
- /**
757
- * Cleanup resources
758
- */
759
- dispose(): void;
760
- }
761
-
762
- /**
763
- * Emotion to ARKit Blendshape Mapper
764
- *
765
- * Converts Emotion2VecInference output to upper face ARKit blendshapes for
766
- * expressive avatar animation. Maps 4 emotion categories (neutral, happy, angry, sad)
767
- * to 11 upper face blendshapes (brows, eyes, cheeks).
768
- *
769
- * Supports two blend modes:
770
- * - 'dominant': Uses only the strongest emotion (simpler, more stable)
771
- * - 'weighted': Blends all emotions by probability (more nuanced, e.g., bittersweet)
772
- *
773
- * Also supports energy modulation to scale emotion intensity by audio energy,
774
- * making expressions stronger during emphasized speech.
775
- *
776
- * @example Basic usage
777
- * ```typescript
778
- * import { EmotionToBlendshapeMapper } from '@omote/core';
779
- * import { Emotion2VecInference } from '@omote/core';
780
- *
781
- * const emotion = new Emotion2VecInference({ modelUrl: '/models/emotion.onnx' });
782
- * const mapper = new EmotionToBlendshapeMapper();
783
- *
784
- * // Process emotion frame
785
- * const result = await emotion.infer(audioSamples);
786
- * const blendshapes = mapper.mapFrame(result.dominant);
787
- *
788
- * // Apply to avatar
789
- * for (const [name, value] of Object.entries(blendshapes)) {
790
- * avatar.setBlendshape(name, value);
791
- * }
792
- * ```
793
- *
794
- * @example Weighted blending for nuanced expressions
795
- * ```typescript
796
- * const mapper = new EmotionToBlendshapeMapper({
797
- * blendMode: 'weighted',
798
- * minBlendProbability: 0.1,
799
- * });
800
- *
801
- * // Frame with mixed emotions: { happy: 0.6, sad: 0.3, neutral: 0.1 }
802
- * // Result: bittersweet expression (smiling but worried brow)
803
- * const blendshapes = mapper.mapFrame(emotionFrame);
804
- * ```
805
- *
806
- * @example Energy-modulated emotion
807
- * ```typescript
808
- * import { AudioEnergyAnalyzer } from '@omote/core';
809
- *
810
- * const energyAnalyzer = new AudioEnergyAnalyzer();
811
- * const mapper = new EmotionToBlendshapeMapper({ energyModulation: true });
812
- *
813
- * // In animation loop
814
- * function animate(audioChunk: Float32Array, emotionFrame: EmotionFrame) {
815
- * const { energy } = energyAnalyzer.analyze(audioChunk);
816
- * mapper.mapFrame(emotionFrame, energy); // Louder = stronger emotion
817
- * mapper.update(16);
818
- * applyToAvatar(mapper.getCurrentBlendshapes());
819
- * }
820
- * ```
821
- *
822
- * @module animation
823
- */
824
- declare const EMOTION2VEC_LABELS: readonly ["neutral", "happy", "angry", "sad"];
825
- type Emotion2VecLabel = (typeof EMOTION2VEC_LABELS)[number];
826
- interface EmotionFrame {
827
- /** Primary emotion label */
828
- emotion: Emotion2VecLabel;
829
- /** Confidence for primary emotion (0-1) */
830
- confidence: number;
831
- /** All emotion probabilities */
832
- probabilities: Record<Emotion2VecLabel, number>;
833
- }
834
- /**
835
- * Upper face ARKit blendshape names (11 total)
836
- *
837
- * These blendshapes control the upper face (brows, eyes, cheeks) and are
838
- * driven by emotion detection, complementing the mouth blendshapes from
839
- * LAM lip sync.
840
- */
841
- declare const UPPER_FACE_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "cheekSquintLeft", "cheekSquintRight"];
842
- type UpperFaceBlendshapeName = (typeof UPPER_FACE_BLENDSHAPES)[number];
843
- /**
844
- * Upper face blendshape values (0-1 for each)
845
- */
846
- type UpperFaceBlendshapes = Record<UpperFaceBlendshapeName, number>;
847
- /**
848
- * Blend mode for combining emotions
849
- * - 'dominant': Use only the strongest emotion (default, more stable)
850
- * - 'weighted': Blend all emotions by probability (more nuanced)
851
- */
852
- type EmotionBlendMode = 'dominant' | 'weighted';
853
- /**
854
- * Emotion to ARKit blendshape mapping
855
- *
856
- * Based on Paul Ekman's FACS (Facial Action Coding System) research:
857
- *
858
- * - Happy (AU6+AU12): Cheek raise + lip corner pull (Duchenne smile)
859
- * Upper face: cheekSquint (AU6) + slight eyeSquint from genuine smile
860
- *
861
- * - Angry (AU4+AU5+AU7+AU23): Brow lower + eye wide + lid tighten + lip press
862
- * Upper face: browDown (AU4) + eyeWide (AU5) + eyeSquint (AU7) creates the "glare"
863
- *
864
- * - Sad (AU1+AU4+AU15): Inner brow raise + brow furrow + lip corner depress
865
- * Upper face: browInnerUp (AU1) + browDown (AU4) creates the worried/sad brow
866
- *
867
- * - Neutral: All zeros (no expression overlay)
868
- *
869
- * @see https://imotions.com/blog/learning/research-fundamentals/facial-action-coding-system/
870
- * @see https://melindaozel.com/arkit-to-facs-cheat-sheet/
871
- */
872
- declare const EMOTION_ARKIT_MAP: Record<Emotion2VecLabel, Partial<UpperFaceBlendshapes>>;
873
- /**
874
- * Configuration for EmotionToBlendshapeMapper
875
- */
876
- interface EmotionBlendshapeConfig {
877
- /**
878
- * Smoothing factor for exponential moving average (0-1)
879
- * Lower = slower, smoother transitions
880
- * Higher = faster, more responsive
881
- * @default 0.15
882
- */
883
- smoothingFactor?: number;
884
- /**
885
- * Minimum confidence threshold for emotion to take effect
886
- * Emotions below this confidence are treated as neutral
887
- * @default 0.3
888
- */
889
- confidenceThreshold?: number;
890
- /**
891
- * Global intensity multiplier for all blendshapes (0-2)
892
- * @default 1.0
893
- */
894
- intensity?: number;
895
- /**
896
- * Blend mode for combining emotions
897
- * - 'dominant': Use only the strongest emotion (default)
898
- * - 'weighted': Blend all emotions by probability
899
- * @default 'dominant'
900
- */
901
- blendMode?: EmotionBlendMode;
902
- /**
903
- * Minimum probability for an emotion to contribute in weighted blend mode
904
- * Emotions with probability below this are ignored
905
- * @default 0.1
906
- */
907
- minBlendProbability?: number;
908
- /**
909
- * Enable energy modulation - scale emotion intensity by audio energy
910
- * When enabled, louder speech produces stronger expressions
911
- * @default false
912
- */
913
- energyModulation?: boolean;
914
- /**
915
- * Minimum energy scale when energy modulation is enabled (0-1)
916
- * At zero audio energy, emotion intensity is scaled by this factor
917
- * @default 0.3
918
- */
919
- minEnergyScale?: number;
920
- /**
921
- * Maximum energy scale when energy modulation is enabled (0-2)
922
- * At maximum audio energy, emotion intensity is scaled by this factor
923
- * @default 1.0
924
- */
925
- maxEnergyScale?: number;
926
- }
927
- /**
928
- * EmotionToBlendshapeMapper
929
- *
930
- * Converts emotion detection output to upper face ARKit blendshapes.
931
- * Provides smooth transitions between emotion states using exponential
932
- * moving average interpolation.
933
- *
934
- * Supports two blend modes:
935
- * - 'dominant': Uses only the strongest emotion
936
- * - 'weighted': Blends all emotions by probability for nuanced expressions
937
- *
938
- * Also supports energy modulation to scale emotion intensity by audio energy.
939
- */
940
- declare class EmotionToBlendshapeMapper {
941
- private config;
942
- private targetBlendshapes;
943
- private currentBlendshapes;
944
- private currentEnergy;
945
- /**
946
- * Create a new EmotionToBlendshapeMapper
947
- *
948
- * @param config - Optional configuration
949
- */
950
- constructor(config?: EmotionBlendshapeConfig);
951
- /**
952
- * Map an emotion frame to target blendshapes
953
- *
954
- * This sets the target values that the mapper will smoothly interpolate
955
- * towards. Call update() each frame to apply smoothing.
956
- *
957
- * @param frame - Emotion frame from Emotion2VecInference
958
- * @param audioEnergy - Optional audio energy (0-1) for energy modulation
959
- * @returns Target upper face blendshapes (before smoothing)
960
- */
961
- mapFrame(frame: EmotionFrame, audioEnergy?: number): UpperFaceBlendshapes;
962
- /**
963
- * Map using dominant emotion only (original behavior)
964
- */
965
- private mapFrameDominant;
966
- /**
967
- * Map using weighted blend of all emotions by probability
968
- * Creates more nuanced expressions (e.g., bittersweet = happy + sad)
969
- */
970
- private mapFrameWeighted;
971
- /**
972
- * Apply energy modulation to scale emotion intensity by audio energy
973
- * Louder speech = stronger expressions
974
- */
975
- private applyEnergyModulation;
976
- /**
977
- * Apply smoothing to interpolate current values towards target
978
- *
979
- * Uses exponential moving average:
980
- * current = current + smoothingFactor * (target - current)
981
- *
982
- * @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
983
- */
984
- update(_deltaMs: number): void;
985
- /**
986
- * Get current smoothed blendshape values
987
- *
988
- * @returns Current upper face blendshapes (after smoothing)
989
- */
990
- getCurrentBlendshapes(): UpperFaceBlendshapes;
991
- /**
992
- * Reset mapper to neutral state
993
- *
994
- * Sets both target and current blendshapes to zero.
995
- */
996
- reset(): void;
997
- /**
998
- * Get current configuration
999
- */
1000
- getConfig(): Required<EmotionBlendshapeConfig>;
1001
- /**
1002
- * Update configuration
1003
- *
1004
- * @param config - Partial configuration to update
1005
- */
1006
- setConfig(config: Partial<EmotionBlendshapeConfig>): void;
1007
- }
1008
-
1009
- /**
1010
- * FullFacePipeline - Combined LAM lip sync + Emotion upper face pipeline
1011
- *
1012
- * Orchestrates full-face animation by combining:
1013
- * 1. LAM lip sync (52 ARKit blendshapes) via audio-first scheduling
1014
- * 2. Emotion labels (from backend LLM or `setEmotionLabel()`) for upper face
1015
- * 3. AudioEnergyAnalyzer for prosody-driven fallback when no emotion label is set
1016
- *
1017
- * Architecture: Audio-First, LAM-Background (same as SyncedAudioPipeline)
1018
- * - Audio chunks are scheduled for playback immediately (never waits for LAM)
1019
- * - LAM inference runs in background without blocking the audio path
1020
- * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
1021
- *
1022
- * Merge Strategy:
1023
- * - Lower face (41 blendshapes): 100% from LAM (mouth, jaw, tongue, etc.)
1024
- * - Upper face (11 blendshapes): Emotion overlay with LAM as subtle fallback
1025
- * Formula: emotion * emotionBlendFactor + lam * lamBlendFactor
1026
- *
1027
- * Emotion Sources (in priority order):
1028
- * 1. `setEmotionLabel()` — explicit label from backend LLM (recommended)
1029
- * 2. Prosody fallback — subtle brow movement from audio energy (automatic)
502
+ * The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
503
+ * mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
504
+ * by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
1030
505
  *
1031
506
  * @category Audio
1032
507
  *
@@ -1036,8 +511,7 @@ declare class EmotionToBlendshapeMapper {
1036
511
  *
1037
512
  * const pipeline = new FullFacePipeline({
1038
513
  * lam,
1039
- * emotionBlendFactor: 0.8,
1040
- * lamBlendFactor: 0.2,
514
+ * profile: { mouth: 1.2, brows: 0.8 },
1041
515
  * });
1042
516
  * await pipeline.initialize();
1043
517
  *
@@ -1046,11 +520,41 @@ declare class EmotionToBlendshapeMapper {
1046
520
  * });
1047
521
  *
1048
522
  * pipeline.start();
1049
- * pipeline.setEmotionLabel('happy'); // From backend LLM
1050
523
  * await pipeline.onAudioChunk(audioData);
1051
524
  * ```
1052
525
  */
1053
526
 
527
+ type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
528
+ /**
529
+ * Per-character weight scaling for A2E blendshape output.
530
+ *
531
+ * Group scalers multiply all blendshapes in that group (default 1.0).
532
+ * Per-blendshape overrides take priority over group scalers.
533
+ * Final values are clamped to [0, 1].
534
+ */
535
+ interface ExpressionProfile {
536
+ /** eyeBlink*, eyeLook*, eyeSquint*, eyeWide* (14 blendshapes) */
537
+ eyes?: number;
538
+ /** browDown*, browInnerUp, browOuterUp* (5 blendshapes) */
539
+ brows?: number;
540
+ /** jawForward, jawLeft, jawRight, jawOpen (4 blendshapes) */
541
+ jaw?: number;
542
+ /** mouth* (23 blendshapes) */
543
+ mouth?: number;
544
+ /** cheekPuff, cheekSquint* (3 blendshapes) */
545
+ cheeks?: number;
546
+ /** noseSneer* (2 blendshapes) */
547
+ nose?: number;
548
+ /** tongueOut (1 blendshape) */
549
+ tongue?: number;
550
+ /** Per-blendshape overrides (0-2). Takes priority over group scalers. */
551
+ overrides?: Partial<Record<string, number>>;
552
+ }
553
+ /**
554
+ * Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
555
+ * Built once at module load from prefix matching.
556
+ */
557
+ declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
1054
558
  /**
1055
559
  * Configuration for FullFacePipeline
1056
560
  */
@@ -1061,37 +565,43 @@ interface FullFacePipelineOptions {
1061
565
  chunkTargetMs?: number;
1062
566
  /**
1063
567
  * Audio playback delay in ms before first audio plays.
1064
- * Gives LAM inference time to pre-compute blendshapes.
1065
- * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
568
+ * Gives A2E inference time to pre-compute blendshapes before audio
569
+ * starts, preventing frame drops/desync. Must be chunkSize
570
+ * accumulation time + inference latency.
571
+ *
572
+ * Default: auto-calculated from chunkSize and backend type.
1066
573
  */
1067
574
  audioDelayMs?: number;
1068
- /** LAM inference engine */
1069
- lam: LipSyncBackend;
1070
575
  /**
1071
- * Emotion blend factor for upper face blendshapes (0-1)
1072
- * Higher values give more weight to emotion detection
1073
- * @default 0.8
576
+ * A2E inference chunk size in samples.
577
+ * Controls how many samples accumulate before each inference call.
578
+ * Smaller = lower latency (less delay before first frame), more overhead.
579
+ * Larger = higher latency, less overhead.
580
+ *
581
+ * Default: 16000 (1s) — the model's native window size.
582
+ * Smaller chunks get zero-padded, causing near-zero blendshape output.
1074
583
  */
1075
- emotionBlendFactor?: number;
584
+ chunkSize?: number;
585
+ /** A2E inference engine */
586
+ lam: A2EBackend;
587
+ /** Per-character expression weight scaling */
588
+ profile?: ExpressionProfile;
1076
589
  /**
1077
- * LAM blend factor for upper face blendshapes (0-1)
1078
- * Provides subtle fallback from LAM when emotion is weak
1079
- * @default 0.2
590
+ * Time in ms with no new inference frames before logging a stale warning.
591
+ *
592
+ * Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
593
+ * Default: 2000
1080
594
  */
1081
- lamBlendFactor?: number;
595
+ staleThresholdMs?: number;
1082
596
  }
1083
597
  /**
1084
- * Full face frame with merged blendshapes and emotion data
598
+ * Full face frame with scaled blendshapes
1085
599
  */
1086
600
  interface FullFaceFrame {
1087
- /** Merged 52 ARKit blendshapes (lower face from LAM + upper face from emotion) */
601
+ /** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
1088
602
  blendshapes: Float32Array;
1089
- /** Original LAM blendshapes (52) */
1090
- lamBlendshapes: Float32Array;
1091
- /** Emotion-driven upper face blendshapes (11) */
1092
- emotionBlendshapes: UpperFaceBlendshapes;
1093
- /** Raw emotion frame data */
1094
- emotion: EmotionFrame | null;
603
+ /** Raw A2E output (52 blendshapes, before profile scaling) */
604
+ rawBlendshapes: Float32Array;
1095
605
  /** AudioContext timestamp for this frame */
1096
606
  timestamp: number;
1097
607
  }
@@ -1103,8 +613,6 @@ interface FullFacePipelineEvents {
1103
613
  full_frame_ready: FullFaceFrame;
1104
614
  /** Raw LAM frame ready (for debugging/monitoring) */
1105
615
  lam_frame_ready: Float32Array;
1106
- /** Emotion frame ready (for debugging/monitoring) */
1107
- emotion_frame_ready: EmotionFrame;
1108
616
  /** Playback has completed */
1109
617
  playback_complete: void;
1110
618
  /** First frame ready, playback starting */
@@ -1115,53 +623,45 @@ interface FullFacePipelineEvents {
1115
623
  [key: string]: unknown;
1116
624
  }
1117
625
  /**
1118
- * FullFacePipeline - Unified LAM + Emotion animation pipeline
626
+ * FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
1119
627
  *
1120
628
  * Audio-first design matching SyncedAudioPipeline:
1121
- * - Audio is scheduled immediately (never waits for LAM)
1122
- * - LAM runs in background (fire-and-forget)
1123
- * - Emotion from setEmotionLabel() or prosody fallback
629
+ * - Audio is scheduled immediately (never waits for A2E)
630
+ * - A2E runs in background (fire-and-forget via A2EProcessor)
631
+ * - ExpressionProfile scales raw A2E output per-character
1124
632
  */
1125
633
  declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1126
634
  private readonly options;
1127
635
  private scheduler;
1128
636
  private coalescer;
1129
- private lamPipeline;
1130
- private emotionMapper;
1131
- private energyAnalyzer;
637
+ private processor;
1132
638
  private playbackStarted;
1133
639
  private monitorInterval;
1134
640
  private frameAnimationId;
1135
- private lastEmotionFrame;
1136
- private currentAudioEnergy;
1137
641
  private lastNewFrameTime;
1138
642
  private lastKnownLamFrame;
1139
643
  private staleWarningEmitted;
1140
- private static readonly STALE_FRAME_THRESHOLD_MS;
1141
- private emotionBlendFactor;
1142
- private lamBlendFactor;
644
+ private readonly staleThresholdMs;
645
+ private frameLoopCount;
646
+ private profile;
1143
647
  constructor(options: FullFacePipelineOptions);
1144
648
  /**
1145
649
  * Initialize the pipeline
1146
650
  */
1147
651
  initialize(): Promise<void>;
1148
652
  /**
1149
- * Set emotion label from backend (e.g., LLM response emotion).
1150
- *
1151
- * Converts a natural language emotion label into an EmotionFrame
1152
- * that drives upper face blendshapes for the duration of the utterance.
1153
- *
1154
- * Supported labels: happy, excited, joyful, sad, melancholic, angry,
1155
- * frustrated, neutral, etc.
1156
- *
1157
- * @param label - Emotion label string (case-insensitive)
653
+ * Update the ExpressionProfile at runtime (e.g., character switch).
1158
654
  */
1159
- setEmotionLabel(label: string): void;
655
+ setProfile(profile: ExpressionProfile): void;
1160
656
  /**
1161
- * Clear any set emotion label.
1162
- * Falls back to prosody-only upper face animation.
657
+ * Apply ExpressionProfile scaling to raw A2E blendshapes.
658
+ *
659
+ * For each blendshape:
660
+ * 1. If an override exists for the blendshape name, use override as scaler
661
+ * 2. Otherwise, use the group scaler (default 1.0)
662
+ * 3. Clamp result to [0, 1]
1163
663
  */
1164
- clearEmotionLabel(): void;
664
+ applyProfile(raw: Float32Array): Float32Array;
1165
665
  /**
1166
666
  * Start a new playback session
1167
667
  *
@@ -1172,29 +672,18 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1172
672
  /**
1173
673
  * Receive audio chunk from network
1174
674
  *
1175
- * Audio-first design: schedules audio immediately, LAM runs in background.
1176
- * This prevents LAM inference (50-300ms) from blocking audio scheduling.
675
+ * Audio-first design: schedules audio immediately, A2E runs in background.
676
+ * This prevents A2E inference (50-300ms) from blocking audio scheduling.
1177
677
  *
1178
678
  * @param chunk - Uint8Array containing Int16 PCM audio
1179
679
  */
1180
680
  onAudioChunk(chunk: Uint8Array): Promise<void>;
1181
- /**
1182
- * Get emotion frame for current animation.
1183
- *
1184
- * Priority:
1185
- * 1. Explicit emotion label from setEmotionLabel()
1186
- * 2. Prosody fallback: subtle brow movement from audio energy
1187
- */
1188
- private getEmotionFrame;
1189
- /**
1190
- * Merge LAM blendshapes with emotion upper face blendshapes
1191
- */
1192
- mergeBlendshapes(lamFrame: Float32Array, emotionFrame: EmotionFrame | null, audioEnergy?: number): {
1193
- merged: Float32Array;
1194
- emotionBlendshapes: UpperFaceBlendshapes;
1195
- };
1196
681
  /**
1197
682
  * Start frame animation loop
683
+ *
684
+ * Polls A2EProcessor at render rate (60fps) for the latest inference frame
685
+ * matching the current AudioContext time. Between inference batches (~30fps
686
+ * bursts), getFrameForTime() holds the last frame.
1198
687
  */
1199
688
  private startFrameLoop;
1200
689
  /**
@@ -1219,17 +708,11 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1219
708
  getState(): {
1220
709
  playbackStarted: boolean;
1221
710
  coalescerFill: number;
1222
- lamFill: number;
1223
- queuedLAMFrames: number;
1224
- emotionLabel: "neutral" | "happy" | "angry" | "sad" | null;
1225
- currentAudioEnergy: number;
711
+ processorFill: number;
712
+ queuedFrames: number;
1226
713
  currentTime: number;
1227
714
  playbackEndTime: number;
1228
715
  };
1229
- /**
1230
- * Check if an explicit emotion label is currently set
1231
- */
1232
- get hasEmotionLabel(): boolean;
1233
716
  /**
1234
717
  * Cleanup resources
1235
718
  */
@@ -1255,13 +738,6 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1255
738
  * @module inference/onnxLoader
1256
739
  */
1257
740
 
1258
- type OrtModule = {
1259
- InferenceSession: typeof InferenceSession;
1260
- Tensor: typeof Tensor;
1261
- env: Env;
1262
- };
1263
- type SessionOptions = InferenceSession.SessionOptions;
1264
-
1265
741
  /**
1266
742
  * Check if WebGPU is available and likely to work
1267
743
  *
@@ -1271,74 +747,6 @@ type SessionOptions = InferenceSession.SessionOptions;
1271
747
  * @returns true if WebGPU is available and working
1272
748
  */
1273
749
  declare function isWebGPUAvailable(): Promise<boolean>;
1274
- /**
1275
- * Load ONNX Runtime with the specified backend
1276
- *
1277
- * This lazily loads the appropriate bundle:
1278
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
1279
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
1280
- *
1281
- * Once loaded, the same instance is reused for all subsequent calls.
1282
- * If you need to switch backends, you must reload the page.
1283
- *
1284
- * @param backend The backend to load ('webgpu' or 'wasm')
1285
- * @returns The ONNX Runtime module
1286
- */
1287
- declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
1288
- /**
1289
- * Get the appropriate ONNX Runtime based on user preference
1290
- *
1291
- * This resolves the user's preference against platform capabilities
1292
- * and loads the appropriate bundle.
1293
- *
1294
- * @param preference User's backend preference
1295
- * @returns The ONNX Runtime module and the resolved backend
1296
- */
1297
- declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
1298
- ort: OrtModule;
1299
- backend: RuntimeBackend;
1300
- }>;
1301
- /**
1302
- * Get session options for creating an inference session
1303
- *
1304
- * This returns optimized session options based on the backend and platform.
1305
- *
1306
- * @param backend The backend being used
1307
- * @returns Session options for InferenceSession.create()
1308
- */
1309
- declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
1310
- /**
1311
- * Create an inference session with automatic fallback
1312
- *
1313
- * If WebGPU session creation fails, automatically falls back to WASM.
1314
- *
1315
- * @param modelBuffer The model data as ArrayBuffer
1316
- * @param preferredBackend The preferred backend
1317
- * @returns The created session and the backend used
1318
- */
1319
- declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
1320
- session: InferenceSession;
1321
- backend: RuntimeBackend;
1322
- }>;
1323
- /**
1324
- * Get the currently loaded backend (if any)
1325
- */
1326
- declare function getLoadedBackend(): RuntimeBackend | null;
1327
- /**
1328
- * Check if ONNX Runtime has been loaded
1329
- */
1330
- declare function isOnnxRuntimeLoaded(): boolean;
1331
- /**
1332
- * Preload ONNX Runtime and compile the WASM binary early
1333
- *
1334
- * Call this before loading heavy resources (Three.js, VRM models) to ensure
1335
- * WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
1336
- * Uses the singleton pattern — subsequent model loading reuses this instance.
1337
- *
1338
- * @param preference Backend preference (default: 'auto')
1339
- * @returns The resolved backend that was loaded
1340
- */
1341
- declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
1342
750
 
1343
751
  /**
1344
752
  * SenseVoice automatic speech recognition using ONNX Runtime Web
@@ -2094,8 +1502,9 @@ interface Wav2ArkitCpuWorkerConfig {
2094
1502
  *
2095
1503
  * @see Wav2ArkitCpuInference for main-thread version
2096
1504
  */
2097
- declare class Wav2ArkitCpuWorker implements LipSyncBackend {
1505
+ declare class Wav2ArkitCpuWorker implements A2EBackend {
2098
1506
  readonly modelId: "wav2arkit_cpu";
1507
+ readonly chunkSize: number;
2099
1508
  private worker;
2100
1509
  private config;
2101
1510
  private isLoading;
@@ -2124,7 +1533,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2124
1533
  /**
2125
1534
  * Load the ONNX model in the worker
2126
1535
  */
2127
- load(): Promise<LipSyncModelInfo>;
1536
+ load(): Promise<A2EModelInfo>;
2128
1537
  /**
2129
1538
  * Run inference on raw audio
2130
1539
  *
@@ -2134,7 +1543,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2134
1543
  * @param audioSamples - Float32Array of raw audio at 16kHz
2135
1544
  * @param _identityIndex - Ignored (identity 11 is baked into the model)
2136
1545
  */
2137
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
1546
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2138
1547
  /**
2139
1548
  * Queue inference to serialize worker calls
2140
1549
  */
@@ -2166,7 +1575,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2166
1575
  * await worker.init();
2167
1576
  *
2168
1577
  * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
2169
- * const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
1578
+ * const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
2170
1579
  * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
2171
1580
  * ```
2172
1581
  *
@@ -2196,17 +1605,17 @@ declare class UnifiedInferenceWorker {
2196
1605
  }): Promise<SenseVoiceModelInfo>;
2197
1606
  transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
2198
1607
  disposeSenseVoice(): Promise<void>;
2199
- loadLipSync(config: {
1608
+ loadA2E(config: {
2200
1609
  modelUrl: string;
2201
1610
  externalDataUrl: string | null;
2202
- }): Promise<LipSyncModelInfo>;
2203
- inferLipSync(audio: Float32Array): Promise<{
1611
+ }): Promise<A2EModelInfo>;
1612
+ inferA2E(audio: Float32Array): Promise<{
2204
1613
  blendshapes: Float32Array;
2205
1614
  numFrames: number;
2206
1615
  numBlendshapes: number;
2207
1616
  inferenceTimeMs: number;
2208
1617
  }>;
2209
- disposeLipSync(): Promise<void>;
1618
+ disposeA2E(): Promise<void>;
2210
1619
  loadVAD(config: {
2211
1620
  modelUrl: string;
2212
1621
  sampleRate: number;
@@ -2252,10 +1661,11 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
2252
1661
  /**
2253
1662
  * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
2254
1663
  *
2255
- * Implements LipSyncBackend, delegating all inference to the shared worker.
1664
+ * Implements A2EBackend, delegating all inference to the shared worker.
2256
1665
  */
2257
- declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
1666
+ declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
2258
1667
  readonly modelId: "wav2arkit_cpu";
1668
+ readonly chunkSize: number;
2259
1669
  private worker;
2260
1670
  private config;
2261
1671
  private _isLoaded;
@@ -2263,8 +1673,8 @@ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
2263
1673
  constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
2264
1674
  get isLoaded(): boolean;
2265
1675
  get backend(): RuntimeBackend | null;
2266
- load(): Promise<LipSyncModelInfo>;
2267
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
1676
+ load(): Promise<A2EModelInfo>;
1677
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2268
1678
  dispose(): Promise<void>;
2269
1679
  }
2270
1680
  /**
@@ -2392,116 +1802,6 @@ interface CreateSenseVoiceConfig {
2392
1802
  */
2393
1803
  declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
2394
1804
 
2395
- /**
2396
- * Kaldi-compatible filterbank (fbank) feature extraction
2397
- *
2398
- * Pure TypeScript implementation matching kaldi-native-fbank parameters
2399
- * used by SenseVoice. No external dependencies.
2400
- *
2401
- * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
2402
- *
2403
- * @module inference/kaldiFbank
2404
- */
2405
- interface KaldiFbankOptions {
2406
- /** Frame length in ms (default: 25) */
2407
- frameLengthMs?: number;
2408
- /** Frame shift in ms (default: 10) */
2409
- frameShiftMs?: number;
2410
- /** Low frequency cutoff in Hz (default: 20) */
2411
- lowFreq?: number;
2412
- /** High frequency cutoff in Hz (default: sampleRate / 2) */
2413
- highFreq?: number;
2414
- /** Dither amount (default: 0 for deterministic output) */
2415
- dither?: number;
2416
- /** Preemphasis coefficient (default: 0.97) */
2417
- preemphasis?: number;
2418
- }
2419
- /**
2420
- * Compute Kaldi-compatible log mel filterbank features
2421
- *
2422
- * @param audio Raw audio samples (float32, [-1, 1] range)
2423
- * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
2424
- * @param numMelBins Number of mel bins (80 for SenseVoice)
2425
- * @param opts Optional parameters
2426
- * @returns Flattened Float32Array of shape [numFrames, numMelBins]
2427
- */
2428
- declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
2429
- /**
2430
- * Apply Low Frame Rate stacking for SenseVoice
2431
- *
2432
- * Concatenates lfrM consecutive frames with stride lfrN.
2433
- * Left-pads with copies of first frame, right-pads last group.
2434
- *
2435
- * @param features Flattened [numFrames, featureDim]
2436
- * @param featureDim Feature dimension per frame (e.g., 80)
2437
- * @param lfrM Number of frames to stack (default: 7)
2438
- * @param lfrN Stride (default: 6)
2439
- * @returns Flattened [numOutputFrames, featureDim * lfrM]
2440
- */
2441
- declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
2442
- /**
2443
- * Apply CMVN normalization in-place
2444
- *
2445
- * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
2446
- *
2447
- * @param features Flattened feature array (modified in-place)
2448
- * @param dim Feature dimension (560 for SenseVoice after LFR)
2449
- * @param negMean Negative mean vector (dim-dimensional)
2450
- * @param invStddev Inverse standard deviation vector (dim-dimensional)
2451
- * @returns The same features array (for chaining)
2452
- */
2453
- declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
2454
- /**
2455
- * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
2456
- *
2457
- * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
2458
- * as comma-separated float strings in the model's metadata.
2459
- */
2460
- declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
2461
- negMean: Float32Array;
2462
- invStddev: Float32Array;
2463
- };
2464
-
2465
- /**
2466
- * CTC greedy decoder for SenseVoice
2467
- *
2468
- * Decodes CTC logits into text with structured token parsing
2469
- * for language, emotion, and audio event detection.
2470
- *
2471
- * @module inference/ctcDecoder
2472
- */
2473
- interface CTCDecodeResult {
2474
- /** Decoded text (speech content only) */
2475
- text: string;
2476
- /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
2477
- language?: string;
2478
- /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
2479
- emotion?: string;
2480
- /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
2481
- event?: string;
2482
- }
2483
- /** Resolve language string to SenseVoice language ID */
2484
- declare function resolveLanguageId(language: string): number;
2485
- /** Resolve text norm string to SenseVoice text norm ID */
2486
- declare function resolveTextNormId(textNorm: string): number;
2487
- /**
2488
- * Parse tokens.txt into a token ID → string map
2489
- *
2490
- * Format: each line is "token_string token_id"
2491
- * e.g., "<unk> 0", "▁the 3", "s 4"
2492
- */
2493
- declare function parseTokensFile(content: string): Map<number, string>;
2494
- /**
2495
- * CTC greedy decode
2496
- *
2497
- * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
2498
- * @param seqLen Sequence length (time steps)
2499
- * @param vocabSize Vocabulary size
2500
- * @param tokenMap Token ID → string map from tokens.txt
2501
- * @returns Decoded text and structured metadata
2502
- */
2503
- declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
2504
-
2505
1805
  /**
2506
1806
  * Shared blendshape constants and utilities for lip sync inference
2507
1807
  *
@@ -2521,26 +1821,18 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
2521
1821
  /** Alias for backwards compatibility */
2522
1822
  declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2523
1823
  /**
2524
- * Symmetrize blendshapes by averaging left/right pairs
2525
- * From LAM official postprocessing (models/utils.py)
2526
- * This fixes asymmetric output from the raw model
2527
- */
2528
- declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
2529
- /**
2530
- * wav2arkit_cpu model blendshape ordering
1824
+ * Linearly interpolate between two blendshape weight arrays.
2531
1825
  *
2532
- * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
2533
- * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
2534
- * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
2535
- */
2536
- declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
2537
- /**
2538
- * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
1826
+ * Pure math utility with zero renderer dependency — used by all renderer
1827
+ * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
1828
+ * transitions.
2539
1829
  *
2540
- * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
2541
- * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
1830
+ * @param current - Current blendshape weights
1831
+ * @param target - Target blendshape weights
1832
+ * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
1833
+ * @returns Interpolated weights as number[]
2542
1834
  */
2543
- declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
1835
+ declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2544
1836
 
2545
1837
  /**
2546
1838
  * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
@@ -2582,6 +1874,12 @@ interface Wav2Vec2InferenceConfig {
2582
1874
  backend?: InferenceBackend;
2583
1875
  /** Number of identity classes (default: 12 for streaming model) */
2584
1876
  numIdentityClasses?: number;
1877
+ /**
1878
+ * Number of audio samples per inference chunk (default: 16000).
1879
+ * Model supports variable chunk sizes. Smaller chunks = lower latency,
1880
+ * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
1881
+ */
1882
+ chunkSize?: number;
2585
1883
  }
2586
1884
  interface ModelInfo {
2587
1885
  backend: 'webgpu' | 'wasm';
@@ -2608,7 +1906,7 @@ interface Wav2Vec2Result {
2608
1906
  /** Inference time in ms */
2609
1907
  inferenceTimeMs: number;
2610
1908
  }
2611
- declare class Wav2Vec2Inference implements LipSyncBackend {
1909
+ declare class Wav2Vec2Inference implements A2EBackend {
2612
1910
  readonly modelId: "wav2vec2";
2613
1911
  private session;
2614
1912
  private ort;
@@ -2616,6 +1914,7 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
2616
1914
  private _backend;
2617
1915
  private isLoading;
2618
1916
  private numIdentityClasses;
1917
+ readonly chunkSize: number;
2619
1918
  private inferenceQueue;
2620
1919
  private poisoned;
2621
1920
  private static readonly INFERENCE_TIMEOUT_MS;
@@ -2635,11 +1934,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
2635
1934
  load(): Promise<ModelInfo>;
2636
1935
  /**
2637
1936
  * Run inference on raw audio
2638
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
1937
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2639
1938
  * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2640
1939
  *
2641
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
2642
- * Audio will be zero-padded or truncated to 16000 samples.
1940
+ * Audio will be zero-padded or truncated to chunkSize samples.
2643
1941
  */
2644
1942
  infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2645
1943
  /**
@@ -2707,8 +2005,9 @@ interface Wav2ArkitCpuConfig {
2707
2005
  /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2708
2006
  backend?: BackendPreference;
2709
2007
  }
2710
- declare class Wav2ArkitCpuInference implements LipSyncBackend {
2008
+ declare class Wav2ArkitCpuInference implements A2EBackend {
2711
2009
  readonly modelId: "wav2arkit_cpu";
2010
+ readonly chunkSize: number;
2712
2011
  private session;
2713
2012
  private ort;
2714
2013
  private config;
@@ -2723,7 +2022,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2723
2022
  /**
2724
2023
  * Load the ONNX model
2725
2024
  */
2726
- load(): Promise<LipSyncModelInfo>;
2025
+ load(): Promise<A2EModelInfo>;
2727
2026
  /**
2728
2027
  * Run inference on raw audio
2729
2028
  *
@@ -2733,7 +2032,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2733
2032
  * @param audioSamples - Float32Array of raw audio at 16kHz
2734
2033
  * @param _identityIndex - Ignored (identity 11 is baked into the model)
2735
2034
  */
2736
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2035
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2737
2036
  /**
2738
2037
  * Queue inference to serialize ONNX session calls
2739
2038
  */
@@ -2745,7 +2044,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2745
2044
  }
2746
2045
 
2747
2046
  /**
2748
- * Factory function for lip sync with automatic GPU/CPU model selection
2047
+ * Factory function for A2E with automatic GPU/CPU model selection
2749
2048
  *
2750
2049
  * Provides a unified API that automatically selects the optimal model:
2751
2050
  * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
@@ -2766,20 +2065,20 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2766
2065
  *
2767
2066
  * @example Auto-detect (recommended)
2768
2067
  * ```typescript
2769
- * import { createLipSync } from '@omote/core';
2068
+ * import { createA2E } from '@omote/core';
2770
2069
  *
2771
- * const lam = createLipSync({
2070
+ * const a2e = createA2E({
2772
2071
  * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2773
2072
  * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2774
2073
  * });
2775
2074
  *
2776
- * await lam.load();
2777
- * const { blendshapes } = await lam.infer(audioSamples);
2075
+ * await a2e.load();
2076
+ * const { blendshapes } = await a2e.infer(audioSamples);
2778
2077
  * ```
2779
2078
  *
2780
2079
  * @example Force CPU model
2781
2080
  * ```typescript
2782
- * const lam = createLipSync({
2081
+ * const a2e = createA2E({
2783
2082
  * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2784
2083
  * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2785
2084
  * mode: 'cpu',
@@ -2788,9 +2087,9 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2788
2087
  */
2789
2088
 
2790
2089
  /**
2791
- * Configuration for the lip sync factory
2090
+ * Configuration for the A2E factory
2792
2091
  */
2793
- interface CreateLipSyncConfig {
2092
+ interface CreateA2EConfig {
2794
2093
  /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
2795
2094
  gpuModelUrl: string;
2796
2095
  /**
@@ -2804,7 +2103,7 @@ interface CreateLipSyncConfig {
2804
2103
  cpuModelUrl: string;
2805
2104
  /**
2806
2105
  * Model selection mode:
2807
- * - 'auto': Safari/iOS CPU, everything else GPU (default)
2106
+ * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
2808
2107
  * - 'gpu': Force GPU model (Wav2Vec2Inference)
2809
2108
  * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2810
2109
  */
@@ -2838,12 +2137,322 @@ interface CreateLipSyncConfig {
2838
2137
  unifiedWorker?: UnifiedInferenceWorker;
2839
2138
  }
2840
2139
  /**
2841
- * Create a lip sync instance with automatic GPU/CPU model selection
2140
+ * Create an A2E instance with automatic GPU/CPU model selection
2842
2141
  *
2843
2142
  * @param config - Factory configuration
2844
- * @returns A LipSyncBackend instance (either GPU or CPU model)
2143
+ * @returns An A2EBackend instance (either GPU or CPU model)
2144
+ */
2145
+ declare function createA2E(config: CreateA2EConfig): A2EBackend;
2146
+
2147
+ /**
2148
+ * A2EProcessor — Engine-agnostic audio-to-expression processor
2149
+ *
2150
+ * The core inference primitive: audio samples in → blendshape frames out.
2151
+ * No mic capture, no audio playback, no Web Audio API.
2152
+ *
2153
+ * This is what Unity/Unreal/Godot/any engine would use directly.
2154
+ * Web-specific concerns (mic, AudioContext, scheduling) live in the
2155
+ * orchestrator and pipeline layers above.
2156
+ *
2157
+ * Two output modes:
2158
+ * - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
2159
+ * For TTS playback where frames are synced to AudioContext clock.
2160
+ * - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
2161
+ * For live mic / game loop where frames are consumed at ~30fps.
2162
+ *
2163
+ * @category Inference
2164
+ *
2165
+ * @example Pull mode (TTS playback)
2166
+ * ```typescript
2167
+ * const processor = new A2EProcessor({ backend: a2e });
2168
+ * processor.pushAudio(samples, audioContext.currentTime + delay);
2169
+ * const frame = processor.getFrameForTime(audioContext.currentTime);
2170
+ * ```
2171
+ *
2172
+ * @example Push mode (live mic)
2173
+ * ```typescript
2174
+ * const processor = new A2EProcessor({
2175
+ * backend: a2e,
2176
+ * onFrame: (frame) => applyToAvatar(frame),
2177
+ * });
2178
+ * processor.startDrip();
2179
+ * processor.pushAudio(micSamples); // no timestamp → drip mode
2180
+ * ```
2181
+ */
2182
+
2183
+ interface A2EProcessorConfig {
2184
+ /** Inference backend */
2185
+ backend: A2EBackend;
2186
+ /** Sample rate (default: 16000) */
2187
+ sampleRate?: number;
2188
+ /** Samples per inference chunk (default: 16000 = 1s) */
2189
+ chunkSize?: number;
2190
+ /** Callback fired with each blendshape frame (push mode) */
2191
+ onFrame?: (frame: Float32Array) => void;
2192
+ /** Error callback */
2193
+ onError?: (error: Error) => void;
2194
+ }
2195
+ declare class A2EProcessor {
2196
+ private readonly backend;
2197
+ private readonly sampleRate;
2198
+ private readonly chunkSize;
2199
+ private readonly onFrame?;
2200
+ private readonly onError?;
2201
+ private bufferCapacity;
2202
+ private buffer;
2203
+ private writeOffset;
2204
+ private bufferStartTime;
2205
+ private timestampedQueue;
2206
+ private plainQueue;
2207
+ private _latestFrame;
2208
+ private dripInterval;
2209
+ private lastPulledFrame;
2210
+ private inferenceRunning;
2211
+ private pendingChunks;
2212
+ private getFrameCallCount;
2213
+ private disposed;
2214
+ constructor(config: A2EProcessorConfig);
2215
+ /**
2216
+ * Push audio samples for inference (any source: mic, TTS, file).
2217
+ *
2218
+ * - With `timestamp`: frames stored with timestamps (pull mode)
2219
+ * - Without `timestamp`: frames stored in plain queue (drip/push mode)
2220
+ *
2221
+ * Fire-and-forget: returns immediately, inference runs async.
2222
+ */
2223
+ pushAudio(samples: Float32Array, timestamp?: number): void;
2224
+ /**
2225
+ * Flush remaining buffered audio (pads to chunkSize).
2226
+ * Call at end of stream to process final partial chunk.
2227
+ *
2228
+ * Routes through the serialized pendingChunks pipeline to maintain
2229
+ * correct frame ordering. Without this, flush() could push frames
2230
+ * with the latest timestamp to the queue before drainPendingChunks()
2231
+ * finishes pushing frames with earlier timestamps — causing
2232
+ * getFrameForTime() to see out-of-order timestamps and stall.
2233
+ */
2234
+ flush(): Promise<void>;
2235
+ /**
2236
+ * Reset buffer and frame queues
2237
+ */
2238
+ reset(): void;
2239
+ /**
2240
+ * Get frame synced to external clock (e.g. AudioContext.currentTime).
2241
+ *
2242
+ * Discards frames that are too old, returns the current frame,
2243
+ * or holds last frame as fallback to prevent avatar freezing.
2244
+ *
2245
+ * @param currentTime - Current playback time (seconds)
2246
+ * @returns Blendshape frame, or null if no frames yet
2247
+ */
2248
+ getFrameForTime(currentTime: number): Float32Array | null;
2249
+ /** Latest frame from drip-feed (live mic, game loop) */
2250
+ get latestFrame(): Float32Array | null;
2251
+ /** Start 30fps drip-feed timer (push mode) */
2252
+ startDrip(): void;
2253
+ /** Stop drip-feed timer */
2254
+ stopDrip(): void;
2255
+ /** Number of frames waiting in queue (both modes combined) */
2256
+ get queuedFrameCount(): number;
2257
+ /** Buffer fill level as fraction of chunkSize (0-1) */
2258
+ get fillLevel(): number;
2259
+ /** Dispose resources */
2260
+ dispose(): void;
2261
+ /**
2262
+ * Process pending chunks sequentially.
2263
+ * Fire-and-forget — called from pushAudio() without awaiting.
2264
+ */
2265
+ private drainPendingChunks;
2266
+ private handleError;
2267
+ }
2268
+
2269
+ /**
2270
+ * BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
2271
+ *
2272
+ * Eliminates frame gaps between inference batches by smoothly interpolating
2273
+ * blendshape weights using critically damped springs (the game industry standard).
2274
+ *
2275
+ * Each of the 52 blendshape channels has its own spring with position + velocity
2276
+ * state. When a new inference frame arrives, spring targets are updated. Between
2277
+ * frames, springs continue converging toward the last target — no frozen face.
2278
+ *
2279
+ * When inference stalls, `decayToNeutral()` sets all targets to 0, and the
2280
+ * springs smoothly close the mouth / relax the face over the halflife period.
2281
+ *
2282
+ * Math from Daniel Holden's "Spring-It-On" (Epic Games):
2283
+ * https://theorangeduck.com/page/spring-roll-call
2284
+ *
2285
+ * @category Inference
2286
+ *
2287
+ * @example Basic usage
2288
+ * ```typescript
2289
+ * const smoother = new BlendshapeSmoother({ halflife: 0.06 });
2290
+ *
2291
+ * // In frame loop (60fps):
2292
+ * smoother.setTarget(inferenceFrame); // when new frame arrives
2293
+ * const smoothed = smoother.update(1/60); // every render frame
2294
+ * applyToAvatar(smoothed);
2295
+ * ```
2845
2296
  */
2846
- declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
2297
+ interface BlendshapeSmootherConfig {
2298
+ /**
2299
+ * Spring halflife in seconds — time for the distance to the target
2300
+ * to reduce by half. Lower = snappier, higher = smoother.
2301
+ *
2302
+ * - 0.04s (40ms): Very snappy, slight jitter on fast transitions
2303
+ * - 0.06s (60ms): Sweet spot for lip sync (default)
2304
+ * - 0.10s (100ms): Very smooth, slight lag on fast consonants
2305
+ * - 0: Bypass mode — passes through raw target values (no smoothing)
2306
+ *
2307
+ * Default: 0.06
2308
+ */
2309
+ halflife?: number;
2310
+ }
2311
+ declare class BlendshapeSmoother {
2312
+ private readonly halflife;
2313
+ /** Current smoothed blendshape values */
2314
+ private values;
2315
+ /** Per-channel spring velocities */
2316
+ private velocities;
2317
+ /** Current spring targets (from latest inference frame) */
2318
+ private targets;
2319
+ /** Whether any target has been set */
2320
+ private _hasTarget;
2321
+ constructor(config?: BlendshapeSmootherConfig);
2322
+ /** Whether a target frame has been set (false until first setTarget call) */
2323
+ get hasTarget(): boolean;
2324
+ /**
2325
+ * Set new target frame from inference output.
2326
+ * Springs will converge toward these values on subsequent update() calls.
2327
+ */
2328
+ setTarget(frame: Float32Array): void;
2329
+ /**
2330
+ * Advance all 52 springs by `dt` seconds and return the smoothed frame.
2331
+ *
2332
+ * Call this every render frame (e.g., inside requestAnimationFrame).
2333
+ * Returns the internal values buffer — do NOT mutate the returned array.
2334
+ *
2335
+ * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
2336
+ * @returns Smoothed blendshape values (Float32Array of 52)
2337
+ */
2338
+ update(dt: number): Float32Array;
2339
+ /**
2340
+ * Decay all spring targets to neutral (0).
2341
+ *
2342
+ * Call when inference stalls (no new frames for threshold duration).
2343
+ * The springs will smoothly close the mouth / relax the face over
2344
+ * the halflife period rather than freezing.
2345
+ */
2346
+ decayToNeutral(): void;
2347
+ /**
2348
+ * Reset all state (values, velocities, targets).
2349
+ * Call when starting a new playback session.
2350
+ */
2351
+ reset(): void;
2352
+ }
2353
+
2354
+ /**
2355
+ * Renderer-agnostic A2E (audio-to-expression) orchestrator
2356
+ *
2357
+ * Manages the mic capture + A2E inference loop independently of any
2358
+ * 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
2359
+ * thinly and pipe `latestWeights` into their renderer-specific blendshape
2360
+ * controllers.
2361
+ *
2362
+ * Internally delegates all buffer accumulation, inference, and frame
2363
+ * drip-feeding to {@link A2EProcessor}. This class only handles mic capture
2364
+ * (getUserMedia, ScriptProcessorNode, resampling).
2365
+ *
2366
+ * @category Inference
2367
+ */
2368
+
2369
+ /**
2370
+ * Progress event emitted during model download / compile
2371
+ */
2372
+ interface A2EProgressEvent {
2373
+ phase: 'download' | 'compile';
2374
+ progress: number;
2375
+ }
2376
+ /**
2377
+ * Configuration for the A2EOrchestrator
2378
+ */
2379
+ interface A2EOrchestratorConfig {
2380
+ /** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
2381
+ gpuModelUrl: string;
2382
+ /** URL for GPU model external data file */
2383
+ gpuExternalDataUrl?: string | false;
2384
+ /** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
2385
+ cpuModelUrl?: string;
2386
+ /** Sample rate for mic capture (default: 16000) */
2387
+ sampleRate?: number;
2388
+ /** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
2389
+ chunkSize?: number;
2390
+ /** Callback fired with new blendshape weights after each inference */
2391
+ onFrame?: (weights: Float32Array) => void;
2392
+ /** Callback fired during model loading progress */
2393
+ onProgress?: (event: A2EProgressEvent) => void;
2394
+ /** Callback fired on error */
2395
+ onError?: (error: Error) => void;
2396
+ /** Callback fired when model is loaded and ready */
2397
+ onReady?: () => void;
2398
+ /** Additional createA2E config options */
2399
+ a2eConfig?: Partial<CreateA2EConfig>;
2400
+ }
2401
+ /**
2402
+ * Renderer-agnostic A2E orchestrator.
2403
+ *
2404
+ * Manages mic capture + delegates inference to {@link A2EProcessor}.
2405
+ * Adapters read `latestWeights` each frame to apply to their meshes.
2406
+ *
2407
+ * @example Quick start (used by @omote/three and @omote/babylon adapters)
2408
+ * ```typescript
2409
+ * const orchestrator = new A2EOrchestrator({
2410
+ * gpuModelUrl: '/models/wav2vec2.onnx',
2411
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2412
+ * onFrame: (weights) => controller.update(weights),
2413
+ * });
2414
+ * await orchestrator.load();
2415
+ * await orchestrator.start();
2416
+ * ```
2417
+ */
2418
+ declare class A2EOrchestrator {
2419
+ private config;
2420
+ private a2e;
2421
+ private processor;
2422
+ private stream;
2423
+ private audioContext;
2424
+ private scriptProcessor;
2425
+ private nativeSampleRate;
2426
+ private _isReady;
2427
+ private _isStreaming;
2428
+ private _backend;
2429
+ private disposed;
2430
+ constructor(config: A2EOrchestratorConfig);
2431
+ /** Latest blendshape weights from inference (null if none yet) */
2432
+ get latestWeights(): Float32Array | null;
2433
+ /** Whether the model is loaded and ready for inference */
2434
+ get isReady(): boolean;
2435
+ /** Whether mic is active and inference loop is running */
2436
+ get isStreaming(): boolean;
2437
+ /** Current backend type (webgpu, wasm, or null) */
2438
+ get backend(): string | null;
2439
+ /**
2440
+ * Load the A2E model and create the processor
2441
+ */
2442
+ load(): Promise<void>;
2443
+ /**
2444
+ * Start mic capture and inference loop
2445
+ */
2446
+ start(): Promise<void>;
2447
+ /**
2448
+ * Stop mic capture and inference loop
2449
+ */
2450
+ stop(): void;
2451
+ /**
2452
+ * Dispose of all resources
2453
+ */
2454
+ dispose(): Promise<void>;
2455
+ }
2847
2456
 
2848
2457
  /**
2849
2458
  * Safari Web Speech API wrapper for iOS speech recognition
@@ -5190,4 +4799,4 @@ declare class ProceduralLifeLayer {
5190
4799
  private updateBrowNoise;
5191
4800
  }
5192
4801
 
5193
- export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };
4802
+ export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };