@omote/core 0.4.6 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,6 +1,5 @@
1
1
  import { EventEmitter, OmoteEvents, AISessionState, AnimationEvent } from './events/index.js';
2
2
  export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.js';
3
- import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
4
3
  export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.js';
5
4
  export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
6
5
 
@@ -379,7 +378,7 @@ declare function shouldEnableWasmProxy(): boolean;
379
378
  */
380
379
  declare function isSafari(): boolean;
381
380
  /**
382
- * Recommend using CPU-optimized lip sync model (wav2arkit_cpu)
381
+ * Recommend using CPU-optimized A2E model (wav2arkit_cpu)
383
382
  *
384
383
  * All iOS browsers use WebKit and have tight memory limits — the 384MB
385
384
  * LAM model causes silent crashes. wav2arkit_cpu uses URL pass-through
@@ -390,7 +389,7 @@ declare function isSafari(): boolean;
390
389
  *
391
390
  * @returns true if iOS (any browser) or Safari (any platform)
392
391
  */
393
- declare function shouldUseCpuLipSync(): boolean;
392
+ declare function shouldUseCpuA2E(): boolean;
394
393
  /**
395
394
  * Check if Web Speech API is available in the browser
396
395
  *
@@ -415,18 +414,18 @@ declare function shouldUseNativeASR(): boolean;
415
414
  /**
416
415
  * Recommend using server-side LAM over client-side on iOS
417
416
  *
418
- * On iOS, LAM lip sync via WASM takes ~332ms per second of audio (3.3x over target).
417
+ * On iOS, LAM A2E via WASM takes ~332ms per second of audio (3.3x over target).
419
418
  * Server-side inference with GPU can achieve ~50ms, providing:
420
- * - Real-time lip sync (under 100ms target)
419
+ * - Real-time A2E (under 100ms target)
421
420
  * - Reduced iOS device thermal/battery impact
422
421
  * - Better user experience
423
422
  *
424
- * @returns true if on iOS (should use server-side lip sync)
423
+ * @returns true if on iOS (should use server-side A2E)
425
424
  */
426
- declare function shouldUseServerLipSync(): boolean;
425
+ declare function shouldUseServerA2E(): boolean;
427
426
 
428
427
  /**
429
- * Common interface for lip sync inference backends
428
+ * Common interface for audio-to-expression (A2E) inference backends
430
429
  *
431
430
  * Both Wav2Vec2Inference (GPU, 384MB) and Wav2ArkitCpuInference (CPU, 404MB)
432
431
  * implement this interface, allowing SyncedAudioPipeline and LAMPipeline to
@@ -438,19 +437,19 @@ declare function shouldUseServerLipSync(): boolean;
438
437
  /**
439
438
  * Model loading information returned by load()
440
439
  */
441
- interface LipSyncModelInfo {
440
+ interface A2EModelInfo {
442
441
  backend: RuntimeBackend;
443
442
  loadTimeMs: number;
444
443
  inputNames: string[];
445
444
  outputNames: string[];
446
445
  }
447
446
  /**
448
- * Result from lip sync inference
447
+ * Result from A2E inference
449
448
  *
450
449
  * All implementations must return blendshapes in LAM_BLENDSHAPES order (alphabetical).
451
450
  * Models with different native orderings must remap internally before returning.
452
451
  */
453
- interface LipSyncResult {
452
+ interface A2EResult {
454
453
  /** Blendshape weights [frames, 52] in LAM_BLENDSHAPES order - 30fps */
455
454
  blendshapes: Float32Array[];
456
455
  /** Number of blendshape frames */
@@ -459,31 +458,33 @@ interface LipSyncResult {
459
458
  inferenceTimeMs: number;
460
459
  }
461
460
  /**
462
- * Common interface for lip sync inference engines
461
+ * Common interface for A2E (audio-to-expression) inference engines
463
462
  *
464
463
  * Implemented by:
465
- * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + lip sync)
466
- * - Wav2ArkitCpuInference (WASM-only, 404MB, lip sync only)
464
+ * - Wav2Vec2Inference (WebGPU/WASM, 384MB, ASR + A2E)
465
+ * - Wav2ArkitCpuInference (WASM-only, 404MB, A2E only)
467
466
  */
468
- interface LipSyncBackend {
467
+ interface A2EBackend {
469
468
  /** Model identifier for backend-specific tuning (e.g. audio delay) */
470
469
  readonly modelId: 'wav2vec2' | 'wav2arkit_cpu';
471
470
  /** Current backend type (webgpu, wasm, or null if not loaded) */
472
471
  readonly backend: RuntimeBackend | null;
473
472
  /** Whether the model is loaded and ready for inference */
474
473
  readonly isLoaded: boolean;
474
+ /** Optimal number of audio samples per inference call (e.g. 16000 = 1s at 16kHz) */
475
+ readonly chunkSize: number;
475
476
  /**
476
477
  * Load the ONNX model
477
478
  * @returns Model loading information
478
479
  */
479
- load(): Promise<LipSyncModelInfo>;
480
+ load(): Promise<A2EModelInfo>;
480
481
  /**
481
482
  * Run inference on raw audio
482
483
  * @param audioSamples - Float32Array of raw audio at 16kHz
483
484
  * @param identityIndex - Optional identity index (ignored by CPU model)
484
- * @returns Lip sync result with blendshapes in LAM_BLENDSHAPES order
485
+ * @returns A2E result with blendshapes in LAM_BLENDSHAPES order
485
486
  */
486
- infer(audioSamples: Float32Array, identityIndex?: number): Promise<LipSyncResult>;
487
+ infer(audioSamples: Float32Array, identityIndex?: number): Promise<A2EResult>;
487
488
  /**
488
489
  * Dispose of the model and free resources
489
490
  */
@@ -491,542 +492,16 @@ interface LipSyncBackend {
491
492
  }
492
493
 
493
494
  /**
494
- * LAMPipeline - Coordinate LAM (Wav2Vec2) inference with frame synchronization
495
+ * FullFacePipeline - A2E expression pipeline with ExpressionProfile weight scaling
495
496
  *
496
- * Manages the buffering and processing pipeline for LAM lip sync:
497
- * 1. Accumulates audio samples in a ring buffer
498
- * 2. Triggers LAM inference when buffer reaches required size (16000 samples @ 16kHz = 1.0s)
499
- * 3. Queues resulting blendshape frames with precise timestamps
500
- * 4. Provides frames synchronized to AudioContext clock
497
+ * Orchestrates full-face animation by:
498
+ * 1. Scheduling audio for playback immediately (audio-first, never waits for A2E)
499
+ * 2. Running A2E inference in background (fire-and-forget via A2EProcessor)
500
+ * 3. Applying per-character ExpressionProfile scaling to raw A2E output
501
501
  *
502
- * Key Design Decisions:
503
- * - Ring buffer pattern for efficient sample accumulation (no allocation churn)
504
- * - Frame queue with timestamps for deterministic playback
505
- * - Timestamp-based frame retrieval (not callback) for renderer flexibility
506
- *
507
- * Based on patterns from Chrome Audio Worklet design and Web Audio clock management.
508
- *
509
- * @see https://developer.chrome.com/blog/audio-worklet-design-pattern
510
- * @category Audio
511
- */
512
-
513
- interface LAMFrame {
514
- /** 52 ARKit blendshape weights */
515
- frame: Float32Array;
516
- /** AudioContext time when this frame should be displayed */
517
- timestamp: number;
518
- }
519
- interface LAMPipelineOptions {
520
- /**
521
- * Sample rate in Hz (must match audio playback)
522
- * Default: 16000
523
- */
524
- sampleRate?: number;
525
- /**
526
- * LAM inference callback
527
- * Called each time LAM processes a buffer
528
- */
529
- onInference?: (frameCount: number) => void;
530
- /**
531
- * Error callback for inference failures
532
- */
533
- onError?: (error: Error) => void;
534
- }
535
- declare class LAMPipeline {
536
- private readonly options;
537
- private readonly REQUIRED_SAMPLES;
538
- private readonly FRAME_RATE;
539
- private buffer;
540
- private bufferStartTime;
541
- private frameQueue;
542
- /**
543
- * Last successfully retrieved frame
544
- * Used as fallback when no new frame is available to prevent avatar freezing
545
- */
546
- private lastFrame;
547
- constructor(options?: LAMPipelineOptions);
548
- /**
549
- * Push audio samples into the pipeline
550
- *
551
- * Accumulates samples and triggers LAM inference when buffer is full.
552
- * Multiple calls may be needed to accumulate enough samples.
553
- *
554
- * @param samples - Float32Array of audio samples
555
- * @param timestamp - AudioContext time when these samples start playing
556
- * @param lam - LAM inference engine
557
- */
558
- push(samples: Float32Array, timestamp: number, lam: LipSyncBackend): Promise<void>;
559
- /**
560
- * Process accumulated buffer through LAM inference
561
- */
562
- private processBuffer;
563
- /**
564
- * Get the frame that should be displayed at the current time
565
- *
566
- * Automatically removes frames that have already been displayed.
567
- * This prevents memory leaks from accumulating old frames.
568
- *
569
- * Discard Window (prevents premature frame discarding):
570
- * - WebGPU: 0.5s (LAM inference 20-100ms + RAF jitter + React stalls)
571
- * - WASM: 1.0s (LAM inference 50-500ms + higher variability)
572
- *
573
- * Last-Frame-Hold: Returns last valid frame instead of null to prevent
574
- * avatar freezing when between frames (RAF at 60fps vs LAM at 30fps).
575
- *
576
- * @param currentTime - Current AudioContext time
577
- * @param lam - LAM inference engine (optional, for backend detection)
578
- * @returns Current frame, or last frame as fallback, or null if no frames yet
579
- */
580
- getFrameForTime(currentTime: number, lam?: {
581
- backend: 'webgpu' | 'wasm' | null;
582
- }): Float32Array | null;
583
- /**
584
- * Get all frames in the queue (for debugging/monitoring)
585
- */
586
- getQueuedFrames(): LAMFrame[];
587
- /**
588
- * Get current buffer fill level (0-1)
589
- */
590
- get fillLevel(): number;
591
- /**
592
- * Get number of frames queued
593
- */
594
- get queuedFrameCount(): number;
595
- /**
596
- * Get buffered audio duration in seconds
597
- */
598
- get bufferedDuration(): number;
599
- /**
600
- * Flush remaining buffered audio
601
- *
602
- * Processes any remaining audio in the buffer, even if less than REQUIRED_SAMPLES.
603
- * This ensures the final audio chunk generates blendshape frames.
604
- *
605
- * Should be called when audio stream ends to prevent losing the last 0-1 seconds.
606
- *
607
- * @param lam - LAM inference engine
608
- */
609
- flush(lam: LipSyncBackend): Promise<void>;
610
- /**
611
- * Adjust all queued frame timestamps by an offset
612
- *
613
- * Used for synchronization when audio scheduling time differs from
614
- * the estimated time used during LAM processing.
615
- *
616
- * @param offset - Time offset in seconds to add to all timestamps
617
- */
618
- adjustTimestamps(offset: number): void;
619
- /**
620
- * Reset the pipeline
621
- */
622
- reset(): void;
623
- }
624
-
625
- /**
626
- * SyncedAudioPipeline - Audio playback + LAM lip sync coordinator
627
- *
628
- * Orchestrates the complete pipeline for synchronized audio playback and lip sync:
629
- * 1. Network chunks → Coalescer → Optimized buffers
630
- * 2. Audio buffers → Scheduler → Gapless playback (immediate, never blocks)
631
- * 3. Audio buffers → LAM Pipeline → Blendshape frames (background, fire-and-forget)
632
- * 4. Frames synchronized to AudioContext clock → Renderer
633
- *
634
- * Key Architecture Pattern: Audio-First, LAM-Background
635
- * - Audio chunks are scheduled for playback immediately (never waits for LAM)
636
- * - LAM inference runs in background without blocking the audio path
637
- * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
638
- * - Once LAM catches up, frames stay synchronized to AudioContext clock
639
- *
640
- * This decoupled design prevents LAM inference (50-300ms) from blocking audio
641
- * scheduling, which caused audible stuttering when audio arrived as a continuous
642
- * stream (e.g., single-call TTS from ElevenLabs via AgentCore).
643
- *
644
- * @see https://web.dev/articles/audio-scheduling (Web Audio clock patterns)
645
- * @category Audio
646
- */
647
-
648
- interface SyncedAudioPipelineOptions {
649
- /** Sample rate in Hz (default: 16000) */
650
- sampleRate?: number;
651
- /** Target chunk duration in ms for coalescing (default: 200) */
652
- chunkTargetMs?: number;
653
- /** LAM inference engine */
654
- lam: LipSyncBackend;
655
- /**
656
- * Audio playback delay in ms before first audio plays.
657
- * Gives LAM inference time to pre-compute blendshapes.
658
- * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
659
- */
660
- audioDelayMs?: number;
661
- }
662
- interface SyncedAudioPipelineEvents {
663
- /** New frame ready for display */
664
- frame_ready: Float32Array;
665
- /** Playback has completed */
666
- playback_complete: void;
667
- /** First audio chunk scheduled, playback starting */
668
- playback_start: number;
669
- /** Error occurred */
670
- error: Error;
671
- /** Index signature for EventEmitter compatibility */
672
- [key: string]: unknown;
673
- }
674
- declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents> {
675
- private readonly options;
676
- private scheduler;
677
- private coalescer;
678
- private lamPipeline;
679
- private playbackStarted;
680
- private monitorInterval;
681
- private frameAnimationId;
682
- constructor(options: SyncedAudioPipelineOptions);
683
- /**
684
- * Initialize the pipeline
685
- */
686
- initialize(): Promise<void>;
687
- /**
688
- * Start a new playback session
689
- *
690
- * Resets all state and prepares for incoming audio chunks.
691
- * Audio will be scheduled immediately as chunks arrive (no buffering).
692
- */
693
- start(): void;
694
- /**
695
- * Receive audio chunk from network
696
- *
697
- * Audio-first design: schedules audio immediately, LAM runs in background.
698
- * This prevents LAM inference (50-300ms) from blocking audio scheduling,
699
- * which caused audible stuttering with continuous audio streams.
700
- *
701
- * @param chunk - Uint8Array containing Int16 PCM audio
702
- */
703
- onAudioChunk(chunk: Uint8Array): Promise<void>;
704
- /**
705
- * End of audio stream
706
- *
707
- * Flushes any remaining buffered data.
708
- */
709
- end(): Promise<void>;
710
- /**
711
- * Stop playback immediately with smooth fade-out
712
- *
713
- * Gracefully cancels all audio playback and LAM processing:
714
- * - Fades out audio over specified duration (default: 50ms)
715
- * - Cancels pending LAM inferences
716
- * - Clears all buffers and queues
717
- * - Emits 'playback_complete' event
718
- *
719
- * Use this for interruptions (e.g., user barge-in during AI speech).
720
- *
721
- * @param fadeOutMs - Fade-out duration in milliseconds (default: 50ms)
722
- * @returns Promise that resolves when fade-out completes
723
- */
724
- stop(fadeOutMs?: number): Promise<void>;
725
- /**
726
- * Start frame animation loop
727
- *
728
- * Uses requestAnimationFrame to check for new LAM frames.
729
- * Synchronized to AudioContext clock (not visual refresh rate).
730
- *
731
- * Frame Emission Strategy:
732
- * - LAMPipeline uses last-frame-hold to prevent null returns
733
- * - Always emit frames (even repeated frames) to maintain smooth animation
734
- * - Renderer is responsible for detecting duplicate frames if needed
735
- */
736
- private startFrameLoop;
737
- /**
738
- * Start monitoring for playback completion
739
- */
740
- private startMonitoring;
741
- /**
742
- * Stop monitoring
743
- */
744
- private stopMonitoring;
745
- /**
746
- * Get current pipeline state (for debugging/monitoring)
747
- */
748
- getState(): {
749
- playbackStarted: boolean;
750
- coalescerFill: number;
751
- lamFill: number;
752
- queuedFrames: number;
753
- currentTime: number;
754
- playbackEndTime: number;
755
- };
756
- /**
757
- * Cleanup resources
758
- */
759
- dispose(): void;
760
- }
761
-
762
- /**
763
- * Emotion to ARKit Blendshape Mapper
764
- *
765
- * Converts Emotion2VecInference output to upper face ARKit blendshapes for
766
- * expressive avatar animation. Maps 4 emotion categories (neutral, happy, angry, sad)
767
- * to 11 upper face blendshapes (brows, eyes, cheeks).
768
- *
769
- * Supports two blend modes:
770
- * - 'dominant': Uses only the strongest emotion (simpler, more stable)
771
- * - 'weighted': Blends all emotions by probability (more nuanced, e.g., bittersweet)
772
- *
773
- * Also supports energy modulation to scale emotion intensity by audio energy,
774
- * making expressions stronger during emphasized speech.
775
- *
776
- * @example Basic usage
777
- * ```typescript
778
- * import { EmotionToBlendshapeMapper } from '@omote/core';
779
- * import { Emotion2VecInference } from '@omote/core';
780
- *
781
- * const emotion = new Emotion2VecInference({ modelUrl: '/models/emotion.onnx' });
782
- * const mapper = new EmotionToBlendshapeMapper();
783
- *
784
- * // Process emotion frame
785
- * const result = await emotion.infer(audioSamples);
786
- * const blendshapes = mapper.mapFrame(result.dominant);
787
- *
788
- * // Apply to avatar
789
- * for (const [name, value] of Object.entries(blendshapes)) {
790
- * avatar.setBlendshape(name, value);
791
- * }
792
- * ```
793
- *
794
- * @example Weighted blending for nuanced expressions
795
- * ```typescript
796
- * const mapper = new EmotionToBlendshapeMapper({
797
- * blendMode: 'weighted',
798
- * minBlendProbability: 0.1,
799
- * });
800
- *
801
- * // Frame with mixed emotions: { happy: 0.6, sad: 0.3, neutral: 0.1 }
802
- * // Result: bittersweet expression (smiling but worried brow)
803
- * const blendshapes = mapper.mapFrame(emotionFrame);
804
- * ```
805
- *
806
- * @example Energy-modulated emotion
807
- * ```typescript
808
- * import { AudioEnergyAnalyzer } from '@omote/core';
809
- *
810
- * const energyAnalyzer = new AudioEnergyAnalyzer();
811
- * const mapper = new EmotionToBlendshapeMapper({ energyModulation: true });
812
- *
813
- * // In animation loop
814
- * function animate(audioChunk: Float32Array, emotionFrame: EmotionFrame) {
815
- * const { energy } = energyAnalyzer.analyze(audioChunk);
816
- * mapper.mapFrame(emotionFrame, energy); // Louder = stronger emotion
817
- * mapper.update(16);
818
- * applyToAvatar(mapper.getCurrentBlendshapes());
819
- * }
820
- * ```
821
- *
822
- * @module animation
823
- */
824
- declare const EMOTION2VEC_LABELS: readonly ["neutral", "happy", "angry", "sad"];
825
- type Emotion2VecLabel = (typeof EMOTION2VEC_LABELS)[number];
826
- interface EmotionFrame {
827
- /** Primary emotion label */
828
- emotion: Emotion2VecLabel;
829
- /** Confidence for primary emotion (0-1) */
830
- confidence: number;
831
- /** All emotion probabilities */
832
- probabilities: Record<Emotion2VecLabel, number>;
833
- }
834
- /**
835
- * Upper face ARKit blendshape names (11 total)
836
- *
837
- * These blendshapes control the upper face (brows, eyes, cheeks) and are
838
- * driven by emotion detection, complementing the mouth blendshapes from
839
- * LAM lip sync.
840
- */
841
- declare const UPPER_FACE_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "cheekSquintLeft", "cheekSquintRight"];
842
- type UpperFaceBlendshapeName = (typeof UPPER_FACE_BLENDSHAPES)[number];
843
- /**
844
- * Upper face blendshape values (0-1 for each)
845
- */
846
- type UpperFaceBlendshapes = Record<UpperFaceBlendshapeName, number>;
847
- /**
848
- * Blend mode for combining emotions
849
- * - 'dominant': Use only the strongest emotion (default, more stable)
850
- * - 'weighted': Blend all emotions by probability (more nuanced)
851
- */
852
- type EmotionBlendMode = 'dominant' | 'weighted';
853
- /**
854
- * Emotion to ARKit blendshape mapping
855
- *
856
- * Based on Paul Ekman's FACS (Facial Action Coding System) research:
857
- *
858
- * - Happy (AU6+AU12): Cheek raise + lip corner pull (Duchenne smile)
859
- * Upper face: cheekSquint (AU6) + slight eyeSquint from genuine smile
860
- *
861
- * - Angry (AU4+AU5+AU7+AU23): Brow lower + eye wide + lid tighten + lip press
862
- * Upper face: browDown (AU4) + eyeWide (AU5) + eyeSquint (AU7) creates the "glare"
863
- *
864
- * - Sad (AU1+AU4+AU15): Inner brow raise + brow furrow + lip corner depress
865
- * Upper face: browInnerUp (AU1) + browDown (AU4) creates the worried/sad brow
866
- *
867
- * - Neutral: All zeros (no expression overlay)
868
- *
869
- * @see https://imotions.com/blog/learning/research-fundamentals/facial-action-coding-system/
870
- * @see https://melindaozel.com/arkit-to-facs-cheat-sheet/
871
- */
872
- declare const EMOTION_ARKIT_MAP: Record<Emotion2VecLabel, Partial<UpperFaceBlendshapes>>;
873
- /**
874
- * Configuration for EmotionToBlendshapeMapper
875
- */
876
- interface EmotionBlendshapeConfig {
877
- /**
878
- * Smoothing factor for exponential moving average (0-1)
879
- * Lower = slower, smoother transitions
880
- * Higher = faster, more responsive
881
- * @default 0.15
882
- */
883
- smoothingFactor?: number;
884
- /**
885
- * Minimum confidence threshold for emotion to take effect
886
- * Emotions below this confidence are treated as neutral
887
- * @default 0.3
888
- */
889
- confidenceThreshold?: number;
890
- /**
891
- * Global intensity multiplier for all blendshapes (0-2)
892
- * @default 1.0
893
- */
894
- intensity?: number;
895
- /**
896
- * Blend mode for combining emotions
897
- * - 'dominant': Use only the strongest emotion (default)
898
- * - 'weighted': Blend all emotions by probability
899
- * @default 'dominant'
900
- */
901
- blendMode?: EmotionBlendMode;
902
- /**
903
- * Minimum probability for an emotion to contribute in weighted blend mode
904
- * Emotions with probability below this are ignored
905
- * @default 0.1
906
- */
907
- minBlendProbability?: number;
908
- /**
909
- * Enable energy modulation - scale emotion intensity by audio energy
910
- * When enabled, louder speech produces stronger expressions
911
- * @default false
912
- */
913
- energyModulation?: boolean;
914
- /**
915
- * Minimum energy scale when energy modulation is enabled (0-1)
916
- * At zero audio energy, emotion intensity is scaled by this factor
917
- * @default 0.3
918
- */
919
- minEnergyScale?: number;
920
- /**
921
- * Maximum energy scale when energy modulation is enabled (0-2)
922
- * At maximum audio energy, emotion intensity is scaled by this factor
923
- * @default 1.0
924
- */
925
- maxEnergyScale?: number;
926
- }
927
- /**
928
- * EmotionToBlendshapeMapper
929
- *
930
- * Converts emotion detection output to upper face ARKit blendshapes.
931
- * Provides smooth transitions between emotion states using exponential
932
- * moving average interpolation.
933
- *
934
- * Supports two blend modes:
935
- * - 'dominant': Uses only the strongest emotion
936
- * - 'weighted': Blends all emotions by probability for nuanced expressions
937
- *
938
- * Also supports energy modulation to scale emotion intensity by audio energy.
939
- */
940
- declare class EmotionToBlendshapeMapper {
941
- private config;
942
- private targetBlendshapes;
943
- private currentBlendshapes;
944
- private currentEnergy;
945
- /**
946
- * Create a new EmotionToBlendshapeMapper
947
- *
948
- * @param config - Optional configuration
949
- */
950
- constructor(config?: EmotionBlendshapeConfig);
951
- /**
952
- * Map an emotion frame to target blendshapes
953
- *
954
- * This sets the target values that the mapper will smoothly interpolate
955
- * towards. Call update() each frame to apply smoothing.
956
- *
957
- * @param frame - Emotion frame from Emotion2VecInference
958
- * @param audioEnergy - Optional audio energy (0-1) for energy modulation
959
- * @returns Target upper face blendshapes (before smoothing)
960
- */
961
- mapFrame(frame: EmotionFrame, audioEnergy?: number): UpperFaceBlendshapes;
962
- /**
963
- * Map using dominant emotion only (original behavior)
964
- */
965
- private mapFrameDominant;
966
- /**
967
- * Map using weighted blend of all emotions by probability
968
- * Creates more nuanced expressions (e.g., bittersweet = happy + sad)
969
- */
970
- private mapFrameWeighted;
971
- /**
972
- * Apply energy modulation to scale emotion intensity by audio energy
973
- * Louder speech = stronger expressions
974
- */
975
- private applyEnergyModulation;
976
- /**
977
- * Apply smoothing to interpolate current values towards target
978
- *
979
- * Uses exponential moving average:
980
- * current = current + smoothingFactor * (target - current)
981
- *
982
- * @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
983
- */
984
- update(_deltaMs: number): void;
985
- /**
986
- * Get current smoothed blendshape values
987
- *
988
- * @returns Current upper face blendshapes (after smoothing)
989
- */
990
- getCurrentBlendshapes(): UpperFaceBlendshapes;
991
- /**
992
- * Reset mapper to neutral state
993
- *
994
- * Sets both target and current blendshapes to zero.
995
- */
996
- reset(): void;
997
- /**
998
- * Get current configuration
999
- */
1000
- getConfig(): Required<EmotionBlendshapeConfig>;
1001
- /**
1002
- * Update configuration
1003
- *
1004
- * @param config - Partial configuration to update
1005
- */
1006
- setConfig(config: Partial<EmotionBlendshapeConfig>): void;
1007
- }
1008
-
1009
- /**
1010
- * FullFacePipeline - Combined LAM lip sync + Emotion upper face pipeline
1011
- *
1012
- * Orchestrates full-face animation by combining:
1013
- * 1. LAM lip sync (52 ARKit blendshapes) via audio-first scheduling
1014
- * 2. Emotion labels (from backend LLM or `setEmotionLabel()`) for upper face
1015
- * 3. AudioEnergyAnalyzer for prosody-driven fallback when no emotion label is set
1016
- *
1017
- * Architecture: Audio-First, LAM-Background (same as SyncedAudioPipeline)
1018
- * - Audio chunks are scheduled for playback immediately (never waits for LAM)
1019
- * - LAM inference runs in background without blocking the audio path
1020
- * - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
1021
- *
1022
- * Merge Strategy:
1023
- * - Lower face (41 blendshapes): 100% from LAM (mouth, jaw, tongue, etc.)
1024
- * - Upper face (11 blendshapes): Emotion overlay with LAM as subtle fallback
1025
- * Formula: emotion * emotionBlendFactor + lam * lamBlendFactor
1026
- *
1027
- * Emotion Sources (in priority order):
1028
- * 1. `setEmotionLabel()` — explicit label from backend LLM (recommended)
1029
- * 2. Prosody fallback — subtle brow movement from audio energy (automatic)
502
+ * The A2E model outputs all 52 ARKit blendshapes from audio — brows, eyes, cheeks,
503
+ * mouth, jaw, everything. ExpressionProfile allows per-character weight scaling
504
+ * by group (eyes, brows, jaw, mouth, cheeks, nose, tongue) with per-blendshape overrides.
1030
505
  *
1031
506
  * @category Audio
1032
507
  *
@@ -1036,8 +511,7 @@ declare class EmotionToBlendshapeMapper {
1036
511
  *
1037
512
  * const pipeline = new FullFacePipeline({
1038
513
  * lam,
1039
- * emotionBlendFactor: 0.8,
1040
- * lamBlendFactor: 0.2,
514
+ * profile: { mouth: 1.2, brows: 0.8 },
1041
515
  * });
1042
516
  * await pipeline.initialize();
1043
517
  *
@@ -1046,11 +520,41 @@ declare class EmotionToBlendshapeMapper {
1046
520
  * });
1047
521
  *
1048
522
  * pipeline.start();
1049
- * pipeline.setEmotionLabel('happy'); // From backend LLM
1050
523
  * await pipeline.onAudioChunk(audioData);
1051
524
  * ```
1052
525
  */
1053
526
 
527
+ type BlendshapeGroup = 'eyes' | 'brows' | 'jaw' | 'mouth' | 'cheeks' | 'nose' | 'tongue';
528
+ /**
529
+ * Per-character weight scaling for A2E blendshape output.
530
+ *
531
+ * Group scalers multiply all blendshapes in that group (default 1.0).
532
+ * Per-blendshape overrides take priority over group scalers.
533
+ * Final values are clamped to [0, 1].
534
+ */
535
+ interface ExpressionProfile {
536
+ /** eyeBlink*, eyeLook*, eyeSquint*, eyeWide* (14 blendshapes) */
537
+ eyes?: number;
538
+ /** browDown*, browInnerUp, browOuterUp* (5 blendshapes) */
539
+ brows?: number;
540
+ /** jawForward, jawLeft, jawRight, jawOpen (4 blendshapes) */
541
+ jaw?: number;
542
+ /** mouth* (23 blendshapes) */
543
+ mouth?: number;
544
+ /** cheekPuff, cheekSquint* (3 blendshapes) */
545
+ cheeks?: number;
546
+ /** noseSneer* (2 blendshapes) */
547
+ nose?: number;
548
+ /** tongueOut (1 blendshape) */
549
+ tongue?: number;
550
+ /** Per-blendshape overrides (0-2). Takes priority over group scalers. */
551
+ overrides?: Partial<Record<string, number>>;
552
+ }
553
+ /**
554
+ * Map each LAM_BLENDSHAPES entry to its BlendshapeGroup.
555
+ * Built once at module load from prefix matching.
556
+ */
557
+ declare const BLENDSHAPE_TO_GROUP: Map<string, BlendshapeGroup>;
1054
558
  /**
1055
559
  * Configuration for FullFacePipeline
1056
560
  */
@@ -1061,37 +565,54 @@ interface FullFacePipelineOptions {
1061
565
  chunkTargetMs?: number;
1062
566
  /**
1063
567
  * Audio playback delay in ms before first audio plays.
1064
- * Gives LAM inference time to pre-compute blendshapes.
1065
- * Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
568
+ * Gives A2E inference time to pre-compute blendshapes before audio
569
+ * starts, preventing frame drops/desync. Must be chunkSize
570
+ * accumulation time + inference latency.
571
+ *
572
+ * Default: auto-calculated from chunkSize and backend type.
1066
573
  */
1067
574
  audioDelayMs?: number;
1068
- /** LAM inference engine */
1069
- lam: LipSyncBackend;
1070
575
  /**
1071
- * Emotion blend factor for upper face blendshapes (0-1)
1072
- * Higher values give more weight to emotion detection
1073
- * @default 0.8
576
+ * A2E inference chunk size in samples.
577
+ * Controls how many samples accumulate before each inference call.
578
+ * Smaller = lower latency (less delay before first frame), more overhead.
579
+ * Larger = higher latency, less overhead.
580
+ *
581
+ * Default: 16000 (1s) — the model's native window size.
582
+ * Smaller chunks get zero-padded, causing near-zero blendshape output.
1074
583
  */
1075
- emotionBlendFactor?: number;
584
+ chunkSize?: number;
585
+ /** A2E inference engine */
586
+ lam: A2EBackend;
587
+ /** Per-character expression weight scaling */
588
+ profile?: ExpressionProfile;
589
+ /**
590
+ * Spring smoothing halflife in seconds.
591
+ * Controls how quickly blendshapes converge to inference targets.
592
+ * Lower = snappier but more jittery. Higher = smoother but laggy.
593
+ * Set to 0 to disable smoothing (raw frame pass-through).
594
+ *
595
+ * Default: 0.06 (60ms)
596
+ */
597
+ smoothingHalflife?: number;
1076
598
  /**
1077
- * LAM blend factor for upper face blendshapes (0-1)
1078
- * Provides subtle fallback from LAM when emotion is weak
1079
- * @default 0.2
599
+ * Time in ms with no new inference frames before decaying to neutral.
600
+ * When exceeded, spring targets are set to 0 and the face smoothly
601
+ * relaxes rather than freezing on the last frame.
602
+ *
603
+ * Must be larger than the inter-batch gap (chunkSize/sampleRate + inference time).
604
+ * Default: 2000
1080
605
  */
1081
- lamBlendFactor?: number;
606
+ staleThresholdMs?: number;
1082
607
  }
1083
608
  /**
1084
- * Full face frame with merged blendshapes and emotion data
609
+ * Full face frame with scaled blendshapes
1085
610
  */
1086
611
  interface FullFaceFrame {
1087
- /** Merged 52 ARKit blendshapes (lower face from LAM + upper face from emotion) */
612
+ /** Scaled 52 ARKit blendshapes (ExpressionProfile applied) */
1088
613
  blendshapes: Float32Array;
1089
- /** Original LAM blendshapes (52) */
1090
- lamBlendshapes: Float32Array;
1091
- /** Emotion-driven upper face blendshapes (11) */
1092
- emotionBlendshapes: UpperFaceBlendshapes;
1093
- /** Raw emotion frame data */
1094
- emotion: EmotionFrame | null;
614
+ /** Raw A2E output (52 blendshapes, before profile scaling) */
615
+ rawBlendshapes: Float32Array;
1095
616
  /** AudioContext timestamp for this frame */
1096
617
  timestamp: number;
1097
618
  }
@@ -1103,8 +624,6 @@ interface FullFacePipelineEvents {
1103
624
  full_frame_ready: FullFaceFrame;
1104
625
  /** Raw LAM frame ready (for debugging/monitoring) */
1105
626
  lam_frame_ready: Float32Array;
1106
- /** Emotion frame ready (for debugging/monitoring) */
1107
- emotion_frame_ready: EmotionFrame;
1108
627
  /** Playback has completed */
1109
628
  playback_complete: void;
1110
629
  /** First frame ready, playback starting */
@@ -1115,53 +634,47 @@ interface FullFacePipelineEvents {
1115
634
  [key: string]: unknown;
1116
635
  }
1117
636
  /**
1118
- * FullFacePipeline - Unified LAM + Emotion animation pipeline
637
+ * FullFacePipeline - A2E animation pipeline with ExpressionProfile scaling
1119
638
  *
1120
639
  * Audio-first design matching SyncedAudioPipeline:
1121
- * - Audio is scheduled immediately (never waits for LAM)
1122
- * - LAM runs in background (fire-and-forget)
1123
- * - Emotion from setEmotionLabel() or prosody fallback
640
+ * - Audio is scheduled immediately (never waits for A2E)
641
+ * - A2E runs in background (fire-and-forget via A2EProcessor)
642
+ * - ExpressionProfile scales raw A2E output per-character
1124
643
  */
1125
644
  declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1126
645
  private readonly options;
1127
646
  private scheduler;
1128
647
  private coalescer;
1129
- private lamPipeline;
1130
- private emotionMapper;
1131
- private energyAnalyzer;
648
+ private processor;
649
+ private smoother;
1132
650
  private playbackStarted;
1133
651
  private monitorInterval;
1134
652
  private frameAnimationId;
1135
- private lastEmotionFrame;
1136
- private currentAudioEnergy;
1137
653
  private lastNewFrameTime;
1138
654
  private lastKnownLamFrame;
1139
655
  private staleWarningEmitted;
1140
- private static readonly STALE_FRAME_THRESHOLD_MS;
1141
- private emotionBlendFactor;
1142
- private lamBlendFactor;
656
+ private readonly staleThresholdMs;
657
+ private lastFrameLoopTime;
658
+ private frameLoopCount;
659
+ private profile;
1143
660
  constructor(options: FullFacePipelineOptions);
1144
661
  /**
1145
662
  * Initialize the pipeline
1146
663
  */
1147
664
  initialize(): Promise<void>;
1148
665
  /**
1149
- * Set emotion label from backend (e.g., LLM response emotion).
1150
- *
1151
- * Converts a natural language emotion label into an EmotionFrame
1152
- * that drives upper face blendshapes for the duration of the utterance.
1153
- *
1154
- * Supported labels: happy, excited, joyful, sad, melancholic, angry,
1155
- * frustrated, neutral, etc.
1156
- *
1157
- * @param label - Emotion label string (case-insensitive)
666
+ * Update the ExpressionProfile at runtime (e.g., character switch).
1158
667
  */
1159
- setEmotionLabel(label: string): void;
668
+ setProfile(profile: ExpressionProfile): void;
1160
669
  /**
1161
- * Clear any set emotion label.
1162
- * Falls back to prosody-only upper face animation.
670
+ * Apply ExpressionProfile scaling to raw A2E blendshapes.
671
+ *
672
+ * For each blendshape:
673
+ * 1. If an override exists for the blendshape name, use the override as the scaler
674
+ * 2. Otherwise, use the group scaler (default 1.0)
675
+ * 3. Clamp result to [0, 1]
1163
676
  */
1164
- clearEmotionLabel(): void;
677
+ applyProfile(raw: Float32Array): Float32Array;
1165
678
  /**
1166
679
  * Start a new playback session
1167
680
  *
@@ -1172,29 +685,19 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1172
685
  /**
1173
686
  * Receive audio chunk from network
1174
687
  *
1175
- * Audio-first design: schedules audio immediately, LAM runs in background.
1176
- * This prevents LAM inference (50-300ms) from blocking audio scheduling.
688
+ * Audio-first design: schedules audio immediately, A2E runs in background.
689
+ * This prevents A2E inference (50-300ms) from blocking audio scheduling.
1177
690
  *
1178
691
  * @param chunk - Uint8Array containing Int16 PCM audio
1179
692
  */
1180
693
  onAudioChunk(chunk: Uint8Array): Promise<void>;
1181
- /**
1182
- * Get emotion frame for current animation.
1183
- *
1184
- * Priority:
1185
- * 1. Explicit emotion label from setEmotionLabel()
1186
- * 2. Prosody fallback: subtle brow movement from audio energy
1187
- */
1188
- private getEmotionFrame;
1189
- /**
1190
- * Merge LAM blendshapes with emotion upper face blendshapes
1191
- */
1192
- mergeBlendshapes(lamFrame: Float32Array, emotionFrame: EmotionFrame | null, audioEnergy?: number): {
1193
- merged: Float32Array;
1194
- emotionBlendshapes: UpperFaceBlendshapes;
1195
- };
1196
694
  /**
1197
695
  * Start frame animation loop
696
+ *
697
+ * Uses a critically damped spring smoother to produce continuous output
698
+ * at render rate (60fps), even between inference batches (~30fps bursts).
699
+ * Springs interpolate toward the latest inference target, and decay
700
+ * to neutral when inference stalls.
1198
701
  */
1199
702
  private startFrameLoop;
1200
703
  /**
@@ -1219,17 +722,11 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1219
722
  getState(): {
1220
723
  playbackStarted: boolean;
1221
724
  coalescerFill: number;
1222
- lamFill: number;
1223
- queuedLAMFrames: number;
1224
- emotionLabel: "neutral" | "happy" | "angry" | "sad" | null;
1225
- currentAudioEnergy: number;
725
+ processorFill: number;
726
+ queuedFrames: number;
1226
727
  currentTime: number;
1227
728
  playbackEndTime: number;
1228
729
  };
1229
- /**
1230
- * Check if an explicit emotion label is currently set
1231
- */
1232
- get hasEmotionLabel(): boolean;
1233
730
  /**
1234
731
  * Cleanup resources
1235
732
  */
@@ -1255,13 +752,6 @@ declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
1255
752
  * @module inference/onnxLoader
1256
753
  */
1257
754
 
1258
- type OrtModule = {
1259
- InferenceSession: typeof InferenceSession;
1260
- Tensor: typeof Tensor;
1261
- env: Env;
1262
- };
1263
- type SessionOptions = InferenceSession.SessionOptions;
1264
-
1265
755
  /**
1266
756
  * Check if WebGPU is available and likely to work
1267
757
  *
@@ -1271,74 +761,6 @@ type SessionOptions = InferenceSession.SessionOptions;
1271
761
  * @returns true if WebGPU is available and working
1272
762
  */
1273
763
  declare function isWebGPUAvailable(): Promise<boolean>;
1274
- /**
1275
- * Load ONNX Runtime with the specified backend
1276
- *
1277
- * This lazily loads the appropriate bundle:
1278
- * - 'wasm': Loads onnxruntime-web (WASM-only, smaller)
1279
- * - 'webgpu': Loads onnxruntime-web/webgpu (includes WebGPU + WASM fallback)
1280
- *
1281
- * Once loaded, the same instance is reused for all subsequent calls.
1282
- * If you need to switch backends, you must reload the page.
1283
- *
1284
- * @param backend The backend to load ('webgpu' or 'wasm')
1285
- * @returns The ONNX Runtime module
1286
- */
1287
- declare function getOnnxRuntime(backend: RuntimeBackend): Promise<OrtModule>;
1288
- /**
1289
- * Get the appropriate ONNX Runtime based on user preference
1290
- *
1291
- * This resolves the user's preference against platform capabilities
1292
- * and loads the appropriate bundle.
1293
- *
1294
- * @param preference User's backend preference
1295
- * @returns The ONNX Runtime module and the resolved backend
1296
- */
1297
- declare function getOnnxRuntimeForPreference(preference?: BackendPreference): Promise<{
1298
- ort: OrtModule;
1299
- backend: RuntimeBackend;
1300
- }>;
1301
- /**
1302
- * Get session options for creating an inference session
1303
- *
1304
- * This returns optimized session options based on the backend and platform.
1305
- *
1306
- * @param backend The backend being used
1307
- * @returns Session options for InferenceSession.create()
1308
- */
1309
- declare function getSessionOptions(backend: RuntimeBackend): SessionOptions;
1310
- /**
1311
- * Create an inference session with automatic fallback
1312
- *
1313
- * If WebGPU session creation fails, automatically falls back to WASM.
1314
- *
1315
- * @param modelBuffer The model data as ArrayBuffer
1316
- * @param preferredBackend The preferred backend
1317
- * @returns The created session and the backend used
1318
- */
1319
- declare function createSessionWithFallback(modelBuffer: ArrayBuffer, preferredBackend: RuntimeBackend): Promise<{
1320
- session: InferenceSession;
1321
- backend: RuntimeBackend;
1322
- }>;
1323
- /**
1324
- * Get the currently loaded backend (if any)
1325
- */
1326
- declare function getLoadedBackend(): RuntimeBackend | null;
1327
- /**
1328
- * Check if ONNX Runtime has been loaded
1329
- */
1330
- declare function isOnnxRuntimeLoaded(): boolean;
1331
- /**
1332
- * Preload ONNX Runtime and compile the WASM binary early
1333
- *
1334
- * Call this before loading heavy resources (Three.js, VRM models) to ensure
1335
- * WASM memory is allocated in a clean JS heap, reducing iOS memory pressure.
1336
- * Uses the singleton pattern — subsequent model loading reuses this instance.
1337
- *
1338
- * @param preference Backend preference (default: 'auto')
1339
- * @returns The resolved backend that was loaded
1340
- */
1341
- declare function preloadOnnxRuntime(preference?: BackendPreference): Promise<RuntimeBackend>;
1342
764
 
1343
765
  /**
1344
766
  * SenseVoice automatic speech recognition using ONNX Runtime Web
@@ -2094,8 +1516,9 @@ interface Wav2ArkitCpuWorkerConfig {
2094
1516
  *
2095
1517
  * @see Wav2ArkitCpuInference for main-thread version
2096
1518
  */
2097
- declare class Wav2ArkitCpuWorker implements LipSyncBackend {
1519
+ declare class Wav2ArkitCpuWorker implements A2EBackend {
2098
1520
  readonly modelId: "wav2arkit_cpu";
1521
+ readonly chunkSize: number;
2099
1522
  private worker;
2100
1523
  private config;
2101
1524
  private isLoading;
@@ -2124,7 +1547,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2124
1547
  /**
2125
1548
  * Load the ONNX model in the worker
2126
1549
  */
2127
- load(): Promise<LipSyncModelInfo>;
1550
+ load(): Promise<A2EModelInfo>;
2128
1551
  /**
2129
1552
  * Run inference on raw audio
2130
1553
  *
@@ -2134,7 +1557,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2134
1557
  * @param audioSamples - Float32Array of raw audio at 16kHz
2135
1558
  * @param _identityIndex - Ignored (identity 11 is baked into the model)
2136
1559
  */
2137
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
1560
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2138
1561
  /**
2139
1562
  * Queue inference to serialize worker calls
2140
1563
  */
@@ -2166,7 +1589,7 @@ declare class Wav2ArkitCpuWorker implements LipSyncBackend {
2166
1589
  * await worker.init();
2167
1590
  *
2168
1591
  * const asr = createSenseVoice({ modelUrl: '...', unifiedWorker: worker });
2169
- * const lam = createLipSync({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
1592
+ * const lam = createA2E({ gpuModelUrl: '...', cpuModelUrl: '...', unifiedWorker: worker });
2170
1593
  * const vad = createSileroVAD({ modelUrl: '...', unifiedWorker: worker });
2171
1594
  * ```
2172
1595
  *
@@ -2196,17 +1619,17 @@ declare class UnifiedInferenceWorker {
2196
1619
  }): Promise<SenseVoiceModelInfo>;
2197
1620
  transcribe(audio: Float32Array): Promise<SenseVoiceResult>;
2198
1621
  disposeSenseVoice(): Promise<void>;
2199
- loadLipSync(config: {
1622
+ loadA2E(config: {
2200
1623
  modelUrl: string;
2201
1624
  externalDataUrl: string | null;
2202
- }): Promise<LipSyncModelInfo>;
2203
- inferLipSync(audio: Float32Array): Promise<{
1625
+ }): Promise<A2EModelInfo>;
1626
+ inferA2E(audio: Float32Array): Promise<{
2204
1627
  blendshapes: Float32Array;
2205
1628
  numFrames: number;
2206
1629
  numBlendshapes: number;
2207
1630
  inferenceTimeMs: number;
2208
1631
  }>;
2209
- disposeLipSync(): Promise<void>;
1632
+ disposeA2E(): Promise<void>;
2210
1633
  loadVAD(config: {
2211
1634
  modelUrl: string;
2212
1635
  sampleRate: number;
@@ -2252,10 +1675,11 @@ declare class SenseVoiceUnifiedAdapter implements SenseVoiceBackend {
2252
1675
  /**
2253
1676
  * Wav2ArkitCpu adapter backed by UnifiedInferenceWorker
2254
1677
  *
2255
- * Implements LipSyncBackend, delegating all inference to the shared worker.
1678
+ * Implements A2EBackend, delegating all inference to the shared worker.
2256
1679
  */
2257
- declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
1680
+ declare class Wav2ArkitCpuUnifiedAdapter implements A2EBackend {
2258
1681
  readonly modelId: "wav2arkit_cpu";
1682
+ readonly chunkSize: number;
2259
1683
  private worker;
2260
1684
  private config;
2261
1685
  private _isLoaded;
@@ -2263,8 +1687,8 @@ declare class Wav2ArkitCpuUnifiedAdapter implements LipSyncBackend {
2263
1687
  constructor(worker: UnifiedInferenceWorker, config: Wav2ArkitCpuWorkerConfig);
2264
1688
  get isLoaded(): boolean;
2265
1689
  get backend(): RuntimeBackend | null;
2266
- load(): Promise<LipSyncModelInfo>;
2267
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
1690
+ load(): Promise<A2EModelInfo>;
1691
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2268
1692
  dispose(): Promise<void>;
2269
1693
  }
2270
1694
  /**
@@ -2392,116 +1816,6 @@ interface CreateSenseVoiceConfig {
2392
1816
  */
2393
1817
  declare function createSenseVoice(config: CreateSenseVoiceConfig): SenseVoiceBackend;
2394
1818
 
2395
- /**
2396
- * Kaldi-compatible filterbank (fbank) feature extraction
2397
- *
2398
- * Pure TypeScript implementation matching kaldi-native-fbank parameters
2399
- * used by SenseVoice. No external dependencies.
2400
- *
2401
- * Pipeline: audio → framing → windowing → FFT → power spectrum → mel filterbank → log
2402
- *
2403
- * @module inference/kaldiFbank
2404
- */
2405
- interface KaldiFbankOptions {
2406
- /** Frame length in ms (default: 25) */
2407
- frameLengthMs?: number;
2408
- /** Frame shift in ms (default: 10) */
2409
- frameShiftMs?: number;
2410
- /** Low frequency cutoff in Hz (default: 20) */
2411
- lowFreq?: number;
2412
- /** High frequency cutoff in Hz (default: sampleRate / 2) */
2413
- highFreq?: number;
2414
- /** Dither amount (default: 0 for deterministic output) */
2415
- dither?: number;
2416
- /** Preemphasis coefficient (default: 0.97) */
2417
- preemphasis?: number;
2418
- }
2419
- /**
2420
- * Compute Kaldi-compatible log mel filterbank features
2421
- *
2422
- * @param audio Raw audio samples (float32, [-1, 1] range)
2423
- * @param sampleRate Sample rate in Hz (must be 16000 for SenseVoice)
2424
- * @param numMelBins Number of mel bins (80 for SenseVoice)
2425
- * @param opts Optional parameters
2426
- * @returns Flattened Float32Array of shape [numFrames, numMelBins]
2427
- */
2428
- declare function computeKaldiFbank(audio: Float32Array, sampleRate: number, numMelBins: number, opts?: KaldiFbankOptions): Float32Array;
2429
- /**
2430
- * Apply Low Frame Rate stacking for SenseVoice
2431
- *
2432
- * Concatenates lfrM consecutive frames with stride lfrN.
2433
- * Left-pads with copies of first frame, right-pads last group.
2434
- *
2435
- * @param features Flattened [numFrames, featureDim]
2436
- * @param featureDim Feature dimension per frame (e.g., 80)
2437
- * @param lfrM Number of frames to stack (default: 7)
2438
- * @param lfrN Stride (default: 6)
2439
- * @returns Flattened [numOutputFrames, featureDim * lfrM]
2440
- */
2441
- declare function applyLFR(features: Float32Array, featureDim: number, lfrM?: number, lfrN?: number): Float32Array;
2442
- /**
2443
- * Apply CMVN normalization in-place
2444
- *
2445
- * Formula: normalized[i] = (features[i] + negMean[i % dim]) * invStddev[i % dim]
2446
- *
2447
- * @param features Flattened feature array (modified in-place)
2448
- * @param dim Feature dimension (560 for SenseVoice after LFR)
2449
- * @param negMean Negative mean vector (dim-dimensional)
2450
- * @param invStddev Inverse standard deviation vector (dim-dimensional)
2451
- * @returns The same features array (for chaining)
2452
- */
2453
- declare function applyCMVN(features: Float32Array, dim: number, negMean: Float32Array, invStddev: Float32Array): Float32Array;
2454
- /**
2455
- * Parse CMVN vectors from comma-separated strings (stored in ONNX metadata)
2456
- *
2457
- * The sherpa-onnx SenseVoice export stores neg_mean and inv_stddev
2458
- * as comma-separated float strings in the model's metadata.
2459
- */
2460
- declare function parseCMVNFromMetadata(negMeanStr: string, invStddevStr: string): {
2461
- negMean: Float32Array;
2462
- invStddev: Float32Array;
2463
- };
2464
-
2465
- /**
2466
- * CTC greedy decoder for SenseVoice
2467
- *
2468
- * Decodes CTC logits into text with structured token parsing
2469
- * for language, emotion, and audio event detection.
2470
- *
2471
- * @module inference/ctcDecoder
2472
- */
2473
- interface CTCDecodeResult {
2474
- /** Decoded text (speech content only) */
2475
- text: string;
2476
- /** Detected language (e.g., 'zh', 'en', 'ja', 'ko', 'yue') */
2477
- language?: string;
2478
- /** Detected emotion (e.g., 'HAPPY', 'SAD', 'ANGRY', 'NEUTRAL') */
2479
- emotion?: string;
2480
- /** Detected audio event (e.g., 'Speech', 'BGM', 'Laughter') */
2481
- event?: string;
2482
- }
2483
- /** Resolve language string to SenseVoice language ID */
2484
- declare function resolveLanguageId(language: string): number;
2485
- /** Resolve text norm string to SenseVoice text norm ID */
2486
- declare function resolveTextNormId(textNorm: string): number;
2487
- /**
2488
- * Parse tokens.txt into a token ID → string map
2489
- *
2490
- * Format: each line is "token_string token_id"
2491
- * e.g., "<unk> 0", "▁the 3", "s 4"
2492
- */
2493
- declare function parseTokensFile(content: string): Map<number, string>;
2494
- /**
2495
- * CTC greedy decode
2496
- *
2497
- * @param logits Raw logits from model output, flattened [seqLen, vocabSize]
2498
- * @param seqLen Sequence length (time steps)
2499
- * @param vocabSize Vocabulary size
2500
- * @param tokenMap Token ID → string map from tokens.txt
2501
- * @returns Decoded text and structured metadata
2502
- */
2503
- declare function ctcGreedyDecode(logits: Float32Array, seqLen: number, vocabSize: number, tokenMap: Map<number, string>): CTCDecodeResult;
2504
-
2505
1819
  /**
2506
1820
  * Shared blendshape constants and utilities for lip sync inference
2507
1821
  *
@@ -2521,26 +1835,18 @@ declare const LAM_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browI
2521
1835
  /** Alias for backwards compatibility */
2522
1836
  declare const ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "jawRight", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut"];
2523
1837
  /**
2524
- * Symmetrize blendshapes by averaging left/right pairs
2525
- * From LAM official postprocessing (models/utils.py)
2526
- * This fixes asymmetric output from the raw model
2527
- */
2528
- declare function symmetrizeBlendshapes(frame: Float32Array): Float32Array;
2529
- /**
2530
- * wav2arkit_cpu model blendshape ordering
1838
+ * Linearly interpolate between two blendshape weight arrays.
2531
1839
  *
2532
- * Indices 0-24 match LAM_BLENDSHAPES, but 25+ diverge:
2533
- * - LAM puts jawRight, mouthClose, mouthDimpleLeft, mouthDimpleRight at 25-28
2534
- * - wav2arkit_cpu puts mouthFrownLeft at 25 and moves those four to 48-51
2535
- */
2536
- declare const WAV2ARKIT_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "cheekPuff", "cheekSquintLeft", "cheekSquintRight", "eyeBlinkLeft", "eyeBlinkRight", "eyeLookDownLeft", "eyeLookDownRight", "eyeLookInLeft", "eyeLookInRight", "eyeLookOutLeft", "eyeLookOutRight", "eyeLookUpLeft", "eyeLookUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "jawForward", "jawLeft", "jawOpen", "mouthFrownLeft", "mouthFrownRight", "mouthFunnel", "mouthLeft", "mouthLowerDownLeft", "mouthLowerDownRight", "mouthPressLeft", "mouthPressRight", "mouthPucker", "mouthRight", "mouthRollLower", "mouthRollUpper", "mouthShrugLower", "mouthShrugUpper", "mouthSmileLeft", "mouthSmileRight", "mouthStretchLeft", "mouthStretchRight", "mouthUpperUpLeft", "mouthUpperUpRight", "noseSneerLeft", "noseSneerRight", "tongueOut", "mouthClose", "mouthDimpleLeft", "mouthDimpleRight", "jawRight"];
2537
- /**
2538
- * Remap a blendshape frame from wav2arkit_cpu ordering to LAM_BLENDSHAPES ordering
1840
+ * Pure math utility with zero renderer dependency — used by all renderer
1841
+ * adapters (@omote/three, @omote/babylon, @omote/r3f) for smooth frame
1842
+ * transitions.
2539
1843
  *
2540
- * @param frame - Float32Array of 52 blendshape values in wav2arkit_cpu order
2541
- * @returns Float32Array of 52 blendshape values in LAM_BLENDSHAPES order
1844
+ * @param current - Current blendshape weights
1845
+ * @param target - Target blendshape weights
1846
+ * @param factor - Interpolation factor (0 = no change, 1 = snap to target). Default: 0.3
1847
+ * @returns Interpolated weights as number[]
2542
1848
  */
2543
- declare function remapWav2ArkitToLam(frame: Float32Array): Float32Array;
1849
+ declare function lerpBlendshapes(current: Float32Array | number[], target: Float32Array | number[], factor?: number): number[];
2544
1850
 
2545
1851
  /**
2546
1852
  * Unified Wav2Vec2 inference engine for Audio-to-Expression + ASR
@@ -2582,6 +1888,12 @@ interface Wav2Vec2InferenceConfig {
2582
1888
  backend?: InferenceBackend;
2583
1889
  /** Number of identity classes (default: 12 for streaming model) */
2584
1890
  numIdentityClasses?: number;
1891
+ /**
1892
+ * Number of audio samples per inference chunk (default: 16000).
1893
+ * Model supports variable chunk sizes. Smaller chunks = lower latency,
1894
+ * more inference overhead. 8000 (500ms) is recommended for real-time lip sync.
1895
+ */
1896
+ chunkSize?: number;
2585
1897
  }
2586
1898
  interface ModelInfo {
2587
1899
  backend: 'webgpu' | 'wasm';
@@ -2608,7 +1920,7 @@ interface Wav2Vec2Result {
2608
1920
  /** Inference time in ms */
2609
1921
  inferenceTimeMs: number;
2610
1922
  }
2611
- declare class Wav2Vec2Inference implements LipSyncBackend {
1923
+ declare class Wav2Vec2Inference implements A2EBackend {
2612
1924
  readonly modelId: "wav2vec2";
2613
1925
  private session;
2614
1926
  private ort;
@@ -2616,6 +1928,7 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
2616
1928
  private _backend;
2617
1929
  private isLoading;
2618
1930
  private numIdentityClasses;
1931
+ readonly chunkSize: number;
2619
1932
  private inferenceQueue;
2620
1933
  private poisoned;
2621
1934
  private static readonly INFERENCE_TIMEOUT_MS;
@@ -2635,11 +1948,10 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
2635
1948
  load(): Promise<ModelInfo>;
2636
1949
  /**
2637
1950
  * Run inference on raw audio
2638
- * @param audioSamples - Float32Array of raw audio at 16kHz (16000 samples = 1 second)
1951
+ * @param audioSamples - Float32Array of raw audio at 16kHz
2639
1952
  * @param identityIndex - Optional identity index (0-11, default 0 = neutral)
2640
1953
  *
2641
- * Note: Model expects 1-second chunks (16000 samples) for optimal performance.
2642
- * Audio will be zero-padded or truncated to 16000 samples.
1954
+ * Audio will be zero-padded or truncated to chunkSize samples.
2643
1955
  */
2644
1956
  infer(audioSamples: Float32Array, identityIndex?: number): Promise<Wav2Vec2Result>;
2645
1957
  /**
@@ -2707,8 +2019,9 @@ interface Wav2ArkitCpuConfig {
2707
2019
  /** Preferred backend (default: 'wasm' — this model is CPU-optimized) */
2708
2020
  backend?: BackendPreference;
2709
2021
  }
2710
- declare class Wav2ArkitCpuInference implements LipSyncBackend {
2022
+ declare class Wav2ArkitCpuInference implements A2EBackend {
2711
2023
  readonly modelId: "wav2arkit_cpu";
2024
+ readonly chunkSize: number;
2712
2025
  private session;
2713
2026
  private ort;
2714
2027
  private config;
@@ -2723,7 +2036,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2723
2036
  /**
2724
2037
  * Load the ONNX model
2725
2038
  */
2726
- load(): Promise<LipSyncModelInfo>;
2039
+ load(): Promise<A2EModelInfo>;
2727
2040
  /**
2728
2041
  * Run inference on raw audio
2729
2042
  *
@@ -2733,7 +2046,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2733
2046
  * @param audioSamples - Float32Array of raw audio at 16kHz
2734
2047
  * @param _identityIndex - Ignored (identity 11 is baked into the model)
2735
2048
  */
2736
- infer(audioSamples: Float32Array, _identityIndex?: number): Promise<LipSyncResult>;
2049
+ infer(audioSamples: Float32Array, _identityIndex?: number): Promise<A2EResult>;
2737
2050
  /**
2738
2051
  * Queue inference to serialize ONNX session calls
2739
2052
  */
@@ -2745,7 +2058,7 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2745
2058
  }
2746
2059
 
2747
2060
  /**
2748
- * Factory function for lip sync with automatic GPU/CPU model selection
2061
+ * Factory function for A2E with automatic GPU/CPU model selection
2749
2062
  *
2750
2063
  * Provides a unified API that automatically selects the optimal model:
2751
2064
  * - Safari (macOS + iOS): Uses Wav2ArkitCpuInference (404MB, WASM)
@@ -2766,20 +2079,20 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2766
2079
  *
2767
2080
  * @example Auto-detect (recommended)
2768
2081
  * ```typescript
2769
- * import { createLipSync } from '@omote/core';
2082
+ * import { createA2E } from '@omote/core';
2770
2083
  *
2771
- * const lam = createLipSync({
2084
+ * const a2e = createA2E({
2772
2085
  * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2773
2086
  * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2774
2087
  * });
2775
2088
  *
2776
- * await lam.load();
2777
- * const { blendshapes } = await lam.infer(audioSamples);
2089
+ * await a2e.load();
2090
+ * const { blendshapes } = await a2e.infer(audioSamples);
2778
2091
  * ```
2779
2092
  *
2780
2093
  * @example Force CPU model
2781
2094
  * ```typescript
2782
- * const lam = createLipSync({
2095
+ * const a2e = createA2E({
2783
2096
  * gpuModelUrl: '/models/unified_wav2vec2_asr_a2e.onnx',
2784
2097
  * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2785
2098
  * mode: 'cpu',
@@ -2788,9 +2101,9 @@ declare class Wav2ArkitCpuInference implements LipSyncBackend {
2788
2101
  */
2789
2102
 
2790
2103
  /**
2791
- * Configuration for the lip sync factory
2104
+ * Configuration for the A2E factory
2792
2105
  */
2793
- interface CreateLipSyncConfig {
2106
+ interface CreateA2EConfig {
2794
2107
  /** URL for the GPU model (Wav2Vec2, used on Chrome/Firefox/Edge) */
2795
2108
  gpuModelUrl: string;
2796
2109
  /**
@@ -2804,7 +2117,7 @@ interface CreateLipSyncConfig {
2804
2117
  cpuModelUrl: string;
2805
2118
  /**
2806
2119
  * Model selection mode:
2807
- * - 'auto': Safari/iOS CPU, everything else GPU (default)
2120
+ * - 'auto': Safari/iOS -> CPU, everything else -> GPU (default)
2808
2121
  * - 'gpu': Force GPU model (Wav2Vec2Inference)
2809
2122
  * - 'cpu': Force CPU model (Wav2ArkitCpuInference)
2810
2123
  */
@@ -2838,12 +2151,322 @@ interface CreateLipSyncConfig {
2838
2151
  unifiedWorker?: UnifiedInferenceWorker;
2839
2152
  }
2840
2153
  /**
2841
- * Create a lip sync instance with automatic GPU/CPU model selection
2154
+ * Create an A2E instance with automatic GPU/CPU model selection
2842
2155
  *
2843
2156
  * @param config - Factory configuration
2844
- * @returns A LipSyncBackend instance (either GPU or CPU model)
2157
+ * @returns An A2EBackend instance (either GPU or CPU model)
2158
+ */
2159
+ declare function createA2E(config: CreateA2EConfig): A2EBackend;
2160
+
2161
+ /**
2162
+ * A2EProcessor — Engine-agnostic audio-to-expression processor
2163
+ *
2164
+ * The core inference primitive: audio samples in → blendshape frames out.
2165
+ * No mic capture, no audio playback, no Web Audio API.
2166
+ *
2167
+ * This is what Unity/Unreal/Godot/any engine would use directly.
2168
+ * Web-specific concerns (mic, AudioContext, scheduling) live in the
2169
+ * orchestrator and pipeline layers above.
2170
+ *
2171
+ * Two output modes:
2172
+ * - **Pull mode**: `pushAudio(samples, timestamp)` + `getFrameForTime(t)`
2173
+ * For TTS playback where frames are synced to AudioContext clock.
2174
+ * - **Push mode**: `pushAudio(samples)` + `startDrip()` + `latestFrame`
2175
+ * For live mic / game loop where frames are consumed at ~30fps.
2176
+ *
2177
+ * @category Inference
2178
+ *
2179
+ * @example Pull mode (TTS playback)
2180
+ * ```typescript
2181
+ * const processor = new A2EProcessor({ backend: a2e });
2182
+ * processor.pushAudio(samples, audioContext.currentTime + delay);
2183
+ * const frame = processor.getFrameForTime(audioContext.currentTime);
2184
+ * ```
2185
+ *
2186
+ * @example Push mode (live mic)
2187
+ * ```typescript
2188
+ * const processor = new A2EProcessor({
2189
+ * backend: a2e,
2190
+ * onFrame: (frame) => applyToAvatar(frame),
2191
+ * });
2192
+ * processor.startDrip();
2193
+ * processor.pushAudio(micSamples); // no timestamp → drip mode
2194
+ * ```
2195
+ */
2196
+
2197
+ interface A2EProcessorConfig {
2198
+ /** Inference backend */
2199
+ backend: A2EBackend;
2200
+ /** Sample rate (default: 16000) */
2201
+ sampleRate?: number;
2202
+ /** Samples per inference chunk (default: 16000 = 1s) */
2203
+ chunkSize?: number;
2204
+ /** Callback fired with each blendshape frame (push mode) */
2205
+ onFrame?: (frame: Float32Array) => void;
2206
+ /** Error callback */
2207
+ onError?: (error: Error) => void;
2208
+ }
2209
+ declare class A2EProcessor {
2210
+ private readonly backend;
2211
+ private readonly sampleRate;
2212
+ private readonly chunkSize;
2213
+ private readonly onFrame?;
2214
+ private readonly onError?;
2215
+ private bufferCapacity;
2216
+ private buffer;
2217
+ private writeOffset;
2218
+ private bufferStartTime;
2219
+ private timestampedQueue;
2220
+ private plainQueue;
2221
+ private _latestFrame;
2222
+ private dripInterval;
2223
+ private lastPulledFrame;
2224
+ private inferenceRunning;
2225
+ private pendingChunks;
2226
+ private getFrameCallCount;
2227
+ private disposed;
2228
+ constructor(config: A2EProcessorConfig);
2229
+ /**
2230
+ * Push audio samples for inference (any source: mic, TTS, file).
2231
+ *
2232
+ * - With `timestamp`: frames stored with timestamps (pull mode)
2233
+ * - Without `timestamp`: frames stored in plain queue (drip/push mode)
2234
+ *
2235
+ * Fire-and-forget: returns immediately, inference runs async.
2236
+ */
2237
+ pushAudio(samples: Float32Array, timestamp?: number): void;
2238
+ /**
2239
+ * Flush remaining buffered audio (pads to chunkSize).
2240
+ * Call at end of stream to process final partial chunk.
2241
+ *
2242
+ * Routes through the serialized pendingChunks pipeline to maintain
2243
+ * correct frame ordering. Without this, flush() could push frames
2244
+ * with the latest timestamp to the queue before drainPendingChunks()
2245
+ * finishes pushing frames with earlier timestamps — causing
2246
+ * getFrameForTime() to see out-of-order timestamps and stall.
2247
+ */
2248
+ flush(): Promise<void>;
2249
+ /**
2250
+ * Reset buffer and frame queues
2251
+ */
2252
+ reset(): void;
2253
+ /**
2254
+ * Get frame synced to external clock (e.g. AudioContext.currentTime).
2255
+ *
2256
+ * Discards frames that are too old, returns the current frame,
2257
+ * or holds last frame as fallback to prevent avatar freezing.
2258
+ *
2259
+ * @param currentTime - Current playback time (seconds)
2260
+ * @returns Blendshape frame, or null if no frames yet
2261
+ */
2262
+ getFrameForTime(currentTime: number): Float32Array | null;
2263
+ /** Latest frame from drip-feed (live mic, game loop) */
2264
+ get latestFrame(): Float32Array | null;
2265
+ /** Start 30fps drip-feed timer (push mode) */
2266
+ startDrip(): void;
2267
+ /** Stop drip-feed timer */
2268
+ stopDrip(): void;
2269
+ /** Number of frames waiting in queue (both modes combined) */
2270
+ get queuedFrameCount(): number;
2271
+ /** Buffer fill level as fraction of chunkSize (0-1) */
2272
+ get fillLevel(): number;
2273
+ /** Dispose resources */
2274
+ dispose(): void;
2275
+ /**
2276
+ * Process pending chunks sequentially.
2277
+ * Fire-and-forget — called from pushAudio() without awaiting.
2278
+ */
2279
+ private drainPendingChunks;
2280
+ private handleError;
2281
+ }
2282
+
2283
+ /**
2284
+ * BlendshapeSmoother — Per-channel critically damped spring for 52 ARKit blendshapes
2285
+ *
2286
+ * Eliminates frame gaps between inference batches by smoothly interpolating
2287
+ * blendshape weights using critically damped springs (the game industry standard).
2288
+ *
2289
+ * Each of the 52 blendshape channels has its own spring with position + velocity
2290
+ * state. When a new inference frame arrives, spring targets are updated. Between
2291
+ * frames, springs continue converging toward the last target — no frozen face.
2292
+ *
2293
+ * When inference stalls, `decayToNeutral()` sets all targets to 0, and the
2294
+ * springs smoothly close the mouth / relax the face over the halflife period.
2295
+ *
2296
+ * Math from Daniel Holden's "Spring-It-On" (Epic Games):
2297
+ * https://theorangeduck.com/page/spring-roll-call
2298
+ *
2299
+ * @category Inference
2300
+ *
2301
+ * @example Basic usage
2302
+ * ```typescript
2303
+ * const smoother = new BlendshapeSmoother({ halflife: 0.06 });
2304
+ *
2305
+ * // In frame loop (60fps):
2306
+ * smoother.setTarget(inferenceFrame); // when new frame arrives
2307
+ * const smoothed = smoother.update(1/60); // every render frame
2308
+ * applyToAvatar(smoothed);
2309
+ * ```
2310
+ */
2311
+ interface BlendshapeSmootherConfig {
2312
+ /**
2313
+ * Spring halflife in seconds — time for the distance to the target
2314
+ * to reduce by half. Lower = snappier, higher = smoother.
2315
+ *
2316
+ * - 0.04s (40ms): Very snappy, slight jitter on fast transitions
2317
+ * - 0.06s (60ms): Sweet spot for lip sync (default)
2318
+ * - 0.10s (100ms): Very smooth, slight lag on fast consonants
2319
+ * - 0: Bypass mode — passes through raw target values (no smoothing)
2320
+ *
2321
+ * Default: 0.06
2322
+ */
2323
+ halflife?: number;
2324
+ }
2325
+ declare class BlendshapeSmoother {
2326
+ private readonly halflife;
2327
+ /** Current smoothed blendshape values */
2328
+ private values;
2329
+ /** Per-channel spring velocities */
2330
+ private velocities;
2331
+ /** Current spring targets (from latest inference frame) */
2332
+ private targets;
2333
+ /** Whether any target has been set */
2334
+ private _hasTarget;
2335
+ constructor(config?: BlendshapeSmootherConfig);
2336
+ /** Whether a target frame has been set (false until first setTarget call) */
2337
+ get hasTarget(): boolean;
2338
+ /**
2339
+ * Set new target frame from inference output.
2340
+ * Springs will converge toward these values on subsequent update() calls.
2341
+ */
2342
+ setTarget(frame: Float32Array): void;
2343
+ /**
2344
+ * Advance all 52 springs by `dt` seconds and return the smoothed frame.
2345
+ *
2346
+ * Call this every render frame (e.g., inside requestAnimationFrame).
2347
+ * Returns the internal values buffer — do NOT mutate the returned array.
2348
+ *
2349
+ * @param dt - Time step in seconds (e.g., 1/60 for 60fps)
2350
+ * @returns Smoothed blendshape values (Float32Array of 52)
2351
+ */
2352
+ update(dt: number): Float32Array;
2353
+ /**
2354
+ * Decay all spring targets to neutral (0).
2355
+ *
2356
+ * Call when inference stalls (no new frames for threshold duration).
2357
+ * The springs will smoothly close the mouth / relax the face over
2358
+ * the halflife period rather than freezing.
2359
+ */
2360
+ decayToNeutral(): void;
2361
+ /**
2362
+ * Reset all state (values, velocities, targets).
2363
+ * Call when starting a new playback session.
2364
+ */
2365
+ reset(): void;
2366
+ }
2367
+
2368
+ /**
2369
+ * Renderer-agnostic A2E (audio-to-expression) orchestrator
2370
+ *
2371
+ * Manages the mic capture + A2E inference loop independently of any
2372
+ * 3D renderer. Adapter packages (@omote/three, @omote/babylon) wrap this
2373
+ * thinly and pipe `latestWeights` into their renderer-specific blendshape
2374
+ * controllers.
2375
+ *
2376
+ * Internally delegates all buffer accumulation, inference, and frame
2377
+ * drip-feeding to {@link A2EProcessor}. This class only handles mic capture
2378
+ * (getUserMedia, ScriptProcessorNode, resampling).
2379
+ *
2380
+ * @category Inference
2381
+ */
2382
+
2383
+ /**
2384
+ * Progress event emitted during model download / compile
2385
+ */
2386
+ interface A2EProgressEvent {
2387
+ phase: 'download' | 'compile';
2388
+ progress: number;
2389
+ }
2390
+ /**
2391
+ * Configuration for the A2EOrchestrator
2392
+ */
2393
+ interface A2EOrchestratorConfig {
2394
+ /** URL for the GPU model (Wav2Vec2, Chrome/Firefox/Edge) */
2395
+ gpuModelUrl: string;
2396
+ /** URL for GPU model external data file */
2397
+ gpuExternalDataUrl?: string | false;
2398
+ /** URL for the CPU model (wav2arkit_cpu, Safari/iOS) */
2399
+ cpuModelUrl?: string;
2400
+ /** Sample rate for mic capture (default: 16000) */
2401
+ sampleRate?: number;
2402
+ /** Chunk size in samples for mic capture (default: 16000 = 1s at 16kHz) */
2403
+ chunkSize?: number;
2404
+ /** Callback fired with new blendshape weights after each inference */
2405
+ onFrame?: (weights: Float32Array) => void;
2406
+ /** Callback fired during model loading progress */
2407
+ onProgress?: (event: A2EProgressEvent) => void;
2408
+ /** Callback fired on error */
2409
+ onError?: (error: Error) => void;
2410
+ /** Callback fired when model is loaded and ready */
2411
+ onReady?: () => void;
2412
+ /** Additional createA2E config options */
2413
+ a2eConfig?: Partial<CreateA2EConfig>;
2414
+ }
2415
+ /**
2416
+ * Renderer-agnostic A2E orchestrator.
2417
+ *
2418
+ * Manages mic capture + delegates inference to {@link A2EProcessor}.
2419
+ * Adapters read `latestWeights` each frame to apply to their meshes.
2420
+ *
2421
+ * @example Quick start (used by @omote/three and @omote/babylon adapters)
2422
+ * ```typescript
2423
+ * const orchestrator = new A2EOrchestrator({
2424
+ * gpuModelUrl: '/models/wav2vec2.onnx',
2425
+ * cpuModelUrl: '/models/wav2arkit_cpu.onnx',
2426
+ * onFrame: (weights) => controller.update(weights),
2427
+ * });
2428
+ * await orchestrator.load();
2429
+ * await orchestrator.start();
2430
+ * ```
2845
2431
  */
2846
- declare function createLipSync(config: CreateLipSyncConfig): LipSyncBackend;
2432
+ declare class A2EOrchestrator {
2433
+ private config;
2434
+ private a2e;
2435
+ private processor;
2436
+ private stream;
2437
+ private audioContext;
2438
+ private scriptProcessor;
2439
+ private nativeSampleRate;
2440
+ private _isReady;
2441
+ private _isStreaming;
2442
+ private _backend;
2443
+ private disposed;
2444
+ constructor(config: A2EOrchestratorConfig);
2445
+ /** Latest blendshape weights from inference (null if none yet) */
2446
+ get latestWeights(): Float32Array | null;
2447
+ /** Whether the model is loaded and ready for inference */
2448
+ get isReady(): boolean;
2449
+ /** Whether mic is active and inference loop is running */
2450
+ get isStreaming(): boolean;
2451
+ /** Current backend type (webgpu, wasm, or null) */
2452
+ get backend(): string | null;
2453
+ /**
2454
+ * Load the A2E model and create the processor
2455
+ */
2456
+ load(): Promise<void>;
2457
+ /**
2458
+ * Start mic capture and inference loop
2459
+ */
2460
+ start(): Promise<void>;
2461
+ /**
2462
+ * Stop mic capture and inference loop
2463
+ */
2464
+ stop(): void;
2465
+ /**
2466
+ * Dispose of all resources
2467
+ */
2468
+ dispose(): Promise<void>;
2469
+ }
2847
2470
 
2848
2471
  /**
2849
2472
  * Safari Web Speech API wrapper for iOS speech recognition
@@ -3992,11 +3615,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
3992
3615
  * @param audioEnergy - Optional RMS energy for logging (default: 0)
3993
3616
  */
3994
3617
  processVADResult(vadProbability: number, audioEnergy?: number): void;
3995
- /**
3996
- * @deprecated Use processVADResult() instead. This method uses naive RMS detection.
3997
- * Process audio samples for VAD (legacy - uses simple RMS)
3998
- */
3999
- processAudio(samples: Float32Array | Int16Array): void;
4000
3618
  /**
4001
3619
  * Notify that AI started speaking
4002
3620
  */
@@ -4020,7 +3638,6 @@ declare class InterruptionHandler extends EventEmitter<InterruptionEvents> {
4020
3638
  isSpeaking: boolean;
4021
3639
  speechDurationMs: number;
4022
3640
  };
4023
- private calculateRMS;
4024
3641
  private onSpeechDetected;
4025
3642
  private onSilenceDetected;
4026
3643
  }
@@ -5196,4 +4813,4 @@ declare class ProceduralLifeLayer {
5196
4813
  private updateBrowNoise;
5197
4814
  }
5198
4815
 
5199
- export { type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type 
RuntimeBackend, type SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, UnifiedInferenceWorker, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSenseVoice, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, 
lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, supportsVADWorker, symmetrizeBlendshapes };
4816
+ export { type A2EBackend, type A2EModelInfo, A2EOrchestrator, type A2EOrchestratorConfig, A2EProcessor, type A2EProcessorConfig, type A2EProgressEvent, type A2EResult, type AIAdapter, type AIAdapterEvents, AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, BLENDSHAPE_TO_GROUP, type BackendPreference, type BlendWeight, type BlendshapeGroup, BlendshapeSmoother, type BlendshapeSmootherConfig, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateA2EConfig, type CreateSenseVoiceConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type EmotionAnimationMap, EmotionController, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, type EmotionWeights, EmphasisDetector, EventEmitter, type ExpressionProfile, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceBackend, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, SenseVoiceUnifiedAdapter, SenseVoiceWorker, type SenseVoiceWorkerConfig, type SessionConfig, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADUnifiedAdapter, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UnifiedInferenceWorker, type VADBackend, type VADModelInfo, type VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2ArkitCpuUnifiedAdapter, Wav2ArkitCpuWorker, type Wav2ArkitCpuWorkerConfig, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, blendEmotions, calculatePeak, calculateRMS, configureCacheLimit, configureTelemetry, createA2E, createEmotionVector, createSenseVoice, createSileroVAD, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getModelCache, getOptimalWasmThreads, getRecommendedBackend, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpBlendshapes, lerpEmotion, preloadModels, resolveBackend, shouldEnableWasmProxy, shouldUseCpuA2E, shouldUseNativeASR, shouldUseServerA2E, supportsVADWorker };