@omote/core 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +636 -1
- package/dist/index.d.ts +636 -1
- package/dist/index.js +1429 -203
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1429 -203
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -1
package/dist/index.d.mts
CHANGED
|
@@ -2,6 +2,7 @@ import { EventEmitter, OmoteEvents, AnimationEvent } from './events/index.mjs';
|
|
|
2
2
|
export { BackendEvent, EmotionEvent, GazeEvent, STTFinalEvent, STTPartialEvent, SessionStateEvent, TTSEndEvent, TTSMarkEvent, TTSStartEvent, VisemeEvent } from './events/index.mjs';
|
|
3
3
|
import { InferenceSession, Tensor, Env } from 'onnxruntime-common';
|
|
4
4
|
export { D as DEFAULT_LOGGING_CONFIG, I as ILogger, e as LOG_LEVEL_PRIORITY, b as LogEntry, L as LogFormatter, a as LogLevel, c as LogSink, d as LoggingConfig, f as configureLogging, i as createLogger, g as getLoggingConfig, n as noopLogger, r as resetLoggingConfig, s as setLogLevel, h as setLoggingEnabled } from './Logger-I_k4sGhM.mjs';
|
|
5
|
+
export { ARKitToFLAMEMapping, ApiError, AudioChunkEvent, AvatarFormat, Character, CharacterAvatar, CharacterMemory, CharacterPersonality, CharacterSpec, CharacterVoice, CreateCharacterRequest, CreateCharacterResponse, CreateLAMJobRequest, CreateLAMJobResponse, CreateSessionRequest, CreateSessionResponse, GSplatConfig, LAMJob, LAMJobStatus, PROTOCOL_VERSION, PaginatedResponse, PlatformSession, ErrorEvent as ProtocolErrorEvent, ProtocolEvent, ResponseChunkEvent, ResponseEndEvent, ResponseStartEvent, SessionMessage, SessionStatus, isProtocolEvent } from '@omote/types';
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* Microphone capture - renderer-agnostic audio input
|
|
@@ -758,6 +759,483 @@ declare class SyncedAudioPipeline extends EventEmitter<SyncedAudioPipelineEvents
|
|
|
758
759
|
dispose(): void;
|
|
759
760
|
}
|
|
760
761
|
|
|
762
|
+
/**
|
|
763
|
+
* Emotion to ARKit Blendshape Mapper
|
|
764
|
+
*
|
|
765
|
+
* Converts Emotion2VecInference output to upper face ARKit blendshapes for
|
|
766
|
+
* expressive avatar animation. Maps 4 emotion categories (neutral, happy, angry, sad)
|
|
767
|
+
* to 11 upper face blendshapes (brows, eyes, cheeks).
|
|
768
|
+
*
|
|
769
|
+
* Supports two blend modes:
|
|
770
|
+
* - 'dominant': Uses only the strongest emotion (simpler, more stable)
|
|
771
|
+
* - 'weighted': Blends all emotions by probability (more nuanced, e.g., bittersweet)
|
|
772
|
+
*
|
|
773
|
+
* Also supports energy modulation to scale emotion intensity by audio energy,
|
|
774
|
+
* making expressions stronger during emphasized speech.
|
|
775
|
+
*
|
|
776
|
+
* @example Basic usage
|
|
777
|
+
* ```typescript
|
|
778
|
+
* import { EmotionToBlendshapeMapper } from '@omote/core';
|
|
779
|
+
* import { Emotion2VecInference } from '@omote/core'; // NOTE(review): not in this version's root export list — confirm it is still importable
|
|
780
|
+
*
|
|
781
|
+
* const emotion = new Emotion2VecInference({ modelUrl: '/models/emotion.onnx' });
|
|
782
|
+
* const mapper = new EmotionToBlendshapeMapper();
|
|
783
|
+
*
|
|
784
|
+
* // Process emotion frame
|
|
785
|
+
* const result = await emotion.infer(audioSamples);
|
|
786
|
+
* const blendshapes = mapper.mapFrame(result.dominant);
|
|
787
|
+
*
|
|
788
|
+
* // Apply to avatar
|
|
789
|
+
* for (const [name, value] of Object.entries(blendshapes)) {
|
|
790
|
+
* avatar.setBlendshape(name, value);
|
|
791
|
+
* }
|
|
792
|
+
* ```
|
|
793
|
+
*
|
|
794
|
+
* @example Weighted blending for nuanced expressions
|
|
795
|
+
* ```typescript
|
|
796
|
+
* const mapper = new EmotionToBlendshapeMapper({
|
|
797
|
+
* blendMode: 'weighted',
|
|
798
|
+
* minBlendProbability: 0.1,
|
|
799
|
+
* });
|
|
800
|
+
*
|
|
801
|
+
* // Frame with mixed emotions: { happy: 0.6, sad: 0.3, neutral: 0.1 }
|
|
802
|
+
* // Result: bittersweet expression (smiling but worried brow)
|
|
803
|
+
* const blendshapes = mapper.mapFrame(emotionFrame);
|
|
804
|
+
* ```
|
|
805
|
+
*
|
|
806
|
+
* @example Energy-modulated emotion
|
|
807
|
+
* ```typescript
|
|
808
|
+
* import { AudioEnergyAnalyzer } from '@omote/core';
|
|
809
|
+
*
|
|
810
|
+
* const energyAnalyzer = new AudioEnergyAnalyzer();
|
|
811
|
+
* const mapper = new EmotionToBlendshapeMapper({ energyModulation: true });
|
|
812
|
+
*
|
|
813
|
+
* // In animation loop
|
|
814
|
+
* function animate(audioChunk: Float32Array, emotionFrame: EmotionFrame) {
|
|
815
|
+
* const { energy } = energyAnalyzer.analyze(audioChunk);
|
|
816
|
+
* mapper.mapFrame(emotionFrame, energy); // Louder = stronger emotion
|
|
817
|
+
* mapper.update(16);
|
|
818
|
+
* applyToAvatar(mapper.getCurrentBlendshapes());
|
|
819
|
+
* }
|
|
820
|
+
* ```
|
|
821
|
+
*
|
|
822
|
+
* @module animation
|
|
823
|
+
*/
|
|
824
|
+
declare const EMOTION2VEC_LABELS: readonly ["neutral", "happy", "angry", "sad"];
|
|
825
|
+
type Emotion2VecLabel = (typeof EMOTION2VEC_LABELS)[number];
|
|
826
|
+
interface EmotionFrame {
|
|
827
|
+
/** Primary emotion label */
|
|
828
|
+
emotion: Emotion2VecLabel;
|
|
829
|
+
/** Confidence for primary emotion (0-1) */
|
|
830
|
+
confidence: number;
|
|
831
|
+
/** All emotion probabilities */
|
|
832
|
+
probabilities: Record<Emotion2VecLabel, number>;
|
|
833
|
+
}
|
|
834
|
+
/**
|
|
835
|
+
* Upper face ARKit blendshape names (11 total)
|
|
836
|
+
*
|
|
837
|
+
* These blendshapes control the upper face (brows, eyes, cheeks) and are
|
|
838
|
+
* driven by emotion detection, complementing the mouth blendshapes from
|
|
839
|
+
* LAM lip sync.
|
|
840
|
+
*/
|
|
841
|
+
declare const UPPER_FACE_BLENDSHAPES: readonly ["browDownLeft", "browDownRight", "browInnerUp", "browOuterUpLeft", "browOuterUpRight", "eyeSquintLeft", "eyeSquintRight", "eyeWideLeft", "eyeWideRight", "cheekSquintLeft", "cheekSquintRight"];
|
|
842
|
+
type UpperFaceBlendshapeName = (typeof UPPER_FACE_BLENDSHAPES)[number];
|
|
843
|
+
/**
|
|
844
|
+
* Upper face blendshape values (0-1 for each)
|
|
845
|
+
*/
|
|
846
|
+
type UpperFaceBlendshapes = Record<UpperFaceBlendshapeName, number>;
|
|
847
|
+
/**
|
|
848
|
+
* Blend mode for combining emotions
|
|
849
|
+
* - 'dominant': Use only the strongest emotion (default, more stable)
|
|
850
|
+
* - 'weighted': Blend all emotions by probability (more nuanced)
|
|
851
|
+
*/
|
|
852
|
+
type EmotionBlendMode = 'dominant' | 'weighted';
|
|
853
|
+
/**
|
|
854
|
+
* Emotion to ARKit blendshape mapping
|
|
855
|
+
*
|
|
856
|
+
* Based on Ekman and Friesen's FACS (Facial Action Coding System) research:
|
|
857
|
+
*
|
|
858
|
+
* - Happy (AU6+AU12): Cheek raise + lip corner pull (Duchenne smile)
|
|
859
|
+
* Upper face: cheekSquint (AU6) + slight eyeSquint from genuine smile
|
|
860
|
+
*
|
|
861
|
+
* - Angry (AU4+AU5+AU7+AU23): Brow lower + eye wide + lid tighten + lip press
|
|
862
|
+
* Upper face: browDown (AU4) + eyeWide (AU5) + eyeSquint (AU7) creates the "glare"
|
|
863
|
+
*
|
|
864
|
+
* - Sad (AU1+AU4+AU15): Inner brow raise + brow furrow + lip corner depress
|
|
865
|
+
* Upper face: browInnerUp (AU1) + browDown (AU4) creates the worried/sad brow
|
|
866
|
+
*
|
|
867
|
+
* - Neutral: All zeros (no expression overlay)
|
|
868
|
+
*
|
|
869
|
+
* @see https://imotions.com/blog/learning/research-fundamentals/facial-action-coding-system/
|
|
870
|
+
* @see https://melindaozel.com/arkit-to-facs-cheat-sheet/
|
|
871
|
+
*/
|
|
872
|
+
declare const EMOTION_ARKIT_MAP: Record<Emotion2VecLabel, Partial<UpperFaceBlendshapes>>;
|
|
873
|
+
/**
|
|
874
|
+
* Configuration for EmotionToBlendshapeMapper
|
|
875
|
+
*/
|
|
876
|
+
interface EmotionBlendshapeConfig {
|
|
877
|
+
/**
|
|
878
|
+
* Smoothing factor for exponential moving average (0-1)
|
|
879
|
+
* Lower = slower, smoother transitions
|
|
880
|
+
* Higher = faster, more responsive
|
|
881
|
+
* @default 0.15
|
|
882
|
+
*/
|
|
883
|
+
smoothingFactor?: number;
|
|
884
|
+
/**
|
|
885
|
+
* Minimum confidence threshold for emotion to take effect
|
|
886
|
+
* Emotions below this confidence are treated as neutral
|
|
887
|
+
* @default 0.3
|
|
888
|
+
*/
|
|
889
|
+
confidenceThreshold?: number;
|
|
890
|
+
/**
|
|
891
|
+
* Global intensity multiplier for all blendshapes (0-2)
|
|
892
|
+
* @default 1.0
|
|
893
|
+
*/
|
|
894
|
+
intensity?: number;
|
|
895
|
+
/**
|
|
896
|
+
* Blend mode for combining emotions
|
|
897
|
+
* - 'dominant': Use only the strongest emotion (default)
|
|
898
|
+
* - 'weighted': Blend all emotions by probability
|
|
899
|
+
* @default 'dominant'
|
|
900
|
+
*/
|
|
901
|
+
blendMode?: EmotionBlendMode;
|
|
902
|
+
/**
|
|
903
|
+
* Minimum probability for an emotion to contribute in weighted blend mode
|
|
904
|
+
* Emotions with probability below this are ignored
|
|
905
|
+
* @default 0.1
|
|
906
|
+
*/
|
|
907
|
+
minBlendProbability?: number;
|
|
908
|
+
/**
|
|
909
|
+
* Enable energy modulation - scale emotion intensity by audio energy
|
|
910
|
+
* When enabled, louder speech produces stronger expressions
|
|
911
|
+
* @default false
|
|
912
|
+
*/
|
|
913
|
+
energyModulation?: boolean;
|
|
914
|
+
/**
|
|
915
|
+
* Minimum energy scale when energy modulation is enabled (0-1)
|
|
916
|
+
* At zero audio energy, emotion intensity is scaled by this factor
|
|
917
|
+
* @default 0.3
|
|
918
|
+
*/
|
|
919
|
+
minEnergyScale?: number;
|
|
920
|
+
/**
|
|
921
|
+
* Maximum energy scale when energy modulation is enabled (0-2)
|
|
922
|
+
* At maximum audio energy, emotion intensity is scaled by this factor
|
|
923
|
+
* @default 1.0
|
|
924
|
+
*/
|
|
925
|
+
maxEnergyScale?: number;
|
|
926
|
+
}
|
|
927
|
+
/**
|
|
928
|
+
* EmotionToBlendshapeMapper
|
|
929
|
+
*
|
|
930
|
+
* Converts emotion detection output to upper face ARKit blendshapes.
|
|
931
|
+
* Provides smooth transitions between emotion states using exponential
|
|
932
|
+
* moving average interpolation.
|
|
933
|
+
*
|
|
934
|
+
* Supports two blend modes:
|
|
935
|
+
* - 'dominant': Uses only the strongest emotion
|
|
936
|
+
* - 'weighted': Blends all emotions by probability for nuanced expressions
|
|
937
|
+
*
|
|
938
|
+
* Also supports energy modulation to scale emotion intensity by audio energy.
|
|
939
|
+
*/
|
|
940
|
+
declare class EmotionToBlendshapeMapper {
|
|
941
|
+
private config;
|
|
942
|
+
private targetBlendshapes;
|
|
943
|
+
private currentBlendshapes;
|
|
944
|
+
private currentEnergy;
|
|
945
|
+
/**
|
|
946
|
+
* Create a new EmotionToBlendshapeMapper
|
|
947
|
+
*
|
|
948
|
+
* @param config - Optional configuration
|
|
949
|
+
*/
|
|
950
|
+
constructor(config?: EmotionBlendshapeConfig);
|
|
951
|
+
/**
|
|
952
|
+
* Map an emotion frame to target blendshapes
|
|
953
|
+
*
|
|
954
|
+
* This sets the target values that the mapper will smoothly interpolate
|
|
955
|
+
* towards. Call update() each frame to apply smoothing.
|
|
956
|
+
*
|
|
957
|
+
* @param frame - Emotion frame (primary label, confidence, per-emotion probabilities), e.g. from Emotion2VecInference
|
|
958
|
+
* @param audioEnergy - Optional audio energy (0-1) for energy modulation
|
|
959
|
+
* @returns Target upper face blendshapes (before smoothing)
|
|
960
|
+
*/
|
|
961
|
+
mapFrame(frame: EmotionFrame, audioEnergy?: number): UpperFaceBlendshapes;
|
|
962
|
+
/**
|
|
963
|
+
* Map using dominant emotion only (original behavior)
|
|
964
|
+
*/
|
|
965
|
+
private mapFrameDominant;
|
|
966
|
+
/**
|
|
967
|
+
* Map using weighted blend of all emotions by probability
|
|
968
|
+
* Creates more nuanced expressions (e.g., bittersweet = happy + sad)
|
|
969
|
+
*/
|
|
970
|
+
private mapFrameWeighted;
|
|
971
|
+
/**
|
|
972
|
+
* Apply energy modulation to scale emotion intensity by audio energy
|
|
973
|
+
* Louder speech = stronger expressions
|
|
974
|
+
*/
|
|
975
|
+
private applyEnergyModulation;
|
|
976
|
+
/**
|
|
977
|
+
* Apply smoothing to interpolate current values towards target
|
|
978
|
+
*
|
|
979
|
+
* Uses exponential moving average:
|
|
980
|
+
* current = current + smoothingFactor * (target - current)
|
|
981
|
+
*
|
|
982
|
+
* @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
|
|
983
|
+
*/
|
|
984
|
+
update(_deltaMs: number): void;
|
|
985
|
+
/**
|
|
986
|
+
* Get current smoothed blendshape values
|
|
987
|
+
*
|
|
988
|
+
* @returns Current upper face blendshapes (after smoothing)
|
|
989
|
+
*/
|
|
990
|
+
getCurrentBlendshapes(): UpperFaceBlendshapes;
|
|
991
|
+
/**
|
|
992
|
+
* Reset mapper to neutral state
|
|
993
|
+
*
|
|
994
|
+
* Sets both target and current blendshapes to zero.
|
|
995
|
+
*/
|
|
996
|
+
reset(): void;
|
|
997
|
+
/**
|
|
998
|
+
* Get current configuration
|
|
999
|
+
*/
|
|
1000
|
+
getConfig(): Required<EmotionBlendshapeConfig>;
|
|
1001
|
+
/**
|
|
1002
|
+
* Update configuration
|
|
1003
|
+
*
|
|
1004
|
+
* @param config - Partial configuration to update
|
|
1005
|
+
*/
|
|
1006
|
+
setConfig(config: Partial<EmotionBlendshapeConfig>): void;
|
|
1007
|
+
}
|
|
1008
|
+
|
|
1009
|
+
/**
|
|
1010
|
+
* FullFacePipeline - Combined LAM lip sync + Emotion upper face pipeline
|
|
1011
|
+
*
|
|
1012
|
+
* Orchestrates full-face animation by combining:
|
|
1013
|
+
* 1. LAM lip sync (52 ARKit blendshapes) via audio-first scheduling
|
|
1014
|
+
* 2. Emotion labels (from backend LLM or `setEmotionLabel()`) for upper face
|
|
1015
|
+
* 3. AudioEnergyAnalyzer for prosody-driven fallback when no emotion label is set
|
|
1016
|
+
*
|
|
1017
|
+
* Architecture: Audio-First, LAM-Background (same as SyncedAudioPipeline)
|
|
1018
|
+
* - Audio chunks are scheduled for playback immediately (never waits for LAM)
|
|
1019
|
+
* - LAM inference runs in background without blocking the audio path
|
|
1020
|
+
* - Lip sync starts ~1 second after audio (LAM needs 16000 samples to infer)
|
|
1021
|
+
*
|
|
1022
|
+
* Merge Strategy:
|
|
1023
|
+
* - Lower face (41 blendshapes): 100% from LAM (mouth, jaw, tongue, etc.)
|
|
1024
|
+
* - Upper face (11 blendshapes): Emotion overlay with LAM as subtle fallback
|
|
1025
|
+
* Formula: emotion * emotionBlendFactor + lam * lamBlendFactor
|
|
1026
|
+
*
|
|
1027
|
+
* Emotion Sources (in priority order):
|
|
1028
|
+
* 1. `setEmotionLabel()` — explicit label from backend LLM (recommended)
|
|
1029
|
+
* 2. Prosody fallback — subtle brow movement from audio energy (automatic)
|
|
1030
|
+
*
|
|
1031
|
+
* @category Audio
|
|
1032
|
+
*
|
|
1033
|
+
* @example Basic usage
|
|
1034
|
+
* ```typescript
|
|
1035
|
+
* import { FullFacePipeline } from '@omote/core';
|
|
1036
|
+
*
|
|
1037
|
+
* const pipeline = new FullFacePipeline({
|
|
1038
|
+
* lam,
|
|
1039
|
+
* emotionBlendFactor: 0.8,
|
|
1040
|
+
* lamBlendFactor: 0.2,
|
|
1041
|
+
* });
|
|
1042
|
+
* await pipeline.initialize();
|
|
1043
|
+
*
|
|
1044
|
+
* pipeline.on('full_frame_ready', (frame) => {
|
|
1045
|
+
* applyToAvatar(frame.blendshapes);
|
|
1046
|
+
* });
|
|
1047
|
+
*
|
|
1048
|
+
* pipeline.start();
|
|
1049
|
+
* pipeline.setEmotionLabel('happy'); // From backend LLM
|
|
1050
|
+
* await pipeline.onAudioChunk(audioData);
|
|
1051
|
+
* ```
|
|
1052
|
+
*/
|
|
1053
|
+
|
|
1054
|
+
/**
|
|
1055
|
+
* Configuration for FullFacePipeline
|
|
1056
|
+
*/
|
|
1057
|
+
interface FullFacePipelineOptions {
|
|
1058
|
+
/** Sample rate in Hz (default: 16000) */
|
|
1059
|
+
sampleRate?: number;
|
|
1060
|
+
/** Target chunk duration in ms for coalescing (default: 200) */
|
|
1061
|
+
chunkTargetMs?: number;
|
|
1062
|
+
/**
|
|
1063
|
+
* Audio playback delay in ms before first audio plays.
|
|
1064
|
+
* Gives LAM inference time to pre-compute blendshapes.
|
|
1065
|
+
* Default: auto-detected from lam.backend (50ms WebGPU, 350ms WASM).
|
|
1066
|
+
*/
|
|
1067
|
+
audioDelayMs?: number;
|
|
1068
|
+
/** LAM inference engine */
|
|
1069
|
+
lam: LipSyncBackend;
|
|
1070
|
+
/**
|
|
1071
|
+
* Emotion blend factor for upper face blendshapes (0-1)
|
|
1072
|
+
* Higher values give more weight to the emotion-driven upper face values in the merge
|
|
1073
|
+
* @default 0.8
|
|
1074
|
+
*/
|
|
1075
|
+
emotionBlendFactor?: number;
|
|
1076
|
+
/**
|
|
1077
|
+
* LAM blend factor for upper face blendshapes (0-1)
|
|
1078
|
+
* Provides subtle fallback from LAM when emotion is weak
|
|
1079
|
+
* @default 0.2
|
|
1080
|
+
*/
|
|
1081
|
+
lamBlendFactor?: number;
|
|
1082
|
+
}
|
|
1083
|
+
/**
|
|
1084
|
+
* Full face frame with merged blendshapes and emotion data
|
|
1085
|
+
*/
|
|
1086
|
+
interface FullFaceFrame {
|
|
1087
|
+
/** Merged 52 ARKit blendshapes (lower face from LAM + upper face from emotion) */
|
|
1088
|
+
blendshapes: Float32Array;
|
|
1089
|
+
/** Original LAM blendshapes (52) */
|
|
1090
|
+
lamBlendshapes: Float32Array;
|
|
1091
|
+
/** Emotion-driven upper face blendshapes (11) */
|
|
1092
|
+
emotionBlendshapes: UpperFaceBlendshapes;
|
|
1093
|
+
/** Raw emotion frame data */
|
|
1094
|
+
emotion: EmotionFrame | null;
|
|
1095
|
+
/** AudioContext timestamp for this frame */
|
|
1096
|
+
timestamp: number;
|
|
1097
|
+
}
|
|
1098
|
+
/**
|
|
1099
|
+
* Events emitted by FullFacePipeline
|
|
1100
|
+
*/
|
|
1101
|
+
interface FullFacePipelineEvents {
|
|
1102
|
+
/** New merged frame ready for display */
|
|
1103
|
+
full_frame_ready: FullFaceFrame;
|
|
1104
|
+
/** Raw LAM frame ready (for debugging/monitoring) */
|
|
1105
|
+
lam_frame_ready: Float32Array;
|
|
1106
|
+
/** Emotion frame ready (for debugging/monitoring) */
|
|
1107
|
+
emotion_frame_ready: EmotionFrame;
|
|
1108
|
+
/** Playback has completed */
|
|
1109
|
+
playback_complete: void;
|
|
1110
|
+
/** First frame ready, playback starting */
|
|
1111
|
+
playback_start: number;
|
|
1112
|
+
/** Error occurred */
|
|
1113
|
+
error: Error;
|
|
1114
|
+
/** Index signature for EventEmitter compatibility */
|
|
1115
|
+
[key: string]: unknown;
|
|
1116
|
+
}
|
|
1117
|
+
/**
|
|
1118
|
+
* FullFacePipeline - Unified LAM + Emotion animation pipeline
|
|
1119
|
+
*
|
|
1120
|
+
* Audio-first design matching SyncedAudioPipeline:
|
|
1121
|
+
* - Audio is scheduled immediately (never waits for LAM)
|
|
1122
|
+
* - LAM runs in background (fire-and-forget)
|
|
1123
|
+
* - Emotion from setEmotionLabel() or prosody fallback
|
|
1124
|
+
*/
|
|
1125
|
+
declare class FullFacePipeline extends EventEmitter<FullFacePipelineEvents> {
|
|
1126
|
+
private readonly options;
|
|
1127
|
+
private scheduler;
|
|
1128
|
+
private coalescer;
|
|
1129
|
+
private lamPipeline;
|
|
1130
|
+
private emotionMapper;
|
|
1131
|
+
private energyAnalyzer;
|
|
1132
|
+
private playbackStarted;
|
|
1133
|
+
private monitorInterval;
|
|
1134
|
+
private frameAnimationId;
|
|
1135
|
+
private lastEmotionFrame;
|
|
1136
|
+
private currentAudioEnergy;
|
|
1137
|
+
private lastNewFrameTime;
|
|
1138
|
+
private lastKnownLamFrame;
|
|
1139
|
+
private staleWarningEmitted;
|
|
1140
|
+
private static readonly STALE_FRAME_THRESHOLD_MS;
|
|
1141
|
+
private emotionBlendFactor;
|
|
1142
|
+
private lamBlendFactor;
|
|
1143
|
+
constructor(options: FullFacePipelineOptions);
|
|
1144
|
+
/**
|
|
1145
|
+
* Initialize the pipeline
|
|
1146
|
+
*/
|
|
1147
|
+
initialize(): Promise<void>;
|
|
1148
|
+
/**
|
|
1149
|
+
* Set emotion label from backend (e.g., LLM response emotion).
|
|
1150
|
+
*
|
|
1151
|
+
* Converts a natural language emotion label into an EmotionFrame
|
|
1152
|
+
* that drives upper face blendshapes for the duration of the utterance.
|
|
1153
|
+
*
|
|
1154
|
+
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
1155
|
+
* frustrated, neutral, etc.
|
|
1156
|
+
*
|
|
1157
|
+
* @param label - Emotion label string (case-insensitive)
|
|
1158
|
+
*/
|
|
1159
|
+
setEmotionLabel(label: string): void;
|
|
1160
|
+
/**
|
|
1161
|
+
* Clear any set emotion label.
|
|
1162
|
+
* Falls back to prosody-only upper face animation.
|
|
1163
|
+
*/
|
|
1164
|
+
clearEmotionLabel(): void;
|
|
1165
|
+
/**
|
|
1166
|
+
* Start a new playback session
|
|
1167
|
+
*
|
|
1168
|
+
* Resets all state and prepares for incoming audio chunks.
|
|
1169
|
+
* Audio will be scheduled immediately as chunks arrive (no buffering).
|
|
1170
|
+
*/
|
|
1171
|
+
start(): void;
|
|
1172
|
+
/**
|
|
1173
|
+
* Receive audio chunk from network
|
|
1174
|
+
*
|
|
1175
|
+
* Audio-first design: schedules audio immediately, LAM runs in background.
|
|
1176
|
+
* This prevents LAM inference (50-300ms) from blocking audio scheduling.
|
|
1177
|
+
*
|
|
1178
|
+
* @param chunk - Uint8Array containing Int16 PCM audio
|
|
1179
|
+
*/
|
|
1180
|
+
onAudioChunk(chunk: Uint8Array): Promise<void>;
|
|
1181
|
+
/**
|
|
1182
|
+
* Get emotion frame for current animation.
|
|
1183
|
+
*
|
|
1184
|
+
* Priority:
|
|
1185
|
+
* 1. Explicit emotion label from setEmotionLabel()
|
|
1186
|
+
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
1187
|
+
*/
|
|
1188
|
+
private getEmotionFrame;
|
|
1189
|
+
/**
|
|
1190
|
+
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
1191
|
+
*/
|
|
1192
|
+
mergeBlendshapes(lamFrame: Float32Array, emotionFrame: EmotionFrame | null, audioEnergy?: number): {
|
|
1193
|
+
merged: Float32Array;
|
|
1194
|
+
emotionBlendshapes: UpperFaceBlendshapes;
|
|
1195
|
+
};
|
|
1196
|
+
/**
|
|
1197
|
+
* Start frame animation loop
|
|
1198
|
+
*/
|
|
1199
|
+
private startFrameLoop;
|
|
1200
|
+
/**
|
|
1201
|
+
* End of audio stream
|
|
1202
|
+
*/
|
|
1203
|
+
end(): Promise<void>;
|
|
1204
|
+
/**
|
|
1205
|
+
* Stop playback right away, applying a smooth fade-out whose length is the optional `fadeOutMs` (milliseconds)
|
|
1206
|
+
*/
|
|
1207
|
+
stop(fadeOutMs?: number): Promise<void>;
|
|
1208
|
+
/**
|
|
1209
|
+
* Start monitoring for playback completion
|
|
1210
|
+
*/
|
|
1211
|
+
private startMonitoring;
|
|
1212
|
+
/**
|
|
1213
|
+
* Stop monitoring
|
|
1214
|
+
*/
|
|
1215
|
+
private stopMonitoring;
|
|
1216
|
+
/**
|
|
1217
|
+
* Get current pipeline state (for debugging/monitoring)
|
|
1218
|
+
*/
|
|
1219
|
+
getState(): {
|
|
1220
|
+
playbackStarted: boolean;
|
|
1221
|
+
coalescerFill: number;
|
|
1222
|
+
lamFill: number;
|
|
1223
|
+
queuedLAMFrames: number;
|
|
1224
|
+
emotionLabel: "neutral" | "happy" | "angry" | "sad" | null;
|
|
1225
|
+
currentAudioEnergy: number;
|
|
1226
|
+
currentTime: number;
|
|
1227
|
+
playbackEndTime: number;
|
|
1228
|
+
};
|
|
1229
|
+
/**
|
|
1230
|
+
* Check if an explicit emotion label is currently set
|
|
1231
|
+
*/
|
|
1232
|
+
get hasEmotionLabel(): boolean;
|
|
1233
|
+
/**
|
|
1234
|
+
* Cleanup resources
|
|
1235
|
+
*/
|
|
1236
|
+
dispose(): void;
|
|
1237
|
+
}
|
|
1238
|
+
|
|
761
1239
|
/**
|
|
762
1240
|
* Lazy ONNX Runtime loader with conditional WebGPU/WASM bundle loading
|
|
763
1241
|
*
|
|
@@ -1179,6 +1657,8 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
1179
1657
|
private isLoading;
|
|
1180
1658
|
private numIdentityClasses;
|
|
1181
1659
|
private inferenceQueue;
|
|
1660
|
+
private poisoned;
|
|
1661
|
+
private static readonly INFERENCE_TIMEOUT_MS;
|
|
1182
1662
|
constructor(config: Wav2Vec2InferenceConfig);
|
|
1183
1663
|
/**
|
|
1184
1664
|
* Check if WebGPU is available and working
|
|
@@ -1187,6 +1667,8 @@ declare class Wav2Vec2Inference implements LipSyncBackend {
|
|
|
1187
1667
|
static isWebGPUAvailable: typeof isWebGPUAvailable;
|
|
1188
1668
|
get backend(): 'webgpu' | 'wasm' | null;
|
|
1189
1669
|
get isLoaded(): boolean;
|
|
1670
|
+
/** True if inference timed out and the session is permanently unusable */
|
|
1671
|
+
get isSessionPoisoned(): boolean;
|
|
1190
1672
|
/**
|
|
1191
1673
|
* Load the ONNX model
|
|
1192
1674
|
*/
|
|
@@ -4069,4 +4551,157 @@ declare class EmphasisDetector {
|
|
|
4069
4551
|
reset(): void;
|
|
4070
4552
|
}
|
|
4071
4553
|
|
|
4072
|
-
|
|
4554
|
+
/**
|
|
4555
|
+
* ProceduralLifeLayer - Renderer-agnostic procedural animation system
|
|
4556
|
+
*
|
|
4557
|
+
* Outputs per-frame blendshape values and head deltas for organic life-like
|
|
4558
|
+
* animation. No Three.js, no React, no R3F — just math.
|
|
4559
|
+
*
|
|
4560
|
+
* Implements research-based eye behavior, blinks, gaze breaks, microsaccades,
|
|
4561
|
+
* breathing/postural sway, and simplex noise-driven brow drift.
|
|
4562
|
+
*
|
|
4563
|
+
* Research sources:
|
|
4564
|
+
* - Blink frequency: 15-20/min (every 3-4s), PMC4043155
|
|
4565
|
+
* - Saccade latency: ~200ms, duration 20-200ms
|
|
4566
|
+
* - Microsaccades: ~1/second, amplitude 0.02-0.05, Scholarpedia
|
|
4567
|
+
* - Fixation duration: 200-350ms, Nature Scientific Reports
|
|
4568
|
+
* - Brow noise: NVIDIA Audio2Face, Unreal MetaHuman layered procedural animation
|
|
4569
|
+
*
|
|
4570
|
+
* @category Animation
|
|
4571
|
+
*
|
|
4572
|
+
* @example
|
|
4573
|
+
* ```typescript
|
|
4574
|
+
* import { ProceduralLifeLayer } from '@omote/core';
|
|
4575
|
+
*
|
|
4576
|
+
* const lifeLayer = new ProceduralLifeLayer();
|
|
4577
|
+
*
|
|
4578
|
+
* // In animation loop:
|
|
4579
|
+
* const output = lifeLayer.update(delta, {
|
|
4580
|
+
* eyeTargetX: normalizedX, // -1..1 from camera math
|
|
4581
|
+
* eyeTargetY: normalizedY,
|
|
4582
|
+
* audioEnergy: energy, // 0-1 from AudioEnergyAnalyzer
|
|
4583
|
+
* isSpeaking: true,
|
|
4584
|
+
* });
|
|
4585
|
+
*
|
|
4586
|
+
* // Apply blendshapes to mesh
|
|
4587
|
+
* for (const [name, value] of Object.entries(output.blendshapes)) {
|
|
4588
|
+
* const idx = mesh.morphTargetDictionary?.[name];
|
|
4589
|
+
* if (idx !== undefined) mesh.morphTargetInfluences![idx] = value;
|
|
4590
|
+
* }
|
|
4591
|
+
*
|
|
4592
|
+
* // Apply head delta to head bone
|
|
4593
|
+
* headBone.rotation.y += output.headDelta.yaw;
|
|
4594
|
+
* headBone.rotation.x += output.headDelta.pitch;
|
|
4595
|
+
* ```
|
|
4596
|
+
*/
|
|
4597
|
+
/**
|
|
4598
|
+
* Configuration for ProceduralLifeLayer
|
|
4599
|
+
*/
|
|
4600
|
+
interface LifeLayerConfig {
|
|
4601
|
+
/** Seconds between blinks [min, max]. Default: [2.5, 6] */
|
|
4602
|
+
blinkIntervalRange?: [number, number];
|
|
4603
|
+
/** Seconds between gaze breaks [min, max]. Default: [3, 8] */
|
|
4604
|
+
gazeBreakIntervalRange?: [number, number];
|
|
4605
|
+
/** Gaze break deviation range [min, max]. Default: [0.15, 0.4] */
|
|
4606
|
+
gazeBreakAmplitudeRange?: [number, number];
|
|
4607
|
+
/** Eye micro-motion noise amplitude (0 to disable). Default: 0.06 */
|
|
4608
|
+
eyeNoiseAmplitude?: number;
|
|
4609
|
+
/** Base simplex noise amplitude for brow drift. Default: 0.30 */
|
|
4610
|
+
browNoiseAmplitude?: number;
|
|
4611
|
+
/** Multiply brow noise when speaking. Default: 2.0 */
|
|
4612
|
+
browNoiseSpeechMultiplier?: number;
|
|
4613
|
+
/** Breathing rate in Hz (0.25 = 15 breaths/min). Default: 0.25 */
|
|
4614
|
+
breathingRate?: number;
|
|
4615
|
+
/** Postural sway amplitude in radians. Default: 0.002 */
|
|
4616
|
+
posturalSwayAmplitude?: number;
|
|
4617
|
+
/** Max eye movement from center (0-1). Default: 0.8 */
|
|
4618
|
+
eyeMaxDeviation?: number;
|
|
4619
|
+
/** Eye smoothing factor (higher = faster response). Default: 15 */
|
|
4620
|
+
eyeSmoothing?: number;
|
|
4621
|
+
}
|
|
4622
|
+
/**
|
|
4623
|
+
* Per-frame input to the life layer
|
|
4624
|
+
*/
|
|
4625
|
+
interface LifeLayerInput {
|
|
4626
|
+
/** Normalized eye target X: -1 (left) to 1 (right). Consumer computes from camera. */
|
|
4627
|
+
eyeTargetX?: number;
|
|
4628
|
+
/** Normalized eye target Y: -1 (down) to 1 (up). Consumer computes from camera. */
|
|
4629
|
+
eyeTargetY?: number;
|
|
4630
|
+
/** Audio energy 0-1 (from AudioEnergyAnalyzer). Drives brow noise amplitude. */
|
|
4631
|
+
audioEnergy?: number;
|
|
4632
|
+
/** Whether avatar is speaking. Multiplies brow noise amplitude. */
|
|
4633
|
+
isSpeaking?: boolean;
|
|
4634
|
+
}
|
|
4635
|
+
/**
|
|
4636
|
+
* Per-frame output from the life layer
|
|
4637
|
+
*/
|
|
4638
|
+
interface LifeLayerOutput {
|
|
4639
|
+
/** Blendshape values to SET directly on mesh (eyes, brows, cheeks). */
|
|
4640
|
+
blendshapes: Record<string, number>;
|
|
4641
|
+
/** Head rotation deltas in radians. Consumer adds to head bone rotation. */
|
|
4642
|
+
headDelta: {
|
|
4643
|
+
yaw: number;
|
|
4644
|
+
pitch: number;
|
|
4645
|
+
};
|
|
4646
|
+
}
|
|
4647
|
+
/**
|
|
4648
|
+
* ProceduralLifeLayer - Renderer-agnostic procedural animation
|
|
4649
|
+
*
|
|
4650
|
+
* Generates per-frame blendshape values and head rotation deltas
|
|
4651
|
+
* for natural eye behavior, blinks, brow movement, and breathing.
|
|
4652
|
+
*/
|
|
4653
|
+
declare class ProceduralLifeLayer {
|
|
4654
|
+
private blinkIntervalRange;
|
|
4655
|
+
private gazeBreakIntervalRange;
|
|
4656
|
+
private gazeBreakAmplitudeRange;
|
|
4657
|
+
private eyeNoiseAmplitude;
|
|
4658
|
+
private browNoiseAmplitude;
|
|
4659
|
+
private browNoiseSpeechMultiplier;
|
|
4660
|
+
private breathingRate;
|
|
4661
|
+
private posturalSwayAmplitude;
|
|
4662
|
+
private eyeMaxDeviation;
|
|
4663
|
+
private eyeSmoothing;
|
|
4664
|
+
private blinkTimer;
|
|
4665
|
+
private blinkInterval;
|
|
4666
|
+
private blinkPhase;
|
|
4667
|
+
private blinkProgress;
|
|
4668
|
+
private asymmetryRight;
|
|
4669
|
+
private smoothedBlinkLeft;
|
|
4670
|
+
private smoothedBlinkRight;
|
|
4671
|
+
private smoothedEyeX;
|
|
4672
|
+
private smoothedEyeY;
|
|
4673
|
+
private eyeNoiseTime;
|
|
4674
|
+
private gazeBreakTimer;
|
|
4675
|
+
private gazeBreakInterval;
|
|
4676
|
+
private gazeBreakPhase;
|
|
4677
|
+
private gazeBreakProgress;
|
|
4678
|
+
private gazeBreakTargetX;
|
|
4679
|
+
private gazeBreakTargetY;
|
|
4680
|
+
private gazeBreakCurrentX;
|
|
4681
|
+
private gazeBreakCurrentY;
|
|
4682
|
+
private microMotionTime;
|
|
4683
|
+
private breathingPhase;
|
|
4684
|
+
private noiseTime;
|
|
4685
|
+
private previousEnergy;
|
|
4686
|
+
private emphasisLevel;
|
|
4687
|
+
constructor(config?: LifeLayerConfig);
|
|
4688
|
+
/**
|
|
4689
|
+
* Update the life layer and produce output for this frame.
|
|
4690
|
+
*
|
|
4691
|
+
* @param delta - Time since last frame in seconds
|
|
4692
|
+
* @param input - Per-frame input (eye target, audio energy, speaking state)
|
|
4693
|
+
* @returns Blendshape values and head rotation deltas
|
|
4694
|
+
*/
|
|
4695
|
+
update(delta: number, input?: LifeLayerInput): LifeLayerOutput;
|
|
4696
|
+
/**
|
|
4697
|
+
* Reset all internal state to initial values.
|
|
4698
|
+
*/
|
|
4699
|
+
reset(): void;
|
|
4700
|
+
private updateBlinks;
|
|
4701
|
+
private getBlinkValues;
|
|
4702
|
+
private getEyeMicroMotion;
|
|
4703
|
+
private updateGazeBreaks;
|
|
4704
|
+
private updateBrowNoise;
|
|
4705
|
+
}
|
|
4706
|
+
|
|
4707
|
+
export { type AIAdapter, type AIAdapterEvents, type AISessionState, ARKIT_BLENDSHAPES, type ActiveSpan, AgentCoreAdapter, type AgentCoreConfig, type AnimationClip, AnimationEvent, AnimationGraph, type AnimationGraphConfig, type AnimationGraphEvents, type AnimationLayer, type AnimationOutput, type AnimationState, type AnimationStateName, type AnimationTrigger, AudioChunkCoalescer, type AudioChunkCoalescerOptions, AudioEnergyAnalyzer, AudioScheduler, type AudioSchedulerOptions, type AudioSyncConfig, type AudioSyncEvents, AudioSyncManager, type BackendPreference, type BlendWeight, type CTCDecodeResult, CTC_VOCAB, type CacheConfig, type CacheSpanAttributes, ConsoleExporter, type ConversationMessage, ConversationOrchestrator, type ConversationSession, type CreateLipSyncConfig, DEFAULT_ANIMATION_CONFIG, EMOTION_ARKIT_MAP, EMOTION_NAMES, EMOTION_VECTOR_SIZE, type Emotion2VecLabel, type EmotionAnimationMap, type EmotionBlendMode, type EmotionBlendshapeConfig, EmotionController, type EmotionFrame, type EmotionLabel, type EmotionName, type EmotionPresetName, EmotionPresets, EmotionToBlendshapeMapper, type EmotionWeights, EmphasisDetector, EventEmitter, type FetchWithCacheOptions, type FullFaceFrame, FullFacePipeline, type FullFacePipelineEvents, type FullFacePipelineOptions, INFERENCE_LATENCY_BUCKETS, type InferenceSpanAttributes, type InterruptionConfig, type InterruptionEvents, InterruptionHandler, type KaldiFbankOptions, type LAMFrame, LAMPipeline, type LAMPipelineOptions, LAM_BLENDSHAPES, type LifeLayerConfig, type LifeLayerInput, type LifeLayerOutput, type LipSyncBackend, type LipSyncModelInfo, type LipSyncResult, MODEL_LOAD_TIME_BUCKETS, type MessageRole, type MetricData, MetricNames, MicrophoneCapture, type MicrophoneCaptureConfig, ModelCache, type ModelSpanAttributes, OTLPExporter, type OTLPExporterConfig, OmoteEvents, OmoteTelemetry, type OrchestratorConfig, type OrchestratorEvents, ProceduralLifeLayer, type QuotaInfo, RingBuffer, type RuntimeBackend, type 
SafariSpeechConfig, SafariSpeechRecognition, type SamplingConfig, type SenseVoiceConfig, SenseVoiceInference, type SenseVoiceLanguage, type SenseVoiceModelInfo, type SenseVoiceResult, type SessionConfig, type SessionOptions, type SessionSnapshot, type SileroVADBackend, type SileroVADConfig, type SileroVADFactoryConfig, SileroVADInference, SileroVADWorker, type SpanAttributes, type SpanData, type SpeechErrorCallback, type SpeechRecognitionResult, type SpeechResultCallback, type SpeechSegment, SyncedAudioPipeline, type SyncedAudioPipelineEvents, type SyncedAudioPipelineOptions, type TelemetryConfig, type TelemetryExporter, type TelemetryExporterInterface, type TenantConfig, TenantManager, type TenantQuota, type TenantUsage, type TokenRefreshCallback, type Transition, UPPER_FACE_BLENDSHAPES, type UpperFaceBlendshapeName, type UpperFaceBlendshapes, type VADBackend, type VADModelInfo, type VADResult$1 as VADResult, type VADWorkerConfig, type VADWorkerModelInfo, type ValidationResult, type VoiceConfig, WAV2ARKIT_BLENDSHAPES, type Wav2ArkitCpuConfig, Wav2ArkitCpuInference, Wav2Vec2Inference, type Wav2Vec2InferenceConfig, type Wav2Vec2Result, applyCMVN, applyLFR, blendEmotions, calculatePeak, calculateRMS, computeKaldiFbank, configureCacheLimit, configureTelemetry, createEmotionVector, createLipSync, createSessionWithFallback, createSileroVAD, ctcGreedyDecode, fetchWithCache, formatBytes, getCacheConfig, getCacheKey, getEmotionPreset, getLoadedBackend, getModelCache, getOnnxRuntime, getOnnxRuntimeForPreference, getOptimalWasmThreads, getRecommendedBackend, getSessionOptions, getTelemetry, hasWebGPUApi, isAndroid, isIOS, isIOSSafari, isMobile, isOnnxRuntimeLoaded, isSafari, isSpeechRecognitionAvailable, isWebGPUAvailable, lerpEmotion, parseCMVNFromMetadata, parseTokensFile, preloadModels, preloadOnnxRuntime, remapWav2ArkitToLam, resolveBackend, resolveLanguageId, resolveTextNormId, shouldEnableWasmProxy, shouldUseCpuLipSync, shouldUseNativeASR, shouldUseServerLipSync, 
supportsVADWorker, symmetrizeBlendshapes };
|