@omote/core 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +636 -1
- package/dist/index.d.ts +636 -1
- package/dist/index.js +1429 -203
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +1429 -203
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -1
package/dist/index.mjs
CHANGED
|
@@ -874,6 +874,327 @@ var UPPER_FACE_BLENDSHAPES = [
|
|
|
874
874
|
"cheekSquintLeft",
|
|
875
875
|
"cheekSquintRight"
|
|
876
876
|
];
|
|
877
|
+
/**
 * Upper-face blendshape weights per emotion label.
 * Each entry lists only the blendshapes the emotion drives; the inline
 * comments name the facial action unit (AU) each weight stands for.
 */
var EMOTION_ARKIT_MAP = (() => {
  // Expands one weight onto the Left/Right variants of a bilateral blendshape.
  const bilateral = (base, weight) => ({
    [`${base}Left`]: weight,
    [`${base}Right`]: weight
  });
  return {
    happy: {
      // AU6 - Cheek raiser (primary Duchenne smile marker)
      ...bilateral("cheekSquint", 0.5),
      // Slight eye squint from genuine smile (orbicularis oculi activation)
      ...bilateral("eyeSquint", 0.2)
    },
    angry: {
      // AU4 - Brow lowerer (intense, primary anger marker)
      ...bilateral("browDown", 0.7),
      // AU5 - Upper lid raiser (wide eyes, part of the "glare")
      ...bilateral("eyeWide", 0.4),
      // AU7 - Lid tightener (tense stare, combines with AU5 for angry glare)
      ...bilateral("eyeSquint", 0.3)
    },
    sad: {
      // AU1 - Inner brow raiser (primary sadness marker)
      browInnerUp: 0.6,
      // AU4 - Brow lowerer (brows drawn together)
      ...bilateral("browDown", 0.3)
    },
    // No entries: neutral applies no expression overlay
    neutral: {}
  };
})();
|
|
907
|
+
// Defaults merged under any user-supplied config by EmotionToBlendshapeMapper.
var DEFAULT_CONFIG = {
  // Step of the moving average in update(): current += factor * (target - current)
  smoothingFactor: 0.15,
  // Dominant mode ignores frames whose confidence is below this value
  confidenceThreshold: 0.3,
  // Multiplier applied to every mapped blendshape weight
  intensity: 1,
  // "weighted" blends emotions by probability; any other value uses the dominant emotion only
  blendMode: "dominant",
  // Weighted mode skips emotions whose probability is below this value
  minBlendProbability: 0.1,
  // When true, mapFrame() scales targets by the current audio energy
  energyModulation: false,
  // Scale applied at energy 0 / energy 1 when energyModulation is on
  minEnergyScale: 0.3,
  maxEnergyScale: 1
};
|
|
917
|
+
/**
 * Build a fresh record with every name in UPPER_FACE_BLENDSHAPES set to 0.
 *
 * @returns New object keyed by upper-face blendshape name, all values 0
 */
function createZeroBlendshapes() {
  return Object.fromEntries(UPPER_FACE_BLENDSHAPES.map((shapeName) => [shapeName, 0]));
}
|
|
924
|
+
/**
 * Clamp a number to the closed range [0, 1].
 *
 * Inputs that do not resolve to a number (NaN, undefined) yield 0 rather than
 * NaN, so one bad sample cannot propagate NaN through every blendshape that
 * is later derived from the clamped value.
 *
 * @param value - Value to clamp
 * @returns The value limited to [0, 1]; 0 when the input is not a number
 */
function clamp01(value) {
  const clamped = Math.max(0, Math.min(1, value));
  return Number.isNaN(clamped) ? 0 : clamped;
}
|
|
927
|
+
/**
 * Converts emotion frames into upper-face blendshape targets and eases the
 * current values towards those targets on every update() call.
 */
var EmotionToBlendshapeMapper = class {
  /**
   * Create a new EmotionToBlendshapeMapper
   *
   * @param config - Optional configuration; keys that are missing or
   *   explicitly undefined keep their DEFAULT_CONFIG value
   */
  constructor(config) {
    this.currentEnergy = 1;
    this.config = { ...DEFAULT_CONFIG };
    this.setConfig(config);
    this.targetBlendshapes = createZeroBlendshapes();
    this.currentBlendshapes = createZeroBlendshapes();
  }
  /**
   * Map an emotion frame to target blendshapes
   *
   * This sets the target values that the mapper will smoothly interpolate
   * towards. Call update() each frame to apply smoothing.
   *
   * @param frame - Emotion frame; a falsy frame resets the targets to zero
   * @param audioEnergy - Optional audio energy (0-1) for energy modulation
   * @returns Copy of the target upper face blendshapes (before smoothing)
   */
  mapFrame(frame, audioEnergy) {
    this.targetBlendshapes = createZeroBlendshapes();
    if (audioEnergy !== void 0) {
      this.currentEnergy = clamp01(audioEnergy);
    }
    if (!frame) {
      return { ...this.targetBlendshapes };
    }
    if (this.config.blendMode === "weighted") {
      this.mapFrameWeighted(frame);
    } else {
      this.mapFrameDominant(frame);
    }
    if (this.config.energyModulation) {
      this.applyEnergyModulation();
    }
    return { ...this.targetBlendshapes };
  }
  /**
   * Map using dominant emotion only (original behavior)
   */
  mapFrameDominant(frame) {
    // Written as a negated >= so a missing or NaN confidence is rejected
    // instead of slipping past a `<` comparison and scaling weights by NaN.
    if (!(frame.confidence >= this.config.confidenceThreshold)) {
      return;
    }
    const mapping = EMOTION_ARKIT_MAP[frame.emotion];
    if (!mapping) {
      return;
    }
    const scale = this.config.intensity * frame.confidence;
    for (const [name, value] of Object.entries(mapping)) {
      if (value !== void 0) {
        this.targetBlendshapes[name] = clamp01(value * scale);
      }
    }
  }
  /**
   * Map using weighted blend of all emotions by probability
   * Creates more nuanced expressions (e.g., bittersweet = happy + sad)
   *
   * Falls back to mapFrameDominant() when the frame has no probabilities.
   */
  mapFrameWeighted(frame) {
    if (!frame.probabilities) {
      this.mapFrameDominant(frame);
      return;
    }
    for (const [emotion, probability] of Object.entries(frame.probabilities)) {
      // Negated >= also skips non-numeric probabilities.
      if (!(probability >= this.config.minBlendProbability)) {
        continue;
      }
      const mapping = EMOTION_ARKIT_MAP[emotion];
      if (!mapping) {
        continue;
      }
      const scale = this.config.intensity * probability;
      for (const [name, value] of Object.entries(mapping)) {
        if (value !== void 0) {
          // `?? 0` keeps the sum numeric if a mapped name is absent from the target.
          this.targetBlendshapes[name] = (this.targetBlendshapes[name] ?? 0) + value * scale;
        }
      }
    }
    for (const name of UPPER_FACE_BLENDSHAPES) {
      this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name]);
    }
  }
  /**
   * Apply energy modulation to scale emotion intensity by audio energy
   * Louder speech = stronger expressions
   *
   * The scale is interpolated linearly between minEnergyScale (energy 0)
   * and maxEnergyScale (energy 1).
   */
  applyEnergyModulation() {
    const { minEnergyScale, maxEnergyScale } = this.config;
    const energyScale = minEnergyScale + this.currentEnergy * (maxEnergyScale - minEnergyScale);
    for (const name of UPPER_FACE_BLENDSHAPES) {
      this.targetBlendshapes[name] = clamp01(this.targetBlendshapes[name] * energyScale);
    }
  }
  /**
   * Apply smoothing to interpolate current values towards target
   *
   * Uses exponential moving average:
   *   current = current + smoothingFactor * (target - current)
   *
   * @param _deltaMs - Delta time in milliseconds (reserved for future time-based smoothing)
   */
  update(_deltaMs) {
    const factor = this.config.smoothingFactor;
    for (const name of UPPER_FACE_BLENDSHAPES) {
      const target = this.targetBlendshapes[name];
      const current = this.currentBlendshapes[name];
      this.currentBlendshapes[name] = clamp01(current + factor * (target - current));
    }
  }
  /**
   * Get current smoothed blendshape values
   *
   * @returns Copy of the current upper face blendshapes (after smoothing)
   */
  getCurrentBlendshapes() {
    return { ...this.currentBlendshapes };
  }
  /**
   * Reset mapper to neutral state
   *
   * Sets both target and current blendshapes to zero and energy back to 1.
   */
  reset() {
    this.targetBlendshapes = createZeroBlendshapes();
    this.currentBlendshapes = createZeroBlendshapes();
    this.currentEnergy = 1;
  }
  /**
   * Get current configuration
   *
   * @returns Shallow copy of the active configuration
   */
  getConfig() {
    return { ...this.config };
  }
  /**
   * Update configuration
   *
   * Keys whose value is undefined are ignored, so passing e.g.
   * `{ smoothingFactor: undefined }` cannot erase a working value and turn
   * the smoothing arithmetic into NaN.
   *
   * @param config - Partial configuration to update
   */
  setConfig(config) {
    const next = { ...this.config };
    for (const [key, value] of Object.entries(config ?? {})) {
      if (value !== void 0) {
        next[key] = value;
      }
    }
    this.config = next;
  }
};
|
|
1082
|
+
|
|
1083
|
+
// src/animation/audioEnergy.ts
|
|
1084
|
+
/**
 * Root-mean-square level of a block of samples.
 *
 * @param samples Audio samples
 * @returns sqrt(mean(sample^2)); 0 for an empty block
 */
function calculateRMS(samples) {
  const count = samples.length;
  if (count === 0) {
    return 0;
  }
  let energySum = 0;
  for (const sample of samples) {
    energySum += sample * sample;
  }
  return Math.sqrt(energySum / count);
}
|
|
1092
|
+
/**
 * Largest absolute sample value in a block.
 *
 * @param samples Audio samples
 * @returns Maximum of |sample|; 0 for an empty block
 */
function calculatePeak(samples) {
  let loudest = 0;
  for (const sample of samples) {
    const magnitude = Math.abs(sample);
    // A strict comparison (rather than Math.max) leaves NaN samples ignored.
    if (magnitude > loudest) {
      loudest = magnitude;
    }
  }
  return loudest;
}
|
|
1100
|
+
/**
 * Tracks smoothed RMS and peak levels of incoming audio blocks. Rising levels
 * are followed with fixed weights; falling levels decay using smoothingFactor.
 */
var AudioEnergyAnalyzer = class {
  /**
   * @param smoothingFactor Weight kept from the previous value while the level is falling; limited to the range 0..0.99. Default 0.85
   * @param noiseFloor Instant RMS/peak values not above this are treated as 0. Default 0.01
   */
  constructor(smoothingFactor = 0.85, noiseFloor = 0.01) {
    const capped = Math.min(0.99, smoothingFactor);
    this.smoothedRMS = 0;
    this.smoothedPeak = 0;
    this.smoothingFactor = Math.max(0, capped);
    this.noiseFloor = noiseFloor;
  }
  /**
   * Process audio samples and return smoothed energy values
   * @param samples Audio samples (Float32Array)
   * @returns Object with smoothed rms and peak, plus energy = min(1, 2 * (0.7 * rms + 0.3 * peak))
   */
  process(samples) {
    const floor = this.noiseFloor;
    const hold = this.smoothingFactor;
    const rawRMS = calculateRMS(samples);
    const rawPeak = calculatePeak(samples);
    // Gate: anything at or below the noise floor counts as silence.
    const rmsInput = rawRMS > floor ? rawRMS : 0;
    const peakInput = rawPeak > floor ? rawPeak : 0;
    const previousRMS = this.smoothedRMS;
    this.smoothedRMS = rmsInput > previousRMS
      ? previousRMS * 0.5 + rmsInput * 0.5
      : previousRMS * hold + rmsInput * (1 - hold);
    const previousPeak = this.smoothedPeak;
    this.smoothedPeak = peakInput > previousPeak
      ? previousPeak * 0.3 + peakInput * 0.7
      : previousPeak * hold + peakInput * (1 - hold);
    const blended = this.smoothedRMS * 0.7 + this.smoothedPeak * 0.3;
    return {
      rms: this.smoothedRMS,
      peak: this.smoothedPeak,
      // Scale up and clamp
      energy: Math.min(1, blended * 2)
    };
  }
  /**
   * Reset analyzer state (smoothed RMS and peak return to 0)
   */
  reset() {
    this.smoothedPeak = 0;
    this.smoothedRMS = 0;
  }
  /**
   * Get current smoothed RMS value
   */
  get rms() {
    return this.smoothedRMS;
  }
  /**
   * Get current smoothed peak value
   */
  get peak() {
    return this.smoothedPeak;
  }
};
|
|
1159
|
+
/**
 * Flags frames whose energy jumps above the average of the frames that came
 * before them in a short sliding history.
 */
var EmphasisDetector = class {
  /**
   * @param historySize Number of frames to track. Default 10
   * @param emphasisThreshold Minimum energy increase to count as emphasis. Default 0.15
   */
  constructor(historySize = 10, emphasisThreshold = 0.15) {
    this.energyHistory = [];
    this.historySize = historySize;
    this.emphasisThreshold = emphasisThreshold;
  }
  /**
   * Process energy value and detect emphasis
   * @param energy Current energy value (0-1)
   * @returns Object with isEmphasis flag and emphasisStrength (increase / 0.3, capped at 1)
   */
  process(energy) {
    const history = this.energyHistory;
    history.push(energy);
    if (history.length > this.historySize) {
      history.shift();
    }
    // Fewer than three tracked frames: nothing is reported.
    if (history.length < 3) {
      return { isEmphasis: false, emphasisStrength: 0 };
    }
    // Average every tracked frame except the one just pushed.
    const previousCount = history.length - 1;
    let previousTotal = 0;
    for (let i = 0; i < previousCount; i++) {
      previousTotal += history[i];
    }
    const increase = energy - previousTotal / previousCount;
    if (increase > this.emphasisThreshold) {
      return { isEmphasis: true, emphasisStrength: Math.min(1, increase / 0.3) };
    }
    return { isEmphasis: false, emphasisStrength: 0 };
  }
  /**
   * Reset detector state (drops the tracked history)
   */
  reset() {
    this.energyHistory = [];
  }
};
|
|
877
1198
|
|
|
878
1199
|
// src/telemetry/exporters/console.ts
|
|
879
1200
|
var ConsoleExporter = class {
|
|
@@ -2511,7 +2832,7 @@ var CTC_VOCAB = [
|
|
|
2511
2832
|
"Q",
|
|
2512
2833
|
"Z"
|
|
2513
2834
|
];
|
|
2514
|
-
var
|
|
2835
|
+
var _Wav2Vec2Inference = class _Wav2Vec2Inference {
|
|
2515
2836
|
constructor(config) {
|
|
2516
2837
|
this.modelId = "wav2vec2";
|
|
2517
2838
|
this.session = null;
|
|
@@ -2520,6 +2841,10 @@ var Wav2Vec2Inference = class {
|
|
|
2520
2841
|
this.isLoading = false;
|
|
2521
2842
|
// Inference queue for handling concurrent calls
|
|
2522
2843
|
this.inferenceQueue = Promise.resolve();
|
|
2844
|
+
// Session health: set to true if session.run() times out.
|
|
2845
|
+
// A timed-out session may have a zombie GPU/WASM dispatch still running,
|
|
2846
|
+
// so all future infer() calls reject immediately to prevent concurrent access.
|
|
2847
|
+
this.poisoned = false;
|
|
2523
2848
|
this.config = config;
|
|
2524
2849
|
this.numIdentityClasses = config.numIdentityClasses ?? 12;
|
|
2525
2850
|
}
|
|
@@ -2529,6 +2854,10 @@ var Wav2Vec2Inference = class {
|
|
|
2529
2854
|
get isLoaded() {
  // Loaded means a session object is currently held; dispose() sets it back to null.
  return this.session !== null;
}
|
|
2857
|
+
/**
 * True if inference timed out and the session is permanently unusable.
 * Once set, infer() throws instead of running the model.
 */
get isSessionPoisoned() {
  return this.poisoned;
}
|
|
2532
2861
|
/**
|
|
2533
2862
|
* Load the ONNX model
|
|
2534
2863
|
*/
|
|
@@ -2728,6 +3057,9 @@ var Wav2Vec2Inference = class {
|
|
|
2728
3057
|
if (!this.session) {
|
|
2729
3058
|
throw new Error("Model not loaded. Call load() first.");
|
|
2730
3059
|
}
|
|
3060
|
+
if (this.poisoned) {
|
|
3061
|
+
throw new Error("Wav2Vec2 session timed out \u2014 inference unavailable until page reload");
|
|
3062
|
+
}
|
|
2731
3063
|
const audioSamplesCopy = new Float32Array(audioSamples);
|
|
2732
3064
|
let audio;
|
|
2733
3065
|
if (audioSamplesCopy.length === 16e3) {
|
|
@@ -2766,121 +3098,490 @@ var Wav2Vec2Inference = class {
|
|
|
2766
3098
|
if (maxIdx !== prevToken && maxIdx !== 0) {
|
|
2767
3099
|
tokens.push(maxIdx);
|
|
2768
3100
|
}
|
|
2769
|
-
prevToken = maxIdx;
|
|
3101
|
+
prevToken = maxIdx;
|
|
3102
|
+
}
|
|
3103
|
+
return tokens.map((t) => CTC_VOCAB[t] === "|" ? " " : CTC_VOCAB[t]).join("");
|
|
3104
|
+
}
|
|
3105
|
+
/**
|
|
3106
|
+
* Queue inference to serialize ONNX session calls
|
|
3107
|
+
*/
|
|
3108
|
+
queueInference(feeds) {
|
|
3109
|
+
return new Promise((resolve, reject) => {
|
|
3110
|
+
this.inferenceQueue = this.inferenceQueue.then(async () => {
|
|
3111
|
+
const telemetry = getTelemetry();
|
|
3112
|
+
const span = telemetry?.startSpan("Wav2Vec2.infer", {
|
|
3113
|
+
"inference.backend": this._backend,
|
|
3114
|
+
"inference.input_samples": 16e3
|
|
3115
|
+
});
|
|
3116
|
+
try {
|
|
3117
|
+
const startTime = performance.now();
|
|
3118
|
+
const results = await Promise.race([
|
|
3119
|
+
this.session.run(feeds),
|
|
3120
|
+
new Promise(
|
|
3121
|
+
(_, rej) => setTimeout(
|
|
3122
|
+
() => rej(new Error(`Wav2Vec2 inference timed out after ${_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS}ms`)),
|
|
3123
|
+
_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3124
|
+
)
|
|
3125
|
+
)
|
|
3126
|
+
]);
|
|
3127
|
+
const inferenceTimeMs = performance.now() - startTime;
|
|
3128
|
+
const asrOutput = results["asr_logits"];
|
|
3129
|
+
const blendshapeOutput = results["blendshapes"];
|
|
3130
|
+
if (!asrOutput || !blendshapeOutput) {
|
|
3131
|
+
throw new Error("Missing outputs from model");
|
|
3132
|
+
}
|
|
3133
|
+
const asrData = asrOutput.data;
|
|
3134
|
+
const blendshapeData = blendshapeOutput.data;
|
|
3135
|
+
const numASRFrames = asrOutput.dims[1];
|
|
3136
|
+
const numA2EFrames = blendshapeOutput.dims[1];
|
|
3137
|
+
const asrVocabSize = asrOutput.dims[2];
|
|
3138
|
+
const numBlendshapes = blendshapeOutput.dims[2];
|
|
3139
|
+
const asrLogits = [];
|
|
3140
|
+
const blendshapes = [];
|
|
3141
|
+
for (let f = 0; f < numASRFrames; f++) {
|
|
3142
|
+
asrLogits.push(asrData.slice(f * asrVocabSize, (f + 1) * asrVocabSize));
|
|
3143
|
+
}
|
|
3144
|
+
for (let f = 0; f < numA2EFrames; f++) {
|
|
3145
|
+
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
3146
|
+
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
3147
|
+
}
|
|
3148
|
+
const text = this.decodeCTC(asrLogits);
|
|
3149
|
+
logger2.trace("Inference completed", {
|
|
3150
|
+
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
3151
|
+
numA2EFrames,
|
|
3152
|
+
numASRFrames,
|
|
3153
|
+
textLength: text.length
|
|
3154
|
+
});
|
|
3155
|
+
span?.setAttributes({
|
|
3156
|
+
"inference.duration_ms": inferenceTimeMs,
|
|
3157
|
+
"inference.a2e_frames": numA2EFrames,
|
|
3158
|
+
"inference.asr_frames": numASRFrames
|
|
3159
|
+
});
|
|
3160
|
+
span?.end();
|
|
3161
|
+
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
3162
|
+
model: "wav2vec2",
|
|
3163
|
+
backend: this._backend
|
|
3164
|
+
});
|
|
3165
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
3166
|
+
model: "wav2vec2",
|
|
3167
|
+
backend: this._backend,
|
|
3168
|
+
status: "success"
|
|
3169
|
+
});
|
|
3170
|
+
resolve({
|
|
3171
|
+
blendshapes,
|
|
3172
|
+
asrLogits,
|
|
3173
|
+
text,
|
|
3174
|
+
numFrames: numA2EFrames,
|
|
3175
|
+
numA2EFrames,
|
|
3176
|
+
numASRFrames,
|
|
3177
|
+
inferenceTimeMs
|
|
3178
|
+
});
|
|
3179
|
+
} catch (err) {
|
|
3180
|
+
const errMsg = err instanceof Error ? err.message : String(err);
|
|
3181
|
+
if (errMsg.includes("timed out")) {
|
|
3182
|
+
this.poisoned = true;
|
|
3183
|
+
logger2.error("CRITICAL: Inference session timed out \u2014 LAM is dead. Page reload required.", {
|
|
3184
|
+
backend: this._backend,
|
|
3185
|
+
timeoutMs: _Wav2Vec2Inference.INFERENCE_TIMEOUT_MS
|
|
3186
|
+
});
|
|
3187
|
+
} else {
|
|
3188
|
+
logger2.error("Inference failed", { error: errMsg, backend: this._backend });
|
|
3189
|
+
}
|
|
3190
|
+
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
3191
|
+
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
3192
|
+
model: "wav2vec2",
|
|
3193
|
+
backend: this._backend,
|
|
3194
|
+
status: "error"
|
|
3195
|
+
});
|
|
3196
|
+
reject(err);
|
|
3197
|
+
}
|
|
3198
|
+
});
|
|
3199
|
+
});
|
|
3200
|
+
}
|
|
3201
|
+
/**
|
|
3202
|
+
* Get blendshape value by name for a specific frame
|
|
3203
|
+
*/
|
|
3204
|
+
getBlendshape(blendshapes, name) {
|
|
3205
|
+
const index = LAM_BLENDSHAPES.indexOf(name);
|
|
3206
|
+
if (index === -1) {
|
|
3207
|
+
throw new Error(`Unknown blendshape: ${name}`);
|
|
3208
|
+
}
|
|
3209
|
+
return blendshapes[index];
|
|
3210
|
+
}
|
|
3211
|
+
/**
|
|
3212
|
+
* Dispose of the model and free resources
|
|
3213
|
+
*/
|
|
3214
|
+
async dispose() {
|
|
3215
|
+
if (this.session) {
|
|
3216
|
+
await this.session.release();
|
|
3217
|
+
this.session = null;
|
|
3218
|
+
}
|
|
3219
|
+
}
|
|
3220
|
+
};
|
|
3221
|
+
// Maximum time (ms) queueInference() waits for session.run() before the
// session is treated as timed out.
_Wav2Vec2Inference.INFERENCE_TIMEOUT_MS = 5e3;
/**
 * Check if WebGPU is available and working
 * (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
 */
_Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
// Alias under the un-prefixed name; the class refers to itself as
// _Wav2Vec2Inference for its static members.
var Wav2Vec2Inference = _Wav2Vec2Inference;
|
|
3228
|
+
|
|
3229
|
+
// src/audio/FullFacePipeline.ts
|
|
3230
|
+
var logger3 = createLogger("FullFacePipeline");
|
|
3231
|
+
/**
 * Convert 16-bit PCM audio to Float32 samples in [-1, 1).
 *
 * Accepts an ArrayBuffer or any ArrayBuffer view (e.g. a Uint8Array of PCM
 * bytes). A view is read through its underlying bytes, honouring byteOffset
 * and byteLength, rather than being converted element by element. A trailing
 * odd byte is ignored. Samples are read in the platform's byte order, as
 * Int16Array does.
 *
 * @param buffer - ArrayBuffer or typed-array/DataView holding Int16 PCM bytes
 * @returns Float32Array with one entry per complete 16-bit sample
 */
function pcm16ToFloat322(buffer) {
  let int16;
  if (ArrayBuffer.isView(buffer)) {
    const usableBytes = buffer.byteLength & ~1;
    if (buffer.byteOffset % 2 === 0) {
      int16 = new Int16Array(buffer.buffer, buffer.byteOffset, usableBytes / 2);
    } else {
      // Int16Array needs a 2-byte aligned offset; copy the bytes to realign.
      const aligned = new Uint8Array(buffer.buffer, buffer.byteOffset, usableBytes).slice();
      int16 = new Int16Array(aligned.buffer);
    }
  } else {
    const byteLen = buffer.byteLength & ~1;
    int16 = byteLen === buffer.byteLength ? new Int16Array(buffer) : new Int16Array(buffer, 0, byteLen / 2);
  }
  const float32 = new Float32Array(int16.length);
  for (let i = 0; i < int16.length; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}
|
|
3240
|
+
// Blendshape name -> its index within LAM_BLENDSHAPES.
var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map(
  LAM_BLENDSHAPES.map((shapeName, position) => [shapeName, position])
);
// Membership set for the names listed in UPPER_FACE_BLENDSHAPES.
var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
|
|
3245
|
+
/**
 * Lower-case emotion label -> one of the four emotions used by this file
 * ("happy", "sad", "angry", "neutral"). Groups are listed in the order their
 * keys should appear in the resulting object.
 */
var EMOTION_LABEL_MAP = Object.fromEntries(
  [
    // Direct labels
    ["happy", ["happy"]],
    ["sad", ["sad"]],
    ["angry", ["angry"]],
    ["neutral", ["neutral"]],
    // Natural language synonyms
    ["happy", ["excited", "joyful", "cheerful", "delighted", "amused"]],
    ["sad", ["melancholic", "sorrowful", "disappointed"]],
    ["angry", ["frustrated", "irritated", "furious", "annoyed"]],
    // SenseVoice labels
    ["sad", ["fearful"]],
    ["angry", ["disgusted"]],
    ["happy", ["surprised"]]
  ].flatMap(([emotion, labels]) => labels.map((label) => [label, emotion]))
);
|
|
3269
|
+
var _FullFacePipeline = class _FullFacePipeline extends EventEmitter {
|
|
3270
|
+
constructor(options) {
|
|
3271
|
+
super();
|
|
3272
|
+
this.options = options;
|
|
3273
|
+
this.playbackStarted = false;
|
|
3274
|
+
this.monitorInterval = null;
|
|
3275
|
+
this.frameAnimationId = null;
|
|
3276
|
+
// Emotion state
|
|
3277
|
+
this.lastEmotionFrame = null;
|
|
3278
|
+
this.currentAudioEnergy = 0;
|
|
3279
|
+
// Stale frame detection
|
|
3280
|
+
this.lastNewFrameTime = 0;
|
|
3281
|
+
this.lastKnownLamFrame = null;
|
|
3282
|
+
this.staleWarningEmitted = false;
|
|
3283
|
+
const sampleRate = options.sampleRate ?? 16e3;
|
|
3284
|
+
this.emotionBlendFactor = options.emotionBlendFactor ?? 0.8;
|
|
3285
|
+
this.lamBlendFactor = options.lamBlendFactor ?? 0.2;
|
|
3286
|
+
const autoDelay = options.lam.modelId === "wav2arkit_cpu" ? 750 : options.lam.backend === "wasm" ? 350 : 50;
|
|
3287
|
+
const audioDelayMs = options.audioDelayMs ?? autoDelay;
|
|
3288
|
+
this.scheduler = new AudioScheduler({
|
|
3289
|
+
sampleRate,
|
|
3290
|
+
initialLookaheadSec: audioDelayMs / 1e3
|
|
3291
|
+
});
|
|
3292
|
+
this.coalescer = new AudioChunkCoalescer({
|
|
3293
|
+
sampleRate,
|
|
3294
|
+
targetDurationMs: options.chunkTargetMs ?? 200
|
|
3295
|
+
});
|
|
3296
|
+
this.lamPipeline = new LAMPipeline({
|
|
3297
|
+
sampleRate,
|
|
3298
|
+
onError: (error) => {
|
|
3299
|
+
logger3.error("LAM inference error", { message: error.message, stack: error.stack });
|
|
3300
|
+
this.emit("error", error);
|
|
3301
|
+
}
|
|
3302
|
+
});
|
|
3303
|
+
this.emotionMapper = new EmotionToBlendshapeMapper({
|
|
3304
|
+
smoothingFactor: 0.15,
|
|
3305
|
+
confidenceThreshold: 0.3,
|
|
3306
|
+
intensity: 1,
|
|
3307
|
+
energyModulation: true
|
|
3308
|
+
});
|
|
3309
|
+
this.energyAnalyzer = new AudioEnergyAnalyzer();
|
|
3310
|
+
}
|
|
3311
|
+
/**
 * Initialize the pipeline
 *
 * Awaits initialization of the audio scheduler; nothing else is set up here.
 */
async initialize() {
  await this.scheduler.initialize();
}
|
|
3317
|
+
/**
|
|
3318
|
+
* Set emotion label from backend (e.g., LLM response emotion).
|
|
3319
|
+
*
|
|
3320
|
+
* Converts a natural language emotion label into an EmotionFrame
|
|
3321
|
+
* that drives upper face blendshapes for the duration of the utterance.
|
|
3322
|
+
*
|
|
3323
|
+
* Supported labels: happy, excited, joyful, sad, melancholic, angry,
|
|
3324
|
+
* frustrated, neutral, etc.
|
|
3325
|
+
*
|
|
3326
|
+
* @param label - Emotion label string (case-insensitive)
|
|
3327
|
+
*/
|
|
3328
|
+
setEmotionLabel(label) {
|
|
3329
|
+
const normalized = label.toLowerCase();
|
|
3330
|
+
const mapped = EMOTION_LABEL_MAP[normalized] ?? "neutral";
|
|
3331
|
+
const probabilities = {
|
|
3332
|
+
neutral: 0.1,
|
|
3333
|
+
happy: 0.1,
|
|
3334
|
+
angry: 0.1,
|
|
3335
|
+
sad: 0.1
|
|
3336
|
+
};
|
|
3337
|
+
probabilities[mapped] = 0.7;
|
|
3338
|
+
const frame = {
|
|
3339
|
+
emotion: mapped,
|
|
3340
|
+
confidence: 0.7,
|
|
3341
|
+
probabilities
|
|
3342
|
+
};
|
|
3343
|
+
this.lastEmotionFrame = frame;
|
|
3344
|
+
logger3.info("Emotion label set", { label, mapped });
|
|
3345
|
+
}
|
|
3346
|
+
/**
 * Clear any set emotion label.
 *
 * After this, getEmotionFrame() reports a null frame, so no emotion overlay
 * is mapped onto the upper face.
 */
clearEmotionLabel() {
  this.lastEmotionFrame = null;
}
|
|
3353
|
+
/**
 * Start a new playback session
 *
 * Resets all state and prepares for incoming audio chunks.
 * Audio will be scheduled immediately as chunks arrive (no buffering).
 *
 * This also clears any emotion frame stored by setEmotionLabel(), so a label
 * for the new session has to be set after start().
 */
start() {
  // Stop the previous session's completion monitor and frame loop first.
  this.stopMonitoring();
  this.scheduler.reset();
  this.coalescer.reset();
  this.lamPipeline.reset();
  this.playbackStarted = false;
  this.lastEmotionFrame = null;
  this.currentAudioEnergy = 0;
  this.emotionMapper.reset();
  this.energyAnalyzer.reset();
  // Stale-frame detection state
  this.lastNewFrameTime = 0;
  this.lastKnownLamFrame = null;
  this.staleWarningEmitted = false;
  this.scheduler.warmup();
  this.startFrameLoop();
  this.startMonitoring();
}
|
|
3376
|
+
/**
 * Receive audio chunk from network
 *
 * Audio-first design: schedules audio immediately, LAM runs in background.
 * This prevents LAM inference (50-300ms) from blocking audio scheduling.
 *
 * Emits "playback_start" with the schedule time of the first scheduled
 * chunk, and "error" if the background LAM push rejects.
 *
 * @param chunk - Uint8Array containing Int16 PCM audio
 */
async onAudioChunk(chunk) {
  // The coalescer returns nothing until it has a combined chunk to hand over.
  const combined = this.coalescer.add(chunk);
  if (!combined) {
    return;
  }
  const float32 = pcm16ToFloat322(combined);
  // NOTE(review): if stop() or start() runs while this await is pending, the
  // lines below still execute against the reset state — confirm callers
  // cannot interleave these calls.
  const scheduleTime = await this.scheduler.schedule(float32);
  if (!this.playbackStarted) {
    this.playbackStarted = true;
    this.emit("playback_start", scheduleTime);
  }
  const { energy } = this.energyAnalyzer.process(float32);
  this.currentAudioEnergy = energy;
  // Not awaited on purpose: LAM inference must not delay audio scheduling.
  this.lamPipeline.push(float32, scheduleTime, this.options.lam).catch((err) => {
    this.emit("error", err);
  });
}
|
|
3401
|
+
/**
|
|
3402
|
+
* Get emotion frame for current animation.
|
|
3403
|
+
*
|
|
3404
|
+
* Priority:
|
|
3405
|
+
* 1. Explicit emotion label from setEmotionLabel()
|
|
3406
|
+
* 2. Prosody fallback: subtle brow movement from audio energy
|
|
3407
|
+
*/
|
|
3408
|
+
getEmotionFrame() {
|
|
3409
|
+
if (this.lastEmotionFrame) {
|
|
3410
|
+
return { frame: this.lastEmotionFrame, energy: this.currentAudioEnergy };
|
|
3411
|
+
}
|
|
3412
|
+
return { frame: null, energy: this.currentAudioEnergy };
|
|
3413
|
+
}
|
|
3414
|
+
/**
|
|
3415
|
+
* Merge LAM blendshapes with emotion upper face blendshapes
|
|
3416
|
+
*/
|
|
3417
|
+
mergeBlendshapes(lamFrame, emotionFrame, audioEnergy) {
|
|
3418
|
+
const merged = new Float32Array(52);
|
|
3419
|
+
let emotionBlendshapes;
|
|
3420
|
+
if (emotionFrame) {
|
|
3421
|
+
this.emotionMapper.mapFrame(emotionFrame, audioEnergy);
|
|
3422
|
+
this.emotionMapper.update(33);
|
|
3423
|
+
emotionBlendshapes = this.emotionMapper.getCurrentBlendshapes();
|
|
3424
|
+
} else {
|
|
3425
|
+
emotionBlendshapes = {};
|
|
3426
|
+
for (const name of UPPER_FACE_BLENDSHAPES) {
|
|
3427
|
+
emotionBlendshapes[name] = 0;
|
|
3428
|
+
}
|
|
3429
|
+
}
|
|
3430
|
+
for (let i = 0; i < 52; i++) {
|
|
3431
|
+
const name = LAM_BLENDSHAPES[i];
|
|
3432
|
+
if (UPPER_FACE_SET.has(name)) {
|
|
3433
|
+
const emotionValue = emotionBlendshapes[name] ?? 0;
|
|
3434
|
+
const lamValue = lamFrame[i];
|
|
3435
|
+
merged[i] = emotionValue * this.emotionBlendFactor + lamValue * this.lamBlendFactor;
|
|
3436
|
+
} else {
|
|
3437
|
+
merged[i] = lamFrame[i];
|
|
3438
|
+
}
|
|
3439
|
+
}
|
|
3440
|
+
return { merged, emotionBlendshapes };
|
|
3441
|
+
}
|
|
3442
|
+
/**
|
|
3443
|
+
* Start frame animation loop
|
|
3444
|
+
*/
|
|
3445
|
+
startFrameLoop() {
|
|
3446
|
+
const updateFrame = () => {
|
|
3447
|
+
const currentTime = this.scheduler.getCurrentTime();
|
|
3448
|
+
const lamFrame = this.lamPipeline.getFrameForTime(currentTime, this.options.lam);
|
|
3449
|
+
if (lamFrame) {
|
|
3450
|
+
if (lamFrame !== this.lastKnownLamFrame) {
|
|
3451
|
+
this.lastNewFrameTime = performance.now();
|
|
3452
|
+
this.lastKnownLamFrame = lamFrame;
|
|
3453
|
+
this.staleWarningEmitted = false;
|
|
3454
|
+
}
|
|
3455
|
+
const { frame: emotionFrame, energy } = this.getEmotionFrame();
|
|
3456
|
+
const { merged, emotionBlendshapes } = this.mergeBlendshapes(lamFrame, emotionFrame, energy);
|
|
3457
|
+
const fullFrame = {
|
|
3458
|
+
blendshapes: merged,
|
|
3459
|
+
lamBlendshapes: lamFrame,
|
|
3460
|
+
emotionBlendshapes,
|
|
3461
|
+
emotion: emotionFrame,
|
|
3462
|
+
timestamp: currentTime
|
|
3463
|
+
};
|
|
3464
|
+
this.emit("full_frame_ready", fullFrame);
|
|
3465
|
+
this.emit("lam_frame_ready", lamFrame);
|
|
3466
|
+
if (emotionFrame) {
|
|
3467
|
+
this.emit("emotion_frame_ready", emotionFrame);
|
|
3468
|
+
}
|
|
3469
|
+
} else if (this.playbackStarted && !this.lastKnownLamFrame) {
|
|
3470
|
+
const { frame: emotionFrame, energy } = this.getEmotionFrame();
|
|
3471
|
+
if (emotionFrame && energy > 0.05) {
|
|
3472
|
+
const startupFrame = new Float32Array(52);
|
|
3473
|
+
const { merged, emotionBlendshapes } = this.mergeBlendshapes(startupFrame, emotionFrame, energy);
|
|
3474
|
+
this.emit("full_frame_ready", {
|
|
3475
|
+
blendshapes: merged,
|
|
3476
|
+
lamBlendshapes: startupFrame,
|
|
3477
|
+
emotionBlendshapes,
|
|
3478
|
+
emotion: emotionFrame,
|
|
3479
|
+
timestamp: currentTime
|
|
3480
|
+
});
|
|
3481
|
+
}
|
|
3482
|
+
}
|
|
3483
|
+
if (this.playbackStarted && this.lastNewFrameTime > 0 && !this.staleWarningEmitted && performance.now() - this.lastNewFrameTime > _FullFacePipeline.STALE_FRAME_THRESHOLD_MS) {
|
|
3484
|
+
this.staleWarningEmitted = true;
|
|
3485
|
+
logger3.warn("LAM appears stalled \u2014 no new frames for 3+ seconds during playback", {
|
|
3486
|
+
staleDurationMs: Math.round(performance.now() - this.lastNewFrameTime),
|
|
3487
|
+
queuedFrames: this.lamPipeline.queuedFrameCount
|
|
3488
|
+
});
|
|
3489
|
+
}
|
|
3490
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3491
|
+
};
|
|
3492
|
+
this.frameAnimationId = requestAnimationFrame(updateFrame);
|
|
3493
|
+
}
|
|
3494
|
+
/**
 * End of audio stream
 *
 * Flushes whatever the coalescer still holds through onAudioChunk(), then
 * flushes the LAM pipeline.
 */
async end() {
  const remaining = this.coalescer.flush();
  if (remaining) {
    const chunk = new Uint8Array(remaining);
    // NOTE(review): onAudioChunk() feeds this back through coalescer.add();
    // if add() holds chunks shorter than its target duration, this tail
    // would be buffered again instead of played — confirm against
    // AudioChunkCoalescer.
    await this.onAudioChunk(chunk);
  }
  await this.lamPipeline.flush(this.options.lam);
}
|
|
3505
|
+
/**
|
|
3506
|
+
* Stop playback immediately with smooth fade-out
|
|
3507
|
+
*/
|
|
3508
|
+
async stop(fadeOutMs = 50) {
|
|
3509
|
+
this.stopMonitoring();
|
|
3510
|
+
await this.scheduler.cancelAll(fadeOutMs);
|
|
3511
|
+
this.coalescer.reset();
|
|
3512
|
+
this.lamPipeline.reset();
|
|
3513
|
+
this.playbackStarted = false;
|
|
3514
|
+
this.lastEmotionFrame = null;
|
|
3515
|
+
this.currentAudioEnergy = 0;
|
|
3516
|
+
this.emotionMapper.reset();
|
|
3517
|
+
this.energyAnalyzer.reset();
|
|
3518
|
+
this.lastNewFrameTime = 0;
|
|
3519
|
+
this.lastKnownLamFrame = null;
|
|
3520
|
+
this.staleWarningEmitted = false;
|
|
3521
|
+
this.emit("playback_complete", void 0);
|
|
3522
|
+
}
|
|
3523
|
+
/**
|
|
3524
|
+
* Start monitoring for playback completion
|
|
3525
|
+
*/
|
|
3526
|
+
startMonitoring() {
|
|
3527
|
+
if (this.monitorInterval) {
|
|
3528
|
+
clearInterval(this.monitorInterval);
|
|
3529
|
+
}
|
|
3530
|
+
this.monitorInterval = setInterval(() => {
|
|
3531
|
+
if (this.scheduler.isComplete() && this.lamPipeline.queuedFrameCount === 0) {
|
|
3532
|
+
this.emit("playback_complete", void 0);
|
|
3533
|
+
this.stopMonitoring();
|
|
3534
|
+
}
|
|
3535
|
+
}, 100);
|
|
3536
|
+
}
|
|
3537
|
+
/**
|
|
3538
|
+
* Stop monitoring
|
|
3539
|
+
*/
|
|
3540
|
+
stopMonitoring() {
|
|
3541
|
+
if (this.monitorInterval) {
|
|
3542
|
+
clearInterval(this.monitorInterval);
|
|
3543
|
+
this.monitorInterval = null;
|
|
3544
|
+
}
|
|
3545
|
+
if (this.frameAnimationId) {
|
|
3546
|
+
cancelAnimationFrame(this.frameAnimationId);
|
|
3547
|
+
this.frameAnimationId = null;
|
|
2770
3548
|
}
|
|
2771
|
-
return tokens.map((t) => CTC_VOCAB[t] === "|" ? " " : CTC_VOCAB[t]).join("");
|
|
2772
3549
|
}
|
|
2773
3550
|
/**
|
|
2774
|
-
*
|
|
3551
|
+
* Get current pipeline state (for debugging/monitoring)
|
|
2775
3552
|
*/
|
|
2776
|
-
|
|
2777
|
-
return
|
|
2778
|
-
|
|
2779
|
-
|
|
2780
|
-
|
|
2781
|
-
|
|
2782
|
-
|
|
2783
|
-
|
|
2784
|
-
|
|
2785
|
-
|
|
2786
|
-
|
|
2787
|
-
const inferenceTimeMs = performance.now() - startTime;
|
|
2788
|
-
const asrOutput = results["asr_logits"];
|
|
2789
|
-
const blendshapeOutput = results["blendshapes"];
|
|
2790
|
-
if (!asrOutput || !blendshapeOutput) {
|
|
2791
|
-
throw new Error("Missing outputs from model");
|
|
2792
|
-
}
|
|
2793
|
-
const asrData = asrOutput.data;
|
|
2794
|
-
const blendshapeData = blendshapeOutput.data;
|
|
2795
|
-
const numASRFrames = asrOutput.dims[1];
|
|
2796
|
-
const numA2EFrames = blendshapeOutput.dims[1];
|
|
2797
|
-
const asrVocabSize = asrOutput.dims[2];
|
|
2798
|
-
const numBlendshapes = blendshapeOutput.dims[2];
|
|
2799
|
-
const asrLogits = [];
|
|
2800
|
-
const blendshapes = [];
|
|
2801
|
-
for (let f = 0; f < numASRFrames; f++) {
|
|
2802
|
-
asrLogits.push(asrData.slice(f * asrVocabSize, (f + 1) * asrVocabSize));
|
|
2803
|
-
}
|
|
2804
|
-
for (let f = 0; f < numA2EFrames; f++) {
|
|
2805
|
-
const rawFrame = blendshapeData.slice(f * numBlendshapes, (f + 1) * numBlendshapes);
|
|
2806
|
-
blendshapes.push(symmetrizeBlendshapes(rawFrame));
|
|
2807
|
-
}
|
|
2808
|
-
const text = this.decodeCTC(asrLogits);
|
|
2809
|
-
logger2.trace("Inference completed", {
|
|
2810
|
-
inferenceTimeMs: Math.round(inferenceTimeMs * 100) / 100,
|
|
2811
|
-
numA2EFrames,
|
|
2812
|
-
numASRFrames,
|
|
2813
|
-
textLength: text.length
|
|
2814
|
-
});
|
|
2815
|
-
span?.setAttributes({
|
|
2816
|
-
"inference.duration_ms": inferenceTimeMs,
|
|
2817
|
-
"inference.a2e_frames": numA2EFrames,
|
|
2818
|
-
"inference.asr_frames": numASRFrames
|
|
2819
|
-
});
|
|
2820
|
-
span?.end();
|
|
2821
|
-
telemetry?.recordHistogram("omote.inference.latency", inferenceTimeMs, {
|
|
2822
|
-
model: "wav2vec2",
|
|
2823
|
-
backend: this._backend
|
|
2824
|
-
});
|
|
2825
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
2826
|
-
model: "wav2vec2",
|
|
2827
|
-
backend: this._backend,
|
|
2828
|
-
status: "success"
|
|
2829
|
-
});
|
|
2830
|
-
resolve({
|
|
2831
|
-
blendshapes,
|
|
2832
|
-
asrLogits,
|
|
2833
|
-
text,
|
|
2834
|
-
numFrames: numA2EFrames,
|
|
2835
|
-
numA2EFrames,
|
|
2836
|
-
numASRFrames,
|
|
2837
|
-
inferenceTimeMs
|
|
2838
|
-
});
|
|
2839
|
-
} catch (err) {
|
|
2840
|
-
span?.endWithError(err instanceof Error ? err : new Error(String(err)));
|
|
2841
|
-
telemetry?.incrementCounter("omote.inference.total", 1, {
|
|
2842
|
-
model: "wav2vec2",
|
|
2843
|
-
backend: this._backend,
|
|
2844
|
-
status: "error"
|
|
2845
|
-
});
|
|
2846
|
-
reject(err);
|
|
2847
|
-
}
|
|
2848
|
-
});
|
|
2849
|
-
});
|
|
3553
|
+
getState() {
|
|
3554
|
+
return {
|
|
3555
|
+
playbackStarted: this.playbackStarted,
|
|
3556
|
+
coalescerFill: this.coalescer.fillLevel,
|
|
3557
|
+
lamFill: this.lamPipeline.fillLevel,
|
|
3558
|
+
queuedLAMFrames: this.lamPipeline.queuedFrameCount,
|
|
3559
|
+
emotionLabel: this.lastEmotionFrame?.emotion ?? null,
|
|
3560
|
+
currentAudioEnergy: this.currentAudioEnergy,
|
|
3561
|
+
currentTime: this.scheduler.getCurrentTime(),
|
|
3562
|
+
playbackEndTime: this.scheduler.getPlaybackEndTime()
|
|
3563
|
+
};
|
|
2850
3564
|
}
|
|
2851
3565
|
/**
|
|
2852
|
-
*
|
|
3566
|
+
* Check if an explicit emotion label is currently set
|
|
2853
3567
|
*/
|
|
2854
|
-
|
|
2855
|
-
|
|
2856
|
-
if (index === -1) {
|
|
2857
|
-
throw new Error(`Unknown blendshape: ${name}`);
|
|
2858
|
-
}
|
|
2859
|
-
return blendshapes[index];
|
|
3568
|
+
get hasEmotionLabel() {
|
|
3569
|
+
return this.lastEmotionFrame !== null;
|
|
2860
3570
|
}
|
|
2861
3571
|
/**
|
|
2862
|
-
*
|
|
3572
|
+
* Cleanup resources
|
|
2863
3573
|
*/
|
|
2864
|
-
|
|
2865
|
-
|
|
2866
|
-
|
|
2867
|
-
|
|
2868
|
-
|
|
3574
|
+
dispose() {
|
|
3575
|
+
this.stopMonitoring();
|
|
3576
|
+
this.scheduler.dispose();
|
|
3577
|
+
this.coalescer.reset();
|
|
3578
|
+
this.lamPipeline.reset();
|
|
3579
|
+
this.lastEmotionFrame = null;
|
|
3580
|
+
this.currentAudioEnergy = 0;
|
|
2869
3581
|
}
|
|
2870
3582
|
};
|
|
2871
|
-
|
|
2872
|
-
|
|
2873
|
-
* (iOS returns false even if navigator.gpu exists due to ONNX Runtime bugs)
|
|
2874
|
-
*/
|
|
2875
|
-
Wav2Vec2Inference.isWebGPUAvailable = isWebGPUAvailable;
|
|
2876
|
-
|
|
2877
|
-
// src/audio/FullFacePipeline.ts
|
|
2878
|
-
var logger3 = createLogger("FullFacePipeline");
|
|
2879
|
-
var BLENDSHAPE_INDEX_MAP = /* @__PURE__ */ new Map();
|
|
2880
|
-
LAM_BLENDSHAPES.forEach((name, index) => {
|
|
2881
|
-
BLENDSHAPE_INDEX_MAP.set(name, index);
|
|
2882
|
-
});
|
|
2883
|
-
var UPPER_FACE_SET = new Set(UPPER_FACE_BLENDSHAPES);
|
|
3583
|
+
_FullFacePipeline.STALE_FRAME_THRESHOLD_MS = 3e3;
|
|
3584
|
+
var FullFacePipeline = _FullFacePipeline;
|
|
2884
3585
|
|
|
2885
3586
|
// src/inference/kaldiFbank.ts
|
|
2886
3587
|
function fft(re, im) {
|
|
@@ -6905,121 +7606,639 @@ var AnimationGraph = class extends EventEmitter {
|
|
|
6905
7606
|
}
|
|
6906
7607
|
};
|
|
6907
7608
|
|
|
6908
|
-
// src/animation/
|
|
6909
|
-
|
|
6910
|
-
|
|
6911
|
-
|
|
6912
|
-
|
|
6913
|
-
|
|
6914
|
-
|
|
6915
|
-
|
|
7609
|
+
// src/animation/simplex2d.ts
|
|
7610
|
+
// Doubled permutation table: perm[i] === perm[i + 256] === p[i], so a lookup
// such as perm[a + perm[b]] (indices up to 511) never needs an explicit wrap.
var perm = new Uint8Array(512);

// The eight 2D gradient directions selected by (hash % 8) in simplex2d.
var grad2 = [
  [1, 1],
  [-1, 1],
  [1, -1],
  [-1, -1],
  [1, 0],
  [-1, 0],
  [0, 1],
  [0, -1]
];

// Fixed shuffle of the integers 0..255 used as the hash source. The values
// match the permutation commonly published with Perlin/simplex noise code.
var p = [
  151, 160, 137, 91, 90, 15, 131, 13, 201, 95, 96, 53, 194, 233, 7, 225,
  140, 36, 103, 30, 69, 142, 8, 99, 37, 240, 21, 10, 23, 190, 6, 148,
  247, 120, 234, 75, 0, 26, 197, 62, 94, 252, 219, 203, 117, 35, 11, 32,
  57, 177, 33, 88, 237, 149, 56, 87, 174, 20, 125, 136, 171, 168, 68, 175,
  74, 165, 71, 134, 139, 48, 27, 166, 77, 146, 158, 231, 83, 111, 229, 122,
  60, 211, 133, 230, 220, 105, 92, 41, 55, 46, 245, 40, 244, 102, 143, 54,
  65, 25, 63, 161, 1, 216, 80, 73, 209, 76, 132, 187, 208, 89, 18, 169,
  200, 196, 135, 130, 116, 188, 159, 86, 164, 100, 109, 198, 173, 186, 3, 64,
  52, 217, 226, 250, 124, 123, 5, 202, 38, 147, 118, 126, 255, 82, 85, 212,
  207, 206, 59, 227, 47, 16, 58, 17, 182, 189, 28, 42, 223, 183, 170, 213,
  119, 248, 152, 2, 44, 154, 163, 70, 221, 153, 101, 155, 167, 43, 172, 9,
  129, 22, 39, 253, 19, 98, 108, 110, 79, 113, 224, 232, 178, 185, 112, 104,
  218, 246, 97, 228, 251, 34, 242, 193, 238, 210, 144, 12, 191, 179, 162, 241,
  81, 51, 145, 235, 249, 14, 239, 107, 49, 192, 214, 31, 181, 199, 106, 157,
  184, 84, 204, 176, 115, 121, 50, 45, 127, 4, 150, 254, 138, 236, 205, 93,
  222, 114, 67, 29, 24, 72, 243, 141, 128, 195, 78, 66, 215, 61, 156, 180
];

// Write the table into both halves of perm.
perm.set(p);
perm.set(p, 256);

// Skew (F2) and unskew (G2) factors simplex2d uses to move between input
// coordinates and the simplex grid.
var F2 = (Math.sqrt(3) - 1) * 0.5;
var G2 = (3 - Math.sqrt(3)) / 6;
|
|
7885
|
+
/**
 * Dot product of a 2-component gradient with the offset (x, y).
 *
 * @param g - Gradient; components are read from g[0] and g[1]
 * @param x - Offset along the first axis
 * @param y - Offset along the second axis
 * @returns g[0] * x + g[1] * y
 */
function dot2(g, x, y) {
  const gx = g[0];
  const gy = g[1];
  return gx * x + gy * y;
}
|
|
6925
|
-
|
|
6926
|
-
|
|
6927
|
-
|
|
6928
|
-
|
|
6929
|
-
|
|
6930
|
-
|
|
6931
|
-
|
|
6932
|
-
|
|
6933
|
-
|
|
6934
|
-
|
|
6935
|
-
|
|
6936
|
-
|
|
6937
|
-
|
|
6938
|
-
|
|
6939
|
-
|
|
6940
|
-
|
|
6941
|
-
|
|
6942
|
-
|
|
6943
|
-
|
|
6944
|
-
|
|
6945
|
-
|
|
6946
|
-
|
|
6947
|
-
|
|
6948
|
-
|
|
6949
|
-
|
|
6950
|
-
|
|
6951
|
-
|
|
6952
|
-
|
|
6953
|
-
|
|
6954
|
-
|
|
6955
|
-
|
|
6956
|
-
|
|
7888
|
+
/**
 * 2D simplex noise sampled at (x, y).
 *
 * The point is skewed onto the simplex grid, the three corners of the
 * containing triangle are located, and each corner adds a gradient
 * contribution that falls off with squared distance. The same input always
 * yields the same output.
 *
 * @param x - First sample coordinate
 * @param y - Second sample coordinate
 * @returns Noise value; presumably about [-1, 1] given the conventional
 *          factor of 70 - range not verified here
 */
function simplex2d(x, y) {
  // Skew into grid space to find the cell that contains the point.
  const skew = (x + y) * F2;
  const cellI = Math.floor(x + skew);
  const cellJ = Math.floor(y + skew);

  // Unskew the cell origin and take the offset from that first corner.
  const unskew = (cellI + cellJ) * G2;
  const x0 = x - (cellI - unskew);
  const y0 = y - (cellJ - unskew);

  // Lower triangle (x0 > y0) reaches the middle corner via +i, upper via +j.
  const lowerTriangle = x0 > y0;
  const i1 = lowerTriangle ? 1 : 0;
  const j1 = lowerTriangle ? 0 : 1;

  // Offsets from the middle and last corners.
  const x1 = x0 - i1 + G2;
  const y1 = y0 - j1 + G2;
  const x2 = x0 - 1 + 2 * G2;
  const y2 = y0 - 1 + 2 * G2;

  // Hash each corner's grid coordinates to one of the eight gradients.
  const ii = cellI & 255;
  const jj = cellJ & 255;
  const gi0 = perm[ii + perm[jj]] % 8;
  const gi1 = perm[ii + i1 + perm[jj + j1]] % 8;
  const gi2 = perm[ii + 1 + perm[jj + 1]] % 8;

  // One corner's contribution: (0.5 - d^2)^4 * (gradient . offset), or 0 when
  // the falloff term is not >= 0.
  const corner = (gradIndex, dx, dy) => {
    const falloff = 0.5 - dx * dx - dy * dy;
    if (falloff >= 0) {
      const squared = falloff * falloff;
      return squared * squared * dot2(grad2[gradIndex], dx, dy);
    }
    return 0;
  };

  const n0 = corner(gi0, x0, y0);
  const n1 = corner(gi1, x1, y1);
  const n2 = corner(gi2, x2, y2);
  return 70 * (n0 + n1 + n2);
}
|
|
7928
|
+
|
|
7929
|
+
// src/animation/ProceduralLifeLayer.ts
|
|
7930
|
+
// Phases of the four-step state machines in ProceduralLifeLayer. Blinks use
// them literally; gaze breaks reuse the same numbering (1 = move away,
// 2 = hold, 3 = return).
var PHASE_OPEN = 0;
var PHASE_CLOSING = 1;
var PHASE_CLOSED = 2;
var PHASE_OPENING = 3;

// Blink timing. Durations are compared against accumulated frame deltas,
// which update() documents as seconds.
var BLINK_CLOSE_DURATION = 0.06;
var BLINK_HOLD_DURATION = 0.04;
var BLINK_OPEN_DURATION = 0.15;
// How far the right eyelid lags the left one while closing.
var BLINK_ASYMMETRY_DELAY = 0.008;

// Gaze-break timing: move to the target, hold there, return.
var GAZE_BREAK_DURATION = 0.12;
var GAZE_BREAK_HOLD_DURATION = 0.3;
var GAZE_BREAK_RETURN_DURATION = 0.15;

// Eye micro-motion noise. FREQ scales elapsed time into the first noise
// coordinate; PHASE is the fixed second coordinate, different for each axis.
var EYE_NOISE_X_FREQ = 0.8;
var EYE_NOISE_Y_FREQ = 0.6;
var EYE_NOISE_X_PHASE = 73.1;
var EYE_NOISE_Y_PHASE = 91.7;

// Brow drift noise, using the same FREQ / PHASE convention per channel.
var BROW_INNER_UP_FREQ = 0.4;
var BROW_OUTER_LEFT_FREQ = 0.35;
var BROW_OUTER_RIGHT_FREQ = 0.38;
var BROW_DOWN_FREQ = 0.3;
var BROW_INNER_UP_PHASE = 0;
var BROW_OUTER_LEFT_PHASE = 17.3;
var BROW_OUTER_RIGHT_PHASE = 31.7;
var BROW_DOWN_LEFT_PHASE = 47.1;
var BROW_DOWN_RIGHT_PHASE = 59.3;

// Speech emphasis: a frame-to-frame energy rise above the threshold sets the
// emphasis level to 1, which then decays at DECAY_RATE per second.
var EMPHASIS_ENERGY_THRESHOLD = 0.3;
var EMPHASIS_DECAY_RATE = 4;
|
|
7956
|
+
/**
 * Limit v to the range [min, max].
 *
 * The lower bound is checked first, so it wins if the bounds are inverted.
 * NaN fails both comparisons and is returned unchanged.
 *
 * @param v - Value to limit
 * @param min - Lower bound
 * @param max - Upper bound
 * @returns min, max, or v itself
 */
function clamp(v, min, max) {
  if (v < min) {
    return min;
  }
  if (v > max) {
    return max;
  }
  return v;
}
|
|
7959
|
+
/**
 * Random number between min and max, drawn with Math.random().
 *
 * @param min - Value returned when Math.random() yields 0
 * @param max - Upper end of the range (never reached when max > min)
 * @returns min + Math.random() * (max - min)
 */
function randomRange(min, max) {
  const span = max - min;
  return min + Math.random() * span;
}
|
|
7962
|
+
/**
 * Cubic ease 3t^2 - 2t^3 (0 at t = 0, 1 at t = 1).
 *
 * The input is not clamped here; callers are expected to pass t in [0, 1].
 *
 * @param t - Progress value
 * @returns Eased progress
 */
function smoothStep(t) {
  const squared = t * t;
  return squared * (3 - 2 * t);
}
|
|
7965
|
+
/**
 * Smoothly limit v to the open range (-max, max) with a tanh curve.
 *
 * Small values pass through almost unchanged (tanh(x) is close to x near 0);
 * large values approach +/-max without a hard corner.
 *
 * @param v - Value to limit
 * @param max - Saturation magnitude
 * @returns tanh(v / max) * max, or 0 when max is 0
 */
function softClamp(v, max) {
  // A zero limit leaves no room to move. Without this guard softClamp(0, 0)
  // evaluates tanh(0 / 0) * 0 and returns NaN.
  if (max === 0) {
    return 0;
  }
  return Math.tanh(v / max) * max;
}
|
|
7968
|
+
var ProceduralLifeLayer = class {
  /**
   * Create a life layer. Every config field is optional: a field that is
   * missing, null or undefined falls back to the default written below.
   *
   * @param config - Optional tuning overrides
   */
  constructor(config) {
    const overrides = config ?? {};
    // Runtime state; reset() restores these same starting values.
    Object.assign(this, {
      // Blink state machine and the smoothed lid values that are output.
      blinkTimer: 0,
      blinkPhase: PHASE_OPEN,
      blinkProgress: 0,
      asymmetryRight: 0.97,
      smoothedBlinkLeft: 0,
      smoothedBlinkRight: 0,
      // Smoothed eye target.
      smoothedEyeX: 0,
      smoothedEyeY: 0,
      // Clock for the continuous eye micro-motion noise.
      eyeNoiseTime: 0,
      // Gaze-break state machine.
      gazeBreakTimer: 0,
      gazeBreakPhase: PHASE_OPEN,
      gazeBreakProgress: 0,
      gazeBreakTargetX: 0,
      gazeBreakTargetY: 0,
      gazeBreakCurrentX: 0,
      gazeBreakCurrentY: 0,
      // Breathing / postural sway clocks.
      microMotionTime: 0,
      breathingPhase: 0,
      // Brow noise clock and emphasis tracking.
      noiseTime: 0,
      previousEnergy: 0,
      emphasisLevel: 0
    });
    // Tunables.
    this.blinkIntervalRange = overrides.blinkIntervalRange ?? [2.5, 6];
    this.gazeBreakIntervalRange = overrides.gazeBreakIntervalRange ?? [3, 8];
    this.gazeBreakAmplitudeRange = overrides.gazeBreakAmplitudeRange ?? [0.15, 0.4];
    this.eyeNoiseAmplitude = overrides.eyeNoiseAmplitude ?? 0.06;
    this.browNoiseAmplitude = overrides.browNoiseAmplitude ?? 0.3;
    this.browNoiseSpeechMultiplier = overrides.browNoiseSpeechMultiplier ?? 2;
    this.breathingRate = overrides.breathingRate ?? 0.25;
    this.posturalSwayAmplitude = overrides.posturalSwayAmplitude ?? 0.002;
    this.eyeMaxDeviation = overrides.eyeMaxDeviation ?? 0.8;
    this.eyeSmoothing = overrides.eyeSmoothing ?? 15;
    // First blink and first gaze break happen after a random wait.
    this.blinkInterval = randomRange(...this.blinkIntervalRange);
    this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
  }
  /**
   * Advance the layer by one frame and compute its output.
   *
   * @param delta - Time since last frame in seconds
   * @param input - Optional per-frame input: eyeTargetX, eyeTargetY,
   *                audioEnergy, isSpeaking (each defaults to 0 / false)
   * @returns `{ blendshapes, headDelta: { yaw, pitch } }`
   */
  update(delta, input) {
    const frameInput = input ?? {};
    const eyeTargetX = frameInput.eyeTargetX ?? 0;
    const eyeTargetY = frameInput.eyeTargetY ?? 0;
    const audioEnergy = frameInput.audioEnergy ?? 0;
    const isSpeaking = frameInput.isSpeaking ?? false;
    // Only the exponential smoothing uses the capped delta; timers and noise
    // clocks below advance by the raw delta.
    const safeDelta = Math.min(delta, 0.1);
    const blendshapes = {};

    // Blinks: step the state machine, then ease the lids toward its target.
    this.updateBlinks(delta);
    const blinkBlend = Math.min(1, safeDelta * 45);
    const { left: blinkLeft, right: blinkRight } = this.getBlinkValues();
    this.smoothedBlinkLeft += (blinkLeft - this.smoothedBlinkLeft) * blinkBlend;
    this.smoothedBlinkRight += (blinkRight - this.smoothedBlinkRight) * blinkBlend;
    blendshapes["eyeBlinkLeft"] = this.smoothedBlinkLeft;
    blendshapes["eyeBlinkRight"] = this.smoothedBlinkRight;

    // Eyes: smoothed target + gaze-break offset + micro-motion, soft-limited.
    const eyeBlend = Math.min(1, safeDelta * this.eyeSmoothing);
    this.smoothedEyeX += (eyeTargetX - this.smoothedEyeX) * eyeBlend;
    this.smoothedEyeY += (eyeTargetY - this.smoothedEyeY) * eyeBlend;
    this.eyeNoiseTime += delta;
    const jitter = this.getEyeMicroMotion();
    this.updateGazeBreaks(delta);
    const gazeX = softClamp(this.smoothedEyeX + this.gazeBreakCurrentX + jitter.x, this.eyeMaxDeviation);
    const gazeY = softClamp(this.smoothedEyeY + this.gazeBreakCurrentY + jitter.y, this.eyeMaxDeviation);

    // Direction weight from a signed magnitude: full value above the dead
    // zone, quadratic ramp inside it, 0 for non-positive magnitudes.
    const deadZone = 0.02;
    const directionWeight = (magnitude) => {
      if (magnitude > deadZone) {
        return magnitude;
      }
      return magnitude > 0 ? magnitude * (magnitude / deadZone) : 0;
    };
    const lookRight = directionWeight(gazeX);
    const lookLeft = directionWeight(-gazeX);
    const lookUp = directionWeight(gazeY);
    const lookDown = directionWeight(-gazeY);
    // Positive X drives the left eye's "In" shape and the right eye's "Out".
    blendshapes["eyeLookInLeft"] = lookRight;
    blendshapes["eyeLookOutLeft"] = lookLeft;
    blendshapes["eyeLookInRight"] = lookLeft;
    blendshapes["eyeLookOutRight"] = lookRight;
    blendshapes["eyeLookUpLeft"] = lookUp;
    blendshapes["eyeLookUpRight"] = lookUp;
    blendshapes["eyeLookDownLeft"] = lookDown;
    blendshapes["eyeLookDownRight"] = lookDown;

    // Brows: noise drift plus speech emphasis, written into blendshapes.
    this.updateBrowNoise(delta, audioEnergy, isSpeaking, blendshapes);

    // Head: sine breathing on pitch plus two-sine sway on each axis.
    this.microMotionTime += delta;
    this.breathingPhase += delta * this.breathingRate * Math.PI * 2;
    const swayClock = this.microMotionTime;
    const swayAmp = this.posturalSwayAmplitude;
    const breathingPitch = Math.sin(this.breathingPhase) * 0.003;
    const swayYaw = Math.sin(swayClock * 0.7) * swayAmp + Math.sin(swayClock * 1.3) * swayAmp * 0.5;
    const swayPitch = Math.sin(swayClock * 0.5) * swayAmp * 0.75 + Math.sin(swayClock * 0.9) * swayAmp * 0.5;
    return {
      blendshapes,
      headDelta: {
        yaw: swayYaw,
        pitch: breathingPitch + swayPitch
      }
    };
  }
  /**
   * Put every piece of runtime state back to its starting value and draw new
   * random blink / gaze-break intervals. Configuration is left untouched.
   */
  reset() {
    Object.assign(this, {
      blinkTimer: 0,
      blinkInterval: randomRange(...this.blinkIntervalRange),
      blinkPhase: PHASE_OPEN,
      blinkProgress: 0,
      asymmetryRight: 0.97,
      smoothedBlinkLeft: 0,
      smoothedBlinkRight: 0,
      smoothedEyeX: 0,
      smoothedEyeY: 0,
      eyeNoiseTime: 0,
      gazeBreakTimer: 0,
      gazeBreakInterval: randomRange(...this.gazeBreakIntervalRange),
      gazeBreakPhase: PHASE_OPEN,
      gazeBreakProgress: 0,
      gazeBreakTargetX: 0,
      gazeBreakTargetY: 0,
      gazeBreakCurrentX: 0,
      gazeBreakCurrentY: 0,
      microMotionTime: 0,
      breathingPhase: 0,
      noiseTime: 0,
      previousEnergy: 0,
      emphasisLevel: 0
    });
  }
  // ---------------------------------------------------------------------
  // Internal: blink state machine
  // ---------------------------------------------------------------------
  updateBlinks(delta) {
    this.blinkTimer += delta;
    // Start a blink once the wait is over and the eye is fully open.
    if (this.blinkPhase === PHASE_OPEN && this.blinkTimer >= this.blinkInterval) {
      this.blinkPhase = PHASE_CLOSING;
      this.blinkProgress = 0;
      this.blinkTimer = 0;
      this.blinkInterval = randomRange(...this.blinkIntervalRange);
      // Per-blink scale for the right eye, drawn from [0.95, 1.03).
      this.asymmetryRight = 0.95 + Math.random() * 0.08;
    }
    if (this.blinkPhase > PHASE_OPEN) {
      this.blinkProgress += delta;
      // Each active phase lasts a fixed time and then hands over to the next.
      let phaseLength;
      let followingPhase;
      switch (this.blinkPhase) {
        case PHASE_CLOSING:
          phaseLength = BLINK_CLOSE_DURATION;
          followingPhase = PHASE_CLOSED;
          break;
        case PHASE_CLOSED:
          phaseLength = BLINK_HOLD_DURATION;
          followingPhase = PHASE_OPENING;
          break;
        case PHASE_OPENING:
          phaseLength = BLINK_OPEN_DURATION;
          followingPhase = PHASE_OPEN;
          break;
        default:
          return;
      }
      if (this.blinkProgress >= phaseLength) {
        this.blinkPhase = followingPhase;
        this.blinkProgress = 0;
      }
    }
  }
  getBlinkValues() {
    switch (this.blinkPhase) {
      case PHASE_OPEN:
        return { left: 0, right: 0 };
      case PHASE_CLOSING: {
        // Cubic ease-in; the right lid starts BLINK_ASYMMETRY_DELAY later.
        const leftT = Math.min(1, this.blinkProgress / BLINK_CLOSE_DURATION);
        const rightT = Math.max(0, Math.min(1, (this.blinkProgress - BLINK_ASYMMETRY_DELAY) / BLINK_CLOSE_DURATION));
        return {
          left: leftT * leftT * leftT,
          right: rightT * rightT * rightT * this.asymmetryRight
        };
      }
      case PHASE_CLOSED:
        return { left: 1, right: this.asymmetryRight };
      default: {
        // Opening: smoothstep from closed back to open.
        const openT = Math.min(1, this.blinkProgress / BLINK_OPEN_DURATION);
        const remaining = 1 - smoothStep(openT);
        return {
          left: remaining,
          right: remaining * this.asymmetryRight
        };
      }
    }
  }
  // ---------------------------------------------------------------------
  // Internal: eye micro-motion (continuous simplex noise)
  // ---------------------------------------------------------------------
  getEyeMicroMotion() {
    const clock = this.eyeNoiseTime;
    const amplitude = this.eyeNoiseAmplitude;
    // The vertical axis gets 70% of the horizontal amplitude.
    return {
      x: simplex2d(clock * EYE_NOISE_X_FREQ, EYE_NOISE_X_PHASE) * amplitude,
      y: simplex2d(clock * EYE_NOISE_Y_FREQ, EYE_NOISE_Y_PHASE) * amplitude * 0.7
    };
  }
  // ---------------------------------------------------------------------
  // Internal: gaze breaks
  // The PHASE_* constants are reused here: CLOSING = moving to the target,
  // CLOSED = holding on it, OPENING = returning, OPEN = idle.
  // ---------------------------------------------------------------------
  updateGazeBreaks(delta) {
    this.gazeBreakTimer += delta;
    if (this.gazeBreakPhase === PHASE_OPEN && this.gazeBreakTimer >= this.gazeBreakInterval) {
      this.gazeBreakPhase = PHASE_CLOSING;
      this.gazeBreakProgress = 0;
      this.gazeBreakTimer = 0;
      // Pick a new target; the vertical excursion is much smaller than the
      // horizontal one.
      const amplitude = randomRange(...this.gazeBreakAmplitudeRange);
      this.gazeBreakTargetX = (Math.random() - 0.5) * 2 * amplitude;
      this.gazeBreakTargetY = (Math.random() - 0.5) * amplitude * 0.4;
      this.gazeBreakInterval = randomRange(...this.gazeBreakIntervalRange);
    }
    if (this.gazeBreakPhase > PHASE_OPEN) {
      this.gazeBreakProgress += delta;
      switch (this.gazeBreakPhase) {
        case PHASE_CLOSING: {
          const outT = Math.min(1, this.gazeBreakProgress / GAZE_BREAK_DURATION);
          const outEase = smoothStep(outT);
          this.gazeBreakCurrentX = this.gazeBreakTargetX * outEase;
          this.gazeBreakCurrentY = this.gazeBreakTargetY * outEase;
          if (this.gazeBreakProgress >= GAZE_BREAK_DURATION) {
            this.gazeBreakPhase = PHASE_CLOSED;
            this.gazeBreakProgress = 0;
          }
          break;
        }
        case PHASE_CLOSED:
          this.gazeBreakCurrentX = this.gazeBreakTargetX;
          this.gazeBreakCurrentY = this.gazeBreakTargetY;
          if (this.gazeBreakProgress >= GAZE_BREAK_HOLD_DURATION) {
            this.gazeBreakPhase = PHASE_OPENING;
            this.gazeBreakProgress = 0;
          }
          break;
        case PHASE_OPENING: {
          const backT = Math.min(1, this.gazeBreakProgress / GAZE_BREAK_RETURN_DURATION);
          const backEase = smoothStep(backT);
          this.gazeBreakCurrentX = this.gazeBreakTargetX * (1 - backEase);
          this.gazeBreakCurrentY = this.gazeBreakTargetY * (1 - backEase);
          if (this.gazeBreakProgress >= GAZE_BREAK_RETURN_DURATION) {
            this.gazeBreakPhase = PHASE_OPEN;
            this.gazeBreakProgress = 0;
            this.gazeBreakCurrentX = 0;
            this.gazeBreakCurrentY = 0;
          }
          break;
        }
        default:
          break;
      }
    } else {
      // Idle: no offset.
      this.gazeBreakCurrentX = 0;
      this.gazeBreakCurrentY = 0;
    }
  }
  // ---------------------------------------------------------------------
  // Internal: brow noise (simplex-driven drift plus speech emphasis)
  // ---------------------------------------------------------------------
  updateBrowNoise(delta, audioEnergy, isSpeaking, blendshapes) {
    this.noiseTime += delta;
    // A sharp frame-to-frame rise in energy triggers emphasis, which then
    // decays linearly back to 0.
    if (audioEnergy - this.previousEnergy > EMPHASIS_ENERGY_THRESHOLD) {
      this.emphasisLevel = 1;
    }
    this.emphasisLevel = Math.max(0, this.emphasisLevel - delta * EMPHASIS_DECAY_RATE);
    this.previousEnergy = audioEnergy;

    // The drift amplitude is multiplied while speaking with non-zero energy.
    const speechActive = isSpeaking && audioEnergy > 0;
    const amplitude = this.browNoiseAmplitude * (speechActive ? this.browNoiseSpeechMultiplier : 1);
    // Noise remapped from [-1, 1] to [0, 1], then scaled by amplitude and gain.
    const drift = (freq, phase, gain) => (simplex2d(this.noiseTime * freq, phase) * 0.5 + 0.5) * amplitude * gain;

    // The inner brow additionally lifts with emphasis.
    blendshapes["browInnerUp"] = clamp(drift(BROW_INNER_UP_FREQ, BROW_INNER_UP_PHASE, 0.83) + this.emphasisLevel * 0.25, 0, 1);
    const channels = [
      ["browOuterUpLeft", BROW_OUTER_LEFT_FREQ, BROW_OUTER_LEFT_PHASE, 0.5],
      ["browOuterUpRight", BROW_OUTER_RIGHT_FREQ, BROW_OUTER_RIGHT_PHASE, 0.5],
      ["browDownLeft", BROW_DOWN_FREQ, BROW_DOWN_LEFT_PHASE, 0.33],
      ["browDownRight", BROW_DOWN_FREQ, BROW_DOWN_RIGHT_PHASE, 0.33]
    ];
    for (const [shape, freq, phase, gain] of channels) {
      blendshapes[shape] = clamp(drift(freq, phase, gain), 0, 1);
    }
  }
};
|
|
8236
|
+
|
|
8237
|
+
// ../types/dist/index.mjs
|
|
8238
|
+
var PROTOCOL_VERSION = 1;
|
|
8239
|
+
/**
 * Structural check for a protocol event: a non-null object that has the
 * `v`, `type` and `ts` properties (own or inherited). Only the presence of
 * the keys is checked, not their values.
 *
 * @param obj - Any value
 * @returns true when all three keys are present on an object
 */
function isProtocolEvent(obj) {
  if (obj === null || typeof obj !== "object") {
    return false;
  }
  return ["v", "type", "ts"].every((key) => key in obj);
}
|
|
7023
8242
|
export {
|
|
7024
8243
|
ARKIT_BLENDSHAPES,
|
|
7025
8244
|
AgentCoreAdapter,
|
|
@@ -7033,12 +8252,15 @@ export {
|
|
|
7033
8252
|
ConversationOrchestrator,
|
|
7034
8253
|
DEFAULT_ANIMATION_CONFIG,
|
|
7035
8254
|
DEFAULT_LOGGING_CONFIG,
|
|
8255
|
+
EMOTION_ARKIT_MAP,
|
|
7036
8256
|
EMOTION_NAMES,
|
|
7037
8257
|
EMOTION_VECTOR_SIZE,
|
|
7038
8258
|
EmotionController,
|
|
7039
8259
|
EmotionPresets,
|
|
8260
|
+
EmotionToBlendshapeMapper,
|
|
7040
8261
|
EmphasisDetector,
|
|
7041
8262
|
EventEmitter,
|
|
8263
|
+
FullFacePipeline,
|
|
7042
8264
|
INFERENCE_LATENCY_BUCKETS,
|
|
7043
8265
|
InterruptionHandler,
|
|
7044
8266
|
LAMPipeline,
|
|
@@ -7050,6 +8272,8 @@ export {
|
|
|
7050
8272
|
ModelCache,
|
|
7051
8273
|
OTLPExporter,
|
|
7052
8274
|
OmoteTelemetry,
|
|
8275
|
+
PROTOCOL_VERSION,
|
|
8276
|
+
ProceduralLifeLayer,
|
|
7053
8277
|
RingBuffer,
|
|
7054
8278
|
SafariSpeechRecognition,
|
|
7055
8279
|
SenseVoiceInference,
|
|
@@ -7057,6 +8281,7 @@ export {
|
|
|
7057
8281
|
SileroVADWorker,
|
|
7058
8282
|
SyncedAudioPipeline,
|
|
7059
8283
|
TenantManager,
|
|
8284
|
+
UPPER_FACE_BLENDSHAPES,
|
|
7060
8285
|
WAV2ARKIT_BLENDSHAPES,
|
|
7061
8286
|
Wav2ArkitCpuInference,
|
|
7062
8287
|
Wav2Vec2Inference,
|
|
@@ -7095,6 +8320,7 @@ export {
|
|
|
7095
8320
|
isIOSSafari,
|
|
7096
8321
|
isMobile,
|
|
7097
8322
|
isOnnxRuntimeLoaded,
|
|
8323
|
+
isProtocolEvent,
|
|
7098
8324
|
isSafari,
|
|
7099
8325
|
isSpeechRecognitionAvailable,
|
|
7100
8326
|
isWebGPUAvailable,
|