@viji-dev/core 0.3.20 → 0.3.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,24 +5,27 @@
5
5
  * Uses importScripts() to load MediaPipe Tasks Vision UMD bundle.
6
6
  */
7
7
 
8
// Debug logging is off until CVSystem switches it on with a 'debug' message.
let DEBUG = false;

/**
 * Print a worker-prefixed message to the console while DEBUG is enabled.
 *
 * @param {...*} args - Values forwarded to console.log after the prefix.
 */
function log(...args) {
  if (!DEBUG) {
    return;
  }
  console.log('🔧 [CV Tasks Worker]', ...args);
}
15
+
8
16
  // Define CommonJS environment for MediaPipe bundle
9
17
  self.exports = {};
10
18
  self.module = { exports: {} };
11
19
 
12
20
  // Import MediaPipe Tasks Vision UMD bundle
13
- console.log('🔧 [CV Tasks Worker] Starting to load vision_bundle.js...');
21
+ log('Starting to load vision_bundle.js...');
14
22
  try {
15
23
  importScripts('/dist/assets/vision_bundle.js');
16
- console.log('✅ [CV Tasks Worker] vision_bundle.js loaded successfully');
24
+ log('vision_bundle.js loaded successfully');
17
25
  } catch (error) {
18
26
  console.error('❌ [CV Tasks Worker] Failed to load vision_bundle.js:', error);
19
27
  }
20
28
 
21
- // Debug: Check what's available after import (disabled for production)
22
- // console.log('🔧 [CV Tasks Worker] Available globals after import:', Object.keys(self));
23
- // console.log('🔧 [CV Tasks Worker] module.exports:', self.module.exports);
24
- // console.log('🔧 [CV Tasks Worker] exports:', self.exports);
25
-
26
29
  // MediaPipe model instances
27
30
  let faceDetector = null;
28
31
  let faceLandmarker = null;
@@ -45,14 +48,236 @@ let processingConfig = false;
45
48
  let workerHealthy = true;
46
49
  let memoryPressureDetected = false;
47
50
 
48
- // Note: No longer need reusable canvas - passing ImageBitmap directly to MediaPipe!
51
// Safe zero-defaults for face data when features are inactive.
// Every expression score starts at 0; the record is frozen because the same
// object is handed out as the default for every face.
const EMPTY_EXPRESSIONS = Object.freeze(
  Object.fromEntries(
    ['neutral', 'happy', 'sad', 'angry', 'surprised', 'disgusted', 'fearful']
      .map((expression) => [expression, 0])
  )
);
49
55
 
50
- // Debug logging
51
- const DEBUG = true; // Temporarily enabled to debug segmentation
52
- function log(...args) {
53
- if (DEBUG) {
54
- console.log('🔧 [CV Tasks Worker]', ...args);
56
// Frozen zero head pose used as the default when no pose has been computed.
const EMPTY_HEAD_POSE = Object.freeze({
  pitch: 0,
  yaw: 0,
  roll: 0
});

// Frozen record holding a zero score for every blendshape name listed below.
// The order of the list is the key order of the resulting object.
const EMPTY_BLENDSHAPES = Object.freeze(
  Object.fromEntries(
    [
      'browDownLeft', 'browDownRight', 'browInnerUp', 'browOuterUpLeft', 'browOuterUpRight',
      'cheekPuff', 'cheekSquintLeft', 'cheekSquintRight',
      'eyeBlinkLeft', 'eyeBlinkRight',
      'eyeLookDownLeft', 'eyeLookDownRight', 'eyeLookInLeft', 'eyeLookInRight',
      'eyeLookOutLeft', 'eyeLookOutRight', 'eyeLookUpLeft', 'eyeLookUpRight',
      'eyeSquintLeft', 'eyeSquintRight', 'eyeWideLeft', 'eyeWideRight',
      'jawForward', 'jawLeft', 'jawOpen', 'jawRight',
      'mouthClose', 'mouthDimpleLeft', 'mouthDimpleRight',
      'mouthFrownLeft', 'mouthFrownRight', 'mouthFunnel', 'mouthLeft',
      'mouthLowerDownLeft', 'mouthLowerDownRight', 'mouthPressLeft', 'mouthPressRight',
      'mouthPucker', 'mouthRight', 'mouthRollLower', 'mouthRollUpper',
      'mouthShrugLower', 'mouthShrugUpper', 'mouthSmileLeft', 'mouthSmileRight',
      'mouthStretchLeft', 'mouthStretchRight', 'mouthUpperUpLeft', 'mouthUpperUpRight',
      'noseSneerLeft', 'noseSneerRight', 'tongueOut'
    ].map((blendshape) => [blendshape, 0])
  )
);
74
+
75
/**
 * Convert a MediaPipe faceBlendshapes categories array to a flat record.
 *
 * Each entry contributes one property: its `categoryName` becomes the key and
 * its `score` the value. A later entry with the same name overwrites an
 * earlier one.
 *
 * @param {Array<{categoryName: string, score: number}>|null|undefined} categories
 *   Category entries; a missing list is treated as empty instead of throwing.
 * @returns {Object<string, number>} Map of category name to score.
 */
function buildBlendshapesRecord(categories) {
  const record = {};
  for (const { categoryName, score } of categories ?? []) {
    record[categoryName] = score;
  }
  return record;
}
85
+
86
/**
 * EMFACS-based emotion prototype vectors (Ekman FACS → ARKit blendshapes).
 * Weights reflect each blendshape's reliability in MediaPipe's 2D web model.
 * Known near-zero blendshapes (cheekSquint*, noseSneer*, eyeWide*) are
 * down-weighted and supplemented by correlated signals that do activate.
 *
 * Reference: Aldenhoven et al. (2026) "Real-Time Emotion Recognition
 * Performance of Mobile Devices" — EMFACS cosine similarity approach,
 * 68.3% accuracy on 7 emotions, exceeding human raters (58.9%).
 *
 * The table and every prototype in it are frozen: prototype magnitudes are
 * pre-computed from these weights, so editing a weight at runtime would leave
 * the cached magnitudes stale.
 */
const EMOTION_PROTOTYPES = Object.freeze({
  // mouthSmile is unique to happiness — no other emotion uses it.
  // eyeSquint is a secondary "Duchenne smile" indicator.
  happy: Object.freeze({
    mouthSmileLeft: 1.0, mouthSmileRight: 1.0,
    eyeSquintLeft: 0.3, eyeSquintRight: 0.3
  }),
  // Pouty/trembling lip: mouthShrugLower (chin raiser) is the primary signal,
  // mouthPucker (compressed lips) secondary. Compact prototype so it wins
  // over angry when the differentiating upper-face signals are absent.
  sad: Object.freeze({
    mouthShrugLower: 1.0,
    mouthPucker: 0.8
  }),
  // Shares sad's mouth signals at lower weight, differentiated by upper-face
  // tension: eyeSquint + browDown. These extra dimensions shift the cosine
  // direction away from sad only when genuinely activated.
  angry: Object.freeze({
    mouthShrugLower: 0.6, mouthPucker: 0.5,
    eyeSquintLeft: 1.0, eyeSquintRight: 1.0,
    browDownLeft: 1.0, browDownRight: 1.0
  }),
  // Brow raise only — the simplest, most distinctive prototype.
  // jawOpen removed to avoid overlap with fearful.
  surprised: Object.freeze({
    browInnerUp: 1.0,
    browOuterUpLeft: 1.0, browOuterUpRight: 1.0
  }),
  // mouthUpperUp (upper lip raise) is the unique primary signal.
  // mouthFrown supports, browDown at low weight for wrinkled-brow disgust.
  disgusted: Object.freeze({
    mouthUpperUpLeft: 1.0, mouthUpperUpRight: 1.0,
    mouthFrownLeft: 0.8, mouthFrownRight: 0.8,
    browDownLeft: 0.3, browDownRight: 0.3
  }),
  // Shares surprised's brow raise, differentiated by jawOpen (rare in other
  // emotions at even 10-20%). jawOpen is the primary differentiator.
  fearful: Object.freeze({
    browInnerUp: 0.8, browOuterUpLeft: 0.8, browOuterUpRight: 0.8,
    jawOpen: 1.0
  })
});
138
+
139
// Every blendshape name known to EMPTY_BLENDSHAPES; these are the dimensions
// over which prototype and observed vectors are compared.
const PROTOTYPE_KEYS = Object.keys(EMPTY_BLENDSHAPES);

// Euclidean length of each emotion prototype over PROTOTYPE_KEYS, computed
// once so the cosine similarity does not have to recompute it per call.
// Weights a prototype does not list count as 0.
const PROTOTYPE_MAGNITUDES = {};
Object.keys(EMOTION_PROTOTYPES).forEach((emotion) => {
  const weights = EMOTION_PROTOTYPES[emotion];
  const squaredLength = PROTOTYPE_KEYS.reduce((total, name) => {
    const weight = weights[name] || 0;
    return total + weight * weight;
  }, 0);
  PROTOTYPE_MAGNITUDES[emotion] = Math.sqrt(squaredLength);
});
151
+
152
// Observed blendshape values at or below this floor count as zero, so that
// resting-state activations do not match emotion prototypes.
const BLENDSHAPE_NOISE_FLOOR = 0.10;

/**
 * Cosine similarity between an observed blendshape record and a prototype:
 * cos(v, p) = (v · p) / (|v| * |p|), taken over PROTOTYPE_KEYS.
 *
 * Observed values are gated by BLENDSHAPE_NOISE_FLOOR before they enter
 * either the dot product or the observed magnitude. Missing entries on either
 * side count as 0.
 *
 * @param {Object<string, number>} observed - Blendshape name → score.
 * @param {Object<string, number>} prototype - Blendshape name → weight.
 * @param {number} protoMag - Pre-computed magnitude of `prototype`.
 * @returns {number} The similarity, or 0 when either magnitude is below 1e-8.
 */
function emotionCosineSimilarity(observed, prototype, protoMag) {
  let dotProduct = 0;
  let observedSquared = 0;

  for (let i = 0; i < PROTOTYPE_KEYS.length; i += 1) {
    const name = PROTOTYPE_KEYS[i];
    const sample = observed[name] || 0;
    const gated = sample > BLENDSHAPE_NOISE_FLOOR ? sample : 0;
    dotProduct += gated * (prototype[name] || 0);
    observedSquared += gated * gated;
  }

  const observedMag = Math.sqrt(observedSquared);
  const isDegenerate = observedMag < 1e-8 || protoMag < 1e-8;
  return isDegenerate ? 0 : dotProduct / (observedMag * protoMag);
}
175
+
176
// Cross-suppression: when one emotion is confident, competing emotions are
// reduced. Uses raw (pre-suppression) scores so order doesn't matter.
// [suppressor] → { [target]: strength 0-1 }
// Frozen at both levels so this shared lookup table cannot be altered at
// runtime.
const EMOTION_INHIBITIONS = Object.freeze({
  happy: Object.freeze({ angry: 0.7, sad: 0.5, disgusted: 0.4, fearful: 0.3 }),
  sad: Object.freeze({ happy: 0.3, angry: 0.2 }),
  angry: Object.freeze({ happy: 0.3, sad: 0.2 }),
  surprised: Object.freeze({ angry: 0.3, sad: 0.3 }),
  disgusted: Object.freeze({ happy: 0.4, surprised: 0.2 }),
  fearful: Object.freeze({ happy: 0.3, angry: 0.2 })
});
187
+
188
/**
 * Score the six prototype emotions plus neutral for one blendshape record.
 *
 * Steps, in order:
 *   1. Base score per emotion: cosine similarity against its entry in
 *      EMOTION_PROTOTYPES, floored at 0.
 *   2. Booster: the mean of mouthPressLeft/mouthPressRight, minus the noise
 *      floor, adds 0.3x that excess to angry (capped at 1).
 *   3. Inhibition: every emotion whose score after step 2 exceeds 0.1 scales
 *      down its targets from EMOTION_INHIBITIONS. Suppressor strengths are
 *      read from a snapshot, so already-suppressed scores never suppress.
 *   4. Neutral: 1.0 while the strongest emotion is below 0.35, otherwise
 *      max(0, 1 - 1.5 * strongest).
 *
 * @param {Object<string, number>} bs - Blendshape name → score.
 * @returns {{neutral: number, happy: number, sad: number, angry: number,
 *   surprised: number, disgusted: number, fearful: number}}
 */
function mapBlendshapesToEmotions(bs) {
  const emotionNames = Object.keys(EMOTION_PROTOTYPES);

  // --- Stage 1: cosine similarity base scores ---
  const scores = {};
  emotionNames.forEach((name) => {
    const similarity = emotionCosineSimilarity(
      bs,
      EMOTION_PROTOTYPES[name],
      PROTOTYPE_MAGNITUDES[name]
    );
    scores[name] = Math.max(0, similarity);
  });

  // --- Stage 2: key-signal booster ---
  // mouthPress is kept out of the angry prototype (to avoid resting-state
  // contamination) and instead adds to angry when clearly present.
  const pressLeft = bs.mouthPressLeft || 0;
  const pressRight = bs.mouthPressRight || 0;
  const pressExcess = Math.max(0, (pressLeft + pressRight) / 2 - BLENDSHAPE_NOISE_FLOOR);
  if (pressExcess > 0) {
    scores.angry = Math.min(1, scores.angry + pressExcess * 0.3);
  }

  // --- Stage 3: cross-emotion inhibition ---
  // Suppressor strengths come from this snapshot, never from the scores
  // being rewritten below.
  const preInhibition = { ...scores };
  for (const suppressor of Object.keys(EMOTION_INHIBITIONS)) {
    const suppressorScore = preInhibition[suppressor] || 0;
    if (!(suppressorScore > 0.1)) {
      continue;
    }
    const targets = EMOTION_INHIBITIONS[suppressor];
    for (const target of Object.keys(targets)) {
      scores[target] *= (1 - suppressorScore * targets[target]);
    }
  }

  // --- Neutral: dominant when no emotion is confident ---
  const strongest = emotionNames.reduce(
    (best, name) => (scores[name] > best ? scores[name] : best),
    0
  );
  const neutralThreshold = 0.35;
  let neutral;
  if (strongest < neutralThreshold) {
    neutral = 1.0;
  } else {
    neutral = Math.max(0, 1.0 - strongest * 1.5);
  }

  return {
    neutral,
    happy: scores.happy || 0,
    sad: scores.sad || 0,
    angry: scores.angry || 0,
    surprised: scores.surprised || 0,
    disgusted: scores.disgusted || 0,
    fearful: scores.fearful || 0
  };
}
246
+
247
/**
 * Estimate head pose (pitch, yaw, roll in degrees) from face landmarks.
 *
 * Reads five landmark indices: 1 (nose tip), 152 (chin), 33 and 263 (eye
 * corners) and 10 (forehead). Yaw and pitch are measured along the face's own
 * axes (the eye line and the forehead→chin line) rather than along the image
 * x/y axes, so tilting the head in the image plane changes roll only. For an
 * upright face the two formulations agree.
 *
 * @param {Array<{x: number, y: number, z?: number}>|null|undefined} landmarks
 *   Landmark list indexed as described above.
 * @returns {{pitch: number, yaw: number, roll: number}} Pitch and yaw clamped
 *   to [-90, 90], roll to [-180, 180]. All zeros when the list or any of the
 *   five landmarks is missing; any component that evaluates to NaN is
 *   reported as 0.
 */
function computeHeadPoseFromLandmarks(landmarks) {
  // Key landmark indices (MediaPipe FaceMesh)
  const noseTip = landmarks?.[1];
  const chin = landmarks?.[152];
  const leftEye = landmarks?.[33];
  const rightEye = landmarks?.[263];
  const forehead = landmarks?.[10];

  if (!noseTip || !chin || !leftEye || !rightEye || !forehead) {
    return { pitch: 0, yaw: 0, roll: 0 };
  }

  const RAD_TO_DEG = 180 / Math.PI;

  // Eye line: its direction defines the face's horizontal axis.
  const eyeAxisX = rightEye.x - leftEye.x;
  const eyeAxisY = rightEye.y - leftEye.y;
  const eyeAxisLen = Math.sqrt(eyeAxisX * eyeAxisX + eyeAxisY * eyeAxisY);
  const eyeMidX = (leftEye.x + rightEye.x) / 2;
  const eyeMidY = (leftEye.y + rightEye.y) / 2;
  const eyeMidZ = (leftEye.z + rightEye.z) / 2 || 0;

  // Yaw: nose offset from the eye midpoint measured along the eye line,
  // against the nose-to-eyes depth. Falls back to the plain x offset when the
  // two eye landmarks coincide and no eye-line direction exists.
  const noseOffsetX = noseTip.x - eyeMidX;
  const noseOffsetY = noseTip.y - eyeMidY;
  const lateralOffset = eyeAxisLen > 1e-9
    ? (noseOffsetX * eyeAxisX + noseOffsetY * eyeAxisY) / eyeAxisLen
    : noseOffsetX;
  // The 0.001 keeps the angle defined when nose and eyes share the same depth.
  const depthOffset = Math.abs(noseTip.z - eyeMidZ) + 0.001;
  const yaw = Math.atan2(lateralOffset, depthOffset) * RAD_TO_DEG;

  // Pitch: where the nose falls along the forehead→chin segment
  // (0 = forehead, 1 = chin); the midpoint maps to 0 degrees.
  const faceAxisX = chin.x - forehead.x;
  const faceAxisY = chin.y - forehead.y;
  const faceAxisLenSq = faceAxisX * faceAxisX + faceAxisY * faceAxisY;
  const noseAlongFace = faceAxisLenSq > 0
    ? ((noseTip.x - forehead.x) * faceAxisX + (noseTip.y - forehead.y) * faceAxisY) / faceAxisLenSq
    : (noseTip.y - forehead.y) / 0.001;
  const pitch = (noseAlongFace - 0.5) * 180;

  // Roll: tilt of the eye line from the image's horizontal axis.
  const roll = Math.atan2(eyeAxisY, eyeAxisX) * RAD_TO_DEG;

  return {
    pitch: Number.isNaN(pitch) ? 0 : Math.max(-90, Math.min(90, pitch)),
    yaw: Number.isNaN(yaw) ? 0 : Math.max(-90, Math.min(90, yaw)),
    roll: Number.isNaN(roll) ? 0 : Math.max(-180, Math.min(180, roll))
  };
}
57
282
 
58
283
  /**
@@ -60,21 +285,18 @@ function log(...args) {
60
285
  */
61
286
  async function initializeVision() {
62
287
  if (isInitialized) {
63
- console.log('🔧 [CV Tasks Worker] Vision already initialized, skipping');
288
+ log('Vision already initialized, skipping');
64
289
  return;
65
290
  }
66
291
 
67
292
  try {
68
- console.log('🔧 [CV Tasks Worker] Starting MediaPipe Tasks Vision initialization...');
293
+ log('Starting MediaPipe Tasks Vision initialization...');
69
294
 
70
- // Initialize the vision runtime with WASM files
71
- // MediaPipe Tasks Vision expects the base path without trailing slash
72
295
  const wasmBasePath = '/dist/assets/wasm';
73
296
  log('WASM base path:', wasmBasePath);
74
297
 
75
- // Try different ways to access FilesetResolver
76
298
  const FilesetResolver = self.FilesetResolver || self.module.exports.FilesetResolver || self.exports.FilesetResolver;
77
- console.log('🔧 [CV Tasks Worker] FilesetResolver found:', !!FilesetResolver);
299
+ log('FilesetResolver found:', !!FilesetResolver);
78
300
 
79
301
  if (!FilesetResolver) {
80
302
  throw new Error('FilesetResolver not found in any expected location');
@@ -139,12 +361,13 @@ async function initializeFaceLandmarks() {
139
361
  delegate: 'GPU'
140
362
  },
141
363
  runningMode: 'VIDEO',
142
- numFaces: 1
364
+ numFaces: 1,
365
+ outputFaceBlendshapes: true
143
366
  };
144
367
 
145
368
  const FaceLandmarker = self.FaceLandmarker || self.module.exports.FaceLandmarker || self.exports.FaceLandmarker;
146
369
  faceLandmarker = await FaceLandmarker.createFromOptions(vision, options);
147
- log('✅ Face Landmarker loaded');
370
+ log('✅ Face Landmarker loaded (blendshapes enabled)');
148
371
  } catch (error) {
149
372
  log('❌ Failed to load Face Landmarker:', error);
150
373
  throw error;
@@ -256,42 +479,60 @@ async function processFrame(imageInput, timestamp, features) {
256
479
  if (features.includes('faceDetection') && faceDetector) {
257
480
  const detectionResult = faceDetector.detectForVideo(imageInput, timestamp);
258
481
  results.faces = detectionResult.detections.map((detection) => ({
259
- boundingBox: {
260
- // Normalize coordinates to 0-1 range to match other CV features
482
+ bounds: {
261
483
  x: detection.boundingBox.originX / imageInput.width,
262
484
  y: detection.boundingBox.originY / imageInput.height,
263
485
  width: detection.boundingBox.width / imageInput.width,
264
486
  height: detection.boundingBox.height / imageInput.height
265
487
  },
488
+ center: {
489
+ x: (detection.boundingBox.originX + detection.boundingBox.width / 2) / imageInput.width,
490
+ y: (detection.boundingBox.originY + detection.boundingBox.height / 2) / imageInput.height
491
+ },
266
492
  landmarks: [],
267
- expressions: {},
493
+ expressions: EMPTY_EXPRESSIONS,
494
+ headPose: EMPTY_HEAD_POSE,
495
+ blendshapes: EMPTY_BLENDSHAPES,
268
496
  confidence: detection.categories[0]?.score || 0
269
497
  }));
270
498
  }
271
499
 
272
- // Process face landmarks
273
- if (features.includes('faceMesh') && faceLandmarker) {
500
+ // Process face landmarks (used by faceMesh and emotionDetection)
501
+ if ((features.includes('faceMesh') || features.includes('emotionDetection')) && faceLandmarker) {
274
502
  const landmarkResult = faceLandmarker.detectForVideo(imageInput, timestamp);
275
503
  if (landmarkResult.faceLandmarks.length > 0) {
276
504
  const landmarks = landmarkResult.faceLandmarks[0];
277
505
 
278
- // If no face detection results exist, create a basic face structure
279
506
  if (!results.faces) {
280
507
  results.faces = [{
281
- boundingBox: null, // No bounding box when only mesh is enabled
508
+ bounds: null,
509
+ center: null,
282
510
  landmarks: [],
283
- expressions: {},
284
- confidence: 0.8 // Default confidence for mesh-only detection
511
+ expressions: EMPTY_EXPRESSIONS,
512
+ headPose: EMPTY_HEAD_POSE,
513
+ blendshapes: EMPTY_BLENDSHAPES,
514
+ confidence: 0.8
285
515
  }];
286
516
  }
287
517
 
288
- // Add landmarks to the first face (mesh only processes one face)
518
+ const mappedLandmarks = landmarks.map((landmark) => ({
519
+ x: landmark.x,
520
+ y: landmark.y,
521
+ z: landmark.z || 0
522
+ }));
523
+
289
524
  if (results.faces[0]) {
290
- results.faces[0].landmarks = landmarks.map((landmark) => ({
291
- x: landmark.x,
292
- y: landmark.y,
293
- z: landmark.z || 0
294
- }));
525
+ results.faces[0].landmarks = mappedLandmarks;
526
+ results.faces[0].headPose = computeHeadPoseFromLandmarks(landmarks);
527
+
528
+ // Populate emotion data when emotionDetection is active
529
+ if (features.includes('emotionDetection') &&
530
+ landmarkResult.faceBlendshapes &&
531
+ landmarkResult.faceBlendshapes.length > 0) {
532
+ const bs = buildBlendshapesRecord(landmarkResult.faceBlendshapes[0].categories);
533
+ results.faces[0].blendshapes = bs;
534
+ results.faces[0].expressions = mapBlendshapesToEmotions(bs);
535
+ }
295
536
  }
296
537
  }
297
538
  }
@@ -343,14 +584,7 @@ async function processFrame(imageInput, timestamp, features) {
343
584
  height: segmentResult.categoryMask.height
344
585
  };
345
586
 
346
- // Debug logging (temporary)
347
- if (DEBUG) {
348
- console.log('🔧 [CV Tasks Worker] Segmentation mask:', {
349
- width: results.segmentation.width,
350
- height: results.segmentation.height,
351
- maskSize: results.segmentation.mask.length
352
- });
353
- }
587
+ log('Segmentation mask:', results.segmentation.width, 'x', results.segmentation.height);
354
588
  } finally {
355
589
  // CRITICAL: Close MPMask instance to prevent resource leaks
356
590
  segmentResult.categoryMask.close();
@@ -449,8 +683,18 @@ async function handleConfigUpdateInternal(features) {
449
683
  faceDetector = null;
450
684
  break;
451
685
  case 'faceMesh':
452
- cleanupPromises.push(cleanupWasmInstance(faceLandmarker, 'FaceLandmarker'));
453
- faceLandmarker = null;
686
+ // Only teardown FaceLandmarker if emotionDetection also not active
687
+ if (!newFeatures.has('emotionDetection')) {
688
+ cleanupPromises.push(cleanupWasmInstance(faceLandmarker, 'FaceLandmarker'));
689
+ faceLandmarker = null;
690
+ }
691
+ break;
692
+ case 'emotionDetection':
693
+ // Only teardown FaceLandmarker if faceMesh also not active
694
+ if (!newFeatures.has('faceMesh')) {
695
+ cleanupPromises.push(cleanupWasmInstance(faceLandmarker, 'FaceLandmarker'));
696
+ faceLandmarker = null;
697
+ }
454
698
  break;
455
699
  case 'handTracking':
456
700
  cleanupPromises.push(cleanupWasmInstance(handLandmarker, 'HandLandmarker'));
@@ -485,6 +729,8 @@ async function handleConfigUpdateInternal(features) {
485
729
  await initializeFaceDetection();
486
730
  break;
487
731
  case 'faceMesh':
732
+ case 'emotionDetection':
733
+ // Both share the FaceLandmarker (with blendshapes enabled)
488
734
  await initializeFaceLandmarks();
489
735
  break;
490
736
  case 'handTracking':
@@ -525,10 +771,19 @@ async function handleConfigUpdate(features) {
525
771
  self.onmessage = async (event) => {
526
772
  const message = event.data;
527
773
 
528
- console.log('🔧 [CV Tasks Worker] Received message:', message.type, message);
774
+ // Only log non-process messages to avoid per-frame spam
775
+ if (message.type !== 'process') {
776
+ log('Received message:', message.type);
777
+ }
529
778
 
530
779
  try {
531
780
  switch (message.type) {
781
+ case 'debug': {
782
+ DEBUG = !!message.enabled;
783
+ log('Debug mode', DEBUG ? 'enabled' : 'disabled');
784
+ break;
785
+ }
786
+
532
787
  case 'init': {
533
788
  log('Received init message');
534
789