openface3 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,49 @@
1
+ # Python
2
+ .venv/
3
+ venv/
4
+ ENV/
5
+ env/
6
+ __pycache__/
7
+ *.py[cod]
8
+ *$py.class
9
+ *.so
10
+ .Python
11
+ *.egg-info/
12
+ dist/
13
+ build/
14
+
15
+ # IDEs
16
+ .vscode/
17
+ .idea/
18
+ *.swp
19
+ *.swo
20
+ *~
21
+
22
+ # OS
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Project-specific
27
+ weights/
28
+ outputs/
29
+ .of3mini_star/
30
+ output*.txt
31
+ *.npy
32
+ *.pkl
33
+
34
+ # Jupyter
35
+ .ipynb_checkpoints/
36
+
37
+ # VS Code debugging
38
+ .debugpy/
39
+
40
+ # Temporary files
41
+ *.tmp
42
+ *.log
43
+ logs
44
+ assets
45
+ datasets
46
+ *.gz
47
+ vis_output
48
+ CLAUDE.md
49
+ certs
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Haneesh Byreddy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,92 @@
1
+ Metadata-Version: 2.4
2
+ Name: openface3
3
+ Version: 0.1.0
4
+ Summary: Facial analysis pipeline: detection, 98-point alignment, and multitask emotion/gaze/AU inference
5
+ Project-URL: Homepage, https://github.com/haneeshbyreddy/openface3
6
+ Project-URL: Issues, https://github.com/haneeshbyreddy/openface3/issues
7
+ Project-URL: HuggingFace, https://huggingface.co/haneeshbyreddy/openface3-weights
8
+ License: MIT License
9
+
10
+ Copyright (c) 2026 Haneesh Byreddy
11
+
12
+ Permission is hereby granted, free of charge, to any person obtaining a copy
13
+ of this software and associated documentation files (the "Software"), to deal
14
+ in the Software without restriction, including without limitation the rights
15
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16
+ copies of the Software, and to permit persons to whom the Software is
17
+ furnished to do so, subject to the following conditions:
18
+
19
+ The above copyright notice and this permission notice shall be included in all
20
+ copies or substantial portions of the Software.
21
+
22
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28
+ SOFTWARE.
29
+ License-File: LICENSE
30
+ Keywords: action-units,emotion,face,facial-analysis,gaze
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
35
+ Requires-Python: >=3.9
36
+ Requires-Dist: albumentations
37
+ Requires-Dist: huggingface-hub
38
+ Requires-Dist: iglovikov-helper-functions
39
+ Requires-Dist: numpy
40
+ Requires-Dist: opencv-python
41
+ Requires-Dist: timm
42
+ Requires-Dist: torch>=2.0
43
+ Requires-Dist: torchvision>=0.15
44
+ Description-Content-Type: text/markdown
45
+
46
+ # openface3
47
+
48
+ Facial analysis pipeline: face detection, 98-point landmark alignment, and multitask emotion / gaze / action unit inference.
49
+
50
+ ## Install
51
+
52
+ ```bash
53
+ pip install openface3
54
+ ```
55
+
56
+ Weights are downloaded automatically from [Hugging Face](https://huggingface.co/haneeshbyreddy/openface3-weights) on first use and cached via the standard HF cache (`~/.cache/huggingface/hub/`).
57
+
58
+ ## Usage
59
+
60
+ ```python
61
+ import cv2
62
+ from openface3 import Pipeline
63
+
64
+ pipeline = Pipeline(device='cuda') # or 'cpu'
65
+
66
+ image = cv2.cvtColor(cv2.imread('image.jpg'), cv2.COLOR_BGR2RGB)
67
+ results, timings = pipeline.run(image)
68
+
69
+ for face in results:
70
+ print(face['emotion']) # (1, 8) logits — softmax to get probabilities
71
+ print(face['gaze']) # (1, 2) [horizontal_deg, vertical_deg]
72
+ print(face['au']) # (1, 8) AU activations in [0, 1]
73
+ print(face['landmarks']) # (98, 2) WFLW keypoints in image coords
74
+ ```
75
+
76
+ Serialise a frame to JSON:
77
+
78
+ ```python
79
+ payload = Pipeline.serialize(results, timings, frame_n=0, ts=0.0)
80
+ ```
81
+
82
+ ## Labels
83
+
84
+ ```python
85
+ from openface3 import EMOTION_LABELS, AU_IDS
86
+ # EMOTION_LABELS: ['Neutral','Happy','Sad','Surprise','Fear','Disgust','Anger','Contempt']
87
+ # AU_IDS: ['AU01','AU02','AU04','AU06','AU09','AU12','AU25','AU26']
88
+ ```
89
+
90
+ ## License
91
+
92
+ MIT
@@ -0,0 +1,47 @@
1
+ # openface3
2
+
3
+ Facial analysis pipeline: face detection, 98-point landmark alignment, and multitask emotion / gaze / action unit inference.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install openface3
9
+ ```
10
+
11
+ Weights are downloaded automatically from [Hugging Face](https://huggingface.co/haneeshbyreddy/openface3-weights) on first use and cached via the standard HF cache (`~/.cache/huggingface/hub/`).
12
+
13
+ ## Usage
14
+
15
+ ```python
16
+ import cv2
17
+ from openface3 import Pipeline
18
+
19
+ pipeline = Pipeline(device='cuda') # or 'cpu'
20
+
21
+ image = cv2.cvtColor(cv2.imread('image.jpg'), cv2.COLOR_BGR2RGB)
22
+ results, timings = pipeline.run(image)
23
+
24
+ for face in results:
25
+ print(face['emotion']) # (1, 8) logits — softmax to get probabilities
26
+ print(face['gaze']) # (1, 2) [horizontal_deg, vertical_deg]
27
+ print(face['au']) # (1, 8) AU activations in [0, 1]
28
+ print(face['landmarks']) # (98, 2) WFLW keypoints in image coords
29
+ ```
30
+
31
+ Serialise a frame to JSON:
32
+
33
+ ```python
34
+ payload = Pipeline.serialize(results, timings, frame_n=0, ts=0.0)
35
+ ```
36
+
37
+ ## Labels
38
+
39
+ ```python
40
+ from openface3 import EMOTION_LABELS, AU_IDS
41
+ # EMOTION_LABELS: ['Neutral','Happy','Sad','Surprise','Fear','Disgust','Anger','Contempt']
42
+ # AU_IDS: ['AU01','AU02','AU04','AU06','AU09','AU12','AU25','AU26']
43
+ ```
44
+
45
+ ## License
46
+
47
+ MIT
@@ -0,0 +1 @@
1
+ from .pipeline import Pipeline, EMOTION_LABELS, AU_IDS
@@ -0,0 +1,118 @@
1
+ import cv2
2
+ import numpy as np
3
+ import torch
4
+
5
+ from .models.landmark import StackedHGNetV1
6
+
7
+
8
+ # ── Model config ──────────────────────────────────────────────────────────────
9
+
10
class Config:
    """Static hyper-parameters handed to ``StackedHGNetV1`` by ``Aligner``."""

    # Network input resolution; also used as the coord-conv grid size.
    width = 256
    height = 256

    # Enables the edge-map / point-map heads in StackedHGNetV1.
    use_AAM = True

    # Read by StackedHGNetV1 as [num_heats, num_edges, num_points].
    classes_num = [98, 9, 98]

    nstack = 4
    add_coord = True
    decoder_type = 'default'

    # One (flag, landmark indices) pair per edge. E2HTransform only reads the
    # indices; the boolean presumably marks closed contours -- TODO confirm.
    # Landmarks 96 and 97 belong to no edge.
    edge_info = (
        (False, tuple(range(0, 33))),
        (True, tuple(range(33, 42))),
        (True, tuple(range(42, 51))),
        (False, tuple(range(51, 55))),
        (False, tuple(range(55, 60))),
        (True, tuple(range(60, 68))),
        (True, tuple(range(68, 76))),
        (True, tuple(range(76, 88))),
        (True, tuple(range(88, 96))),
    )
29
+
30
+
31
+ # ── Crop helpers (also used directly in eval scripts) ─────────────────────────
32
+
33
class GetCropMatrix:
    """Build the 3x3 matrix that maps a face region onto a square crop."""

    def __init__(self, image_size, target_face_scale, align_corners=False):
        self.image_size = image_size
        self.target_face_scale = target_face_scale
        self.align_corners = align_corners

    def process(self, scale, center_w, center_h):
        """Return the float32 source-image -> crop transform.

        Args:
            scale: Face size in units of 200 pixels.
            center_w: X coordinate of the face centre in the source image.
            center_h: Y coordinate of the face centre in the source image.

        Returns:
            A (3, 3) ``np.float32`` matrix that scales by
            ``image_size / (scale * target_face_scale * 200)`` and moves the
            face centre to the middle of the crop.
        """
        # With align_corners the crop spans [0, size - 1] instead of [0, size].
        extent = self.image_size - 1 if self.align_corners else self.image_size
        mid_x = extent / 2.0
        mid_y = extent / 2.0

        zoom = self.image_size / (scale * self.target_face_scale * 200.0)

        # The rotation is fixed at zero (cos = 1, sin = 0).
        a = zoom * 1.0
        b = zoom * 0.0

        rows = [
            [a, -b, mid_x - a * center_w + b * center_h],
            [b, a, mid_y - b * center_w - a * center_h],
            [0.0, 0.0, 1.0],
        ]
        return np.array(rows, dtype=np.float32)
51
+
52
+
53
class TransformPerspective:
    """Warp an image into a square crop using a 3x3 matrix."""

    def __init__(self, image_size):
        self.image_size = image_size

    def process(self, image, matrix):
        """Apply ``matrix`` to ``image`` and return the warped crop.

        The output is ``image_size`` x ``image_size``, sampled with bilinear
        interpolation; the border value is 0.
        """
        out_size = (self.image_size, self.image_size)
        return cv2.warpPerspective(
            image,
            matrix,
            dsize=out_size,
            flags=cv2.INTER_LINEAR,
            borderValue=0,
        )
61
+
62
+
63
+ # ── Aligner ───────────────────────────────────────────────────────────────────
64
+
65
class Aligner:
    """Landmark regressor: bbox crop -> ``StackedHGNetV1`` -> image-space points."""

    def __init__(self, model_path: str, device='cpu'):
        """Build the network and load the checkpoint stored at ``model_path``.

        The checkpoint is expected to be a mapping with the state dict under
        the ``'net'`` key.
        """
        self.device = torch.device(device)
        self.input_size = 256

        cfg = Config()
        network = StackedHGNetV1(
            config=cfg,
            classes_num=cfg.classes_num,
            edge_info=cfg.edge_info,
            nstack=cfg.nstack,
            add_coord=cfg.add_coord,
            decoder_type=cfg.decoder_type,
        )
        checkpoint = torch.load(model_path, map_location=self.device)
        # strict=False: keys that do not match the network are skipped silently.
        network.load_state_dict(checkpoint['net'], strict=False)
        self.model = network.to(self.device).eval()

        self._crop_matrix = GetCropMatrix(
            image_size=self.input_size,
            target_face_scale=1.0,
            align_corners=True,
        )
        self._warp = TransformPerspective(image_size=self.input_size)

    # ── preprocess ────────────────────────────────────────────────────────────

    def preprocess(self, image: np.ndarray, bbox):
        """Crop the face described by ``bbox`` = (x1, y1, x2, y2).

        Returns:
            ``(crop, matrix)`` where ``matrix`` is the image -> crop transform
            that produced ``crop``.
        """
        left, top, right, bottom = bbox
        center_x = (left + right) / 2
        center_y = (top + bottom) / 2
        # GetCropMatrix measures face size in units of 200 px.
        longest_side = max(right - left, bottom - top)
        matrix = self._crop_matrix.process(longest_side / 200.0, center_x, center_y)
        return self._warp.process(image, matrix), matrix

    # ── infer ─────────────────────────────────────────────────────────────────

    def infer(self, crop: np.ndarray) -> torch.Tensor:
        """(256,256,3) uint8 RGB → raw coords tensor (98,2) in [-1,1]."""
        # HWC -> 1xCxHxW, then rescale pixel values from [0, 255] to [-1, 1].
        batch = torch.from_numpy(crop).float().permute(2, 0, 1).unsqueeze(0)
        batch = (batch / 255.0) * 2.0 - 1.0
        with torch.inference_mode():
            outputs = self.model(batch.to(self.device))
        # The last model output is the final-stack landmarks; take batch item 0.
        final_landmarks = outputs[-1]
        return final_landmarks[0]

    # ── postprocess ───────────────────────────────────────────────────────────

    def postprocess(self, raw_coords: torch.Tensor, matrix: np.ndarray) -> np.ndarray:
        """Denormalise crop coordinates and project them back to the image.

        Args:
            raw_coords: Normalised landmark coordinates from ``infer``.
            matrix: The image -> crop matrix returned by ``preprocess``.

        Returns:
            Array with the same shape and dtype as the denormalised points,
            expressed in original image coordinates.
        """
        crop_pts = (((raw_coords + 1) * self.input_size - 1) / 2).cpu().numpy()
        crop_to_image = np.linalg.inv(matrix)

        xs, ys = crop_pts[:, 0], crop_pts[:, 1]
        image_pts = np.empty_like(crop_pts)
        # Apply the first two rows of the inverse transform, one per output axis.
        for axis in (0, 1):
            image_pts[:, axis] = (
                crop_to_image[axis, 0] * xs
                + crop_to_image[axis, 1] * ys
                + crop_to_image[axis, 2]
            )
        return image_pts

    def __call__(self, image: np.ndarray, bbox) -> np.ndarray:
        """bbox → (98,2) landmarks in original image space."""
        crop, matrix = self.preprocess(image, bbox)
        raw_coords = self.infer(crop)
        return self.postprocess(raw_coords, matrix)
@@ -0,0 +1,40 @@
1
+ import numpy as np
2
+ import torch
3
+ from torchvision import transforms
4
+
5
+ from .models.multitask import MTL
6
+
7
+
8
class Analyzer:
    """Runs the multitask ``MTL`` network on a single face crop."""

    def __init__(self, model_path: str, device='cpu'):
        """Load the ``MTL`` state dict from ``model_path`` onto ``device``."""
        self.device = torch.device(device)

        network = MTL().to(self.device)
        network.load_state_dict(torch.load(model_path, map_location=self.device))
        network.eval()
        self.model = network

        # ToTensor -> resize to 224x224 -> per-channel mean/std normalisation.
        steps = [
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]
        self.transform = transforms.Compose(steps)

    def preprocess(self, face_crop: np.ndarray) -> torch.Tensor:
        """(H,W,3) uint8 RGB → normalised tensor on device."""
        normalised = self.transform(face_crop)
        return normalised.unsqueeze(0).to(self.device)

    def infer(self, tensor: torch.Tensor):
        """Run the model on ``tensor`` under ``torch.inference_mode``."""
        with torch.inference_mode():
            outputs = self.model(tensor)
        return outputs

    def postprocess(self, raw) -> dict:
        """Convert the three model outputs into a dict of NumPy arrays.

        ``raw`` must unpack into exactly (emotion, gaze, au) tensors.
        """
        emotion_raw, gaze_raw, au_raw = raw
        heads = (('emotion', emotion_raw), ('gaze', gaze_raw), ('au', au_raw))
        return {name: tensor.cpu().numpy() for name, tensor in heads}

    def __call__(self, face_crop: np.ndarray) -> dict:
        """Run preprocess -> infer -> postprocess on one face crop."""
        tensor = self.preprocess(face_crop)
        raw = self.infer(tensor)
        return self.postprocess(raw)
@@ -0,0 +1,130 @@
1
+ from itertools import product
2
+ from math import ceil
3
+ import albumentations as A
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from iglovikov_helper_functions.utils.image_utils import pad_to_size, unpad_from_size
8
+ from torch.utils import model_zoo
9
+ from torchvision.ops import nms
10
+
11
+ from .models.retinaface import RetinaFace
12
+
13
+
14
+ # ── Prior box / decode utilities ─────────────────────────────────────────────
15
+
16
def priorbox(min_sizes, steps, clip, image_size):
    """Generate the anchor (prior) boxes for every feature-map level.

    Args:
        min_sizes: One list of anchor sizes in pixels per level.
        steps: Stride in pixels of each level.
        clip: When true, clamp every value to [0, 1].
        image_size: ``(height, width)`` of the network input.

    Returns:
        Float tensor of shape (num_anchors, 4) holding ``cx, cy, w, h``
        normalised by the image size, ordered level -> row -> column -> size.
    """
    img_h, img_w = image_size[0], image_size[1]

    anchors = []
    for level, stride in enumerate(steps):
        n_rows = ceil(img_h / stride)
        n_cols = ceil(img_w / stride)
        for row in range(n_rows):
            cy = (row + 0.5) * stride / img_h
            for col in range(n_cols):
                cx = (col + 0.5) * stride / img_w
                for size in min_sizes[level]:
                    anchors.extend((cx, cy, size / img_w, size / img_h))

    out = torch.Tensor(anchors).view(-1, 4)
    if clip:
        out.clamp_(max=1, min=0)
    return out
29
+
30
+
31
def decode(loc, priors, variances):
    """Turn box regressions into corner-form boxes.

    Args:
        loc: (N, 4) regression output, ``dx, dy, dw, dh``.
        priors: (N, 4) anchors as ``cx, cy, w, h``.
        variances: Pair of scale factors: [0] for centres, [1] for sizes.

    Returns:
        (N, 4) tensor of ``x1, y1, x2, y2`` in the same units as ``priors``.
    """
    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
38
+
39
+
40
def decode_landm(pre, priors, variances):
    """Turn the five landmark regressions into absolute points.

    Args:
        pre: (N, 10) regression output, five ``(dx, dy)`` pairs.
        priors: (N, 4) anchors as ``cx, cy, w, h``.
        variances: Scale factors; only ``variances[0]`` is used.

    Returns:
        (N, 10) tensor of five ``(x, y)`` pairs in the units of ``priors``.
    """
    anchor_xy = priors[:, :2]
    anchor_wh = priors[:, 2:]
    points = []
    for k in range(5):
        offset = pre[:, 2 * k:2 * k + 2]
        points.append(anchor_xy + offset * variances[0] * anchor_wh)
    return torch.cat(points, dim=1)
44
+
45
+
46
+ # ── Detector ──────────────────────────────────────────────────────────────────
47
+
48
# State dict downloaded by Detector.load_weights() via model_zoo.load_url.
WEIGHTS_URL = "https://github.com/ternaus/retinaface/releases/download/0.01/retinaface_resnet50_2020-07-20-f168fae3c.zip"
49
+
50
class Detector:
    """RetinaFace face detector: resize/pad -> network -> decode + NMS.

    The constructor only builds the network; call ``load_weights()`` and
    ``eval()`` before running detection.
    """

    def __init__(self, device='cpu', max_size=640):
        self.device = device
        self.max_size = max_size
        self.model = RetinaFace(
            name='Resnet50',
            pretrained=False,
            return_layers={'layer2': 1, 'layer3': 2, 'layer4': 3},
            in_channels=256,
            out_channels=256,
        ).to(device)
        self.transform = A.Compose([
            A.LongestMaxSize(max_size=max_size, p=1),
            A.Normalize(p=1),
        ])
        # Anchors are precomputed once for the fixed max_size x max_size input.
        anchors = priorbox(
            min_sizes=[[16, 32], [64, 128], [256, 512]],
            steps=[8, 16, 32],
            clip=False,
            image_size=(max_size, max_size),
        )
        self.prior_box = anchors.to(device)
        self.variance = [0.1, 0.2]
        # Multipliers that turn normalised coordinates into padded-input pixels.
        self._scale_bbox = torch.tensor([max_size] * 4, dtype=torch.float32).to(device)
        self._scale_landm = torch.tensor([max_size, max_size] * 5, dtype=torch.float32).to(device)

    def load_weights(self):
        """Download (or reuse the cached) weights and load them into the model."""
        weights = model_zoo.load_url(WEIGHTS_URL, progress=True, map_location='cpu')
        self.model.load_state_dict(weights)

    def eval(self):
        """Switch the wrapped model to evaluation mode."""
        self.model.eval()

    def preprocess(self, image: np.ndarray):
        """Resize, normalise and pad ``image`` to ``max_size`` x ``max_size``.

        Returns:
            ``(tensor, meta)``: a 1xCxHxW tensor on ``self.device`` and the
            padding, resize factor and original size that ``postprocess``
            needs to map detections back.
        """
        height, width = image.shape[:2]
        resized = self.transform(image=image)['image']
        padded = pad_to_size(target_size=(self.max_size, self.max_size), image=resized)
        chw = np.transpose(padded['image'], (2, 0, 1))
        batch = torch.from_numpy(chw).to(self.device).unsqueeze(0)
        meta = {
            'pads': padded['pads'],
            'resize_coeff': max(height, width) / self.max_size,
            'original_size': (height, width),
        }
        return batch, meta

    def infer(self, tensor: torch.Tensor):
        """Run the model on ``tensor`` under ``torch.inference_mode``."""
        with torch.inference_mode():
            outputs = self.model(tensor)
        return outputs

    def postprocess(self, raw, meta, confidence_threshold=0.7, nms_threshold=0.4):
        """Decode raw network output into detections in original image pixels.

        Args:
            raw: Model output, unpacked as ``(loc, conf, land)``.
            meta: The dict produced by ``preprocess``.
            confidence_threshold: Minimum score a candidate must exceed.
            nms_threshold: IoU threshold passed to ``nms``.

        Returns:
            List of dicts with ``'bbox'`` ([x1, y1, x2, y2] ints), ``'score'``
            (float) and ``'landmarks'`` (list of [x, y] pairs). Empty when
            nothing survives filtering.
        """
        loc, conf, land = raw
        probs = F.softmax(conf, dim=-1)
        boxes = decode(loc[0], self.prior_box, self.variance) * self._scale_bbox
        scores = probs[0][:, 1]
        landmarks = decode_landm(land[0], self.prior_box, self.variance) * self._scale_landm

        # Keep confident candidates, best first.
        confident = torch.where(scores > confidence_threshold)[0]
        boxes, landmarks, scores = boxes[confident], landmarks[confident], scores[confident]
        ranking = scores.argsort(descending=True)
        boxes, landmarks, scores = boxes[ranking], landmarks[ranking], scores[ranking]

        kept = nms(boxes, scores, nms_threshold)
        boxes = boxes[kept].int()
        if boxes.shape[0] == 0:
            return []

        landmarks = landmarks[kept].cpu().numpy().reshape(-1, 2)
        scores = scores[kept].cpu().numpy().astype(np.float64)
        boxes = boxes.cpu().numpy()

        # Undo the padding, then the resize, to reach original-image pixels.
        unpadded = unpad_from_size(meta['pads'], bboxes=boxes, keypoints=landmarks)
        factor = meta['resize_coeff']
        boxes = (unpadded['bboxes'] * factor).astype(int)
        landmarks = (unpadded['keypoints'].reshape(-1, 10) * factor).astype(int)
        orig_h, orig_w = meta['original_size']

        detections = []
        for box, score, points in zip(boxes, scores, landmarks):
            x1 = int(np.clip(box[0], 0, orig_w - 1))
            x2 = int(np.clip(box[2], x1 + 1, orig_w - 1))
            y1 = int(np.clip(box[1], 0, orig_h - 1))
            y2 = int(np.clip(box[3], y1 + 1, orig_h - 1))
            # Drop boxes that collapse to zero width or height after clipping.
            if x1 >= x2 or y1 >= y2:
                continue
            detections.append({
                'bbox': [x1, y1, x2, y2],
                'score': float(score),
                'landmarks': points.reshape(-1, 2).tolist(),
            })
        return detections

    def __call__(self, image: np.ndarray, confidence_threshold=0.7, nms_threshold=0.4):
        """Detect faces in ``image``; see ``postprocess`` for the result format."""
        tensor, meta = self.preprocess(image)
        raw = self.infer(tensor)
        return self.postprocess(raw, meta, confidence_threshold, nms_threshold)
File without changes
@@ -0,0 +1,287 @@
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+
6
+
7
+ # ── CoordConv ─────────────────────────────────────────────────────────────────
8
+
9
class AddCoordsTh(nn.Module):
    """Append normalised coordinate channels (CoordConv) to a feature map.

    The coordinate grids are built with shape (batch, 1, x_dim, y_dim), so the
    input's last two dimensions must be ``(x_dim, y_dim)``.
    """

    def __init__(self, x_dim, y_dim, with_r=False, with_boundary=False):
        super().__init__()
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.with_r = with_r
        self.with_boundary = with_boundary

    def forward(self, input_tensor, heatmap=None):
        """Concatenate coordinate channels onto ``input_tensor``.

        Channels appended, in order: x ramp, y ramp, optionally the radius
        (``with_r``), and optionally the x/y ramps masked by the last
        heatmap channel (``with_boundary`` and a heatmap is given).
        """
        batch = input_tensor.shape[0]
        device = input_tensor.device

        # Ramps 0..dim-1 rescaled to [-1, 1].
        x_ramp = torch.arange(self.x_dim, dtype=torch.float32, device=device) / (self.x_dim - 1) * 2 - 1
        y_ramp = torch.arange(self.y_dim, dtype=torch.float32, device=device) / (self.y_dim - 1) * 2 - 1

        # xx varies along dim 2, yy along dim 3.
        xx_channel = x_ramp.view(1, 1, self.x_dim, 1).repeat(batch, 1, 1, self.y_dim)
        yy_channel = y_ramp.view(1, 1, 1, self.y_dim).repeat(batch, 1, self.x_dim, 1)

        pieces = [input_tensor, xx_channel, yy_channel]

        if self.with_r:
            radius = torch.sqrt(xx_channel ** 2 + yy_channel ** 2)
            pieces.append(radius / torch.max(radius))

        if self.with_boundary and heatmap is not None:
            # Keep coordinates only where the last heatmap channel exceeds 0.05.
            boundary = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0)
            zero = torch.zeros_like(xx_channel)
            pieces.append(torch.where(boundary > 0.05, xx_channel, zero))
            pieces.append(torch.where(boundary > 0.05, yy_channel, zero))

        return torch.cat(pieces, dim=1)
52
+
53
+
54
class CoordConvTh(nn.Module):
    """``AddCoordsTh`` followed by a convolution and optional BN / ReLU.

    Extra positional and keyword arguments are forwarded to ``nn.Conv2d``.
    """

    def __init__(self, x_dim, y_dim, with_r, with_boundary,
                 in_channels, out_channels, first_one=False,
                 relu=False, bn=False, *args, **kwargs):
        super().__init__()
        self.addcoords = AddCoordsTh(x_dim=x_dim, y_dim=y_dim,
                                     with_r=with_r, with_boundary=with_boundary)

        # Channels added by AddCoordsTh: x/y, optional radius, and optional
        # boundary x/y (absent in the first stack, which gets no heatmap).
        extra_channels = 2
        if with_r:
            extra_channels += 1
        if with_boundary and not first_one:
            extra_channels += 2

        self.conv = nn.Conv2d(*args, in_channels=in_channels + extra_channels,
                              out_channels=out_channels, **kwargs)
        self.relu = nn.ReLU() if relu else None
        self.bn = nn.BatchNorm2d(out_channels) if bn else None
        self.with_boundary = with_boundary
        self.first_one = first_one

    def forward(self, input_tensor, heatmap=None):
        """Add coordinate channels, convolve, then apply BN and ReLU if set.

        A heatmap must be supplied exactly when boundary channels are
        enabled and this is not the first stack.
        """
        assert (self.with_boundary and not self.first_one) == (heatmap is not None)
        out = self.conv(self.addcoords(input_tensor, heatmap))
        for layer in (self.bn, self.relu):
            if layer is not None:
                out = layer(out)
        return out
79
+
80
+
81
+ # ── Decoder ───────────────────────────────────────────────────────────────────
82
+
83
+ class _DecoderDefault:
84
+ def __init__(self, weight=1, use_weight_map=False):
85
+ self.weight = weight
86
+ self.use_weight_map = use_weight_map
87
+
88
+ def _make_grid(self, h, w):
89
+ yy, xx = torch.meshgrid(
90
+ torch.arange(h).float() / (h - 1) * 2 - 1,
91
+ torch.arange(w).float() / (w - 1) * 2 - 1)
92
+ return yy, xx
93
+
94
+ def get_coords_from_heatmap(self, heatmap):
95
+ batch, npoints, h, w = heatmap.shape
96
+ if self.use_weight_map:
97
+ heatmap = heatmap * self.weight
98
+ yy, xx = self._make_grid(h, w)
99
+ yy = yy.view(1, 1, h, w).to(heatmap)
100
+ xx = xx.view(1, 1, h, w).to(heatmap)
101
+ heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6)
102
+ yy_coord = (yy * heatmap).sum([2, 3]) / heatmap_sum
103
+ xx_coord = (xx * heatmap).sum([2, 3]) / heatmap_sum
104
+ return torch.stack([xx_coord, yy_coord], dim=-1)
105
+
106
+
107
def get_decoder(decoder_type='default'):
    """Return a decoder instance for ``decoder_type``.

    Only ``'default'`` is supported.

    Raises:
        NotImplementedError: For any other decoder type.
    """
    if decoder_type != 'default':
        raise NotImplementedError(f"Unknown decoder: {decoder_type}")
    return _DecoderDefault()
111
+
112
+
113
+ # ── Building blocks ───────────────────────────────────────────────────────────
114
+
115
class Activation(nn.Module):
    """Optional normalisation followed by an optional activation.

    Args:
        kind: Either ``'<act>'`` or ``'<norm>+<act>'``. Norms: ``in``, ``bn``,
            ``bn_noaffine``, ``none``. Activations: ``relu``, ``softplus``,
            ``exp``, ``sigmoid``, ``tanh``, ``none``.
        channel: Number of channels; only needed by the batch-norm variants.

    Raises:
        KeyError: If the norm or activation name is not recognised.
    """

    def __init__(self, kind='relu', channel=None):
        super().__init__()
        self.kind = kind
        norm_str, act_str = kind.split('+') if '+' in kind else ('none', kind)

        # Build the norm lazily: constructing nn.BatchNorm2d(channel) up front
        # fails when channel is None, even for kinds that use no batch norm.
        norm_factories = {
            'in': lambda: F.instance_norm,
            'bn': lambda: nn.BatchNorm2d(channel),
            'bn_noaffine': lambda: nn.BatchNorm2d(channel, affine=False, track_running_stats=True),
            'none': lambda: None,
        }
        self.norm_fn = norm_factories[norm_str]()
        self.act_fn = {'relu': F.relu, 'softplus': nn.Softplus(), 'exp': torch.exp,
                       'sigmoid': torch.sigmoid, 'tanh': torch.tanh, 'none': None}[act_str]

    def forward(self, x):
        """Apply the norm (if any) and then the activation (if any)."""
        if self.norm_fn is not None:
            x = self.norm_fn(x)
        if self.act_fn is not None:
            x = self.act_fn(x)
        return x
131
+
132
+
133
class ConvBlock(nn.Module):
    """Conv2d with ``(kernel_size - 1) // 2`` padding, then optional BN and ReLU."""

    def __init__(self, inp_dim, out_dim, kernel_size=3, stride=1, bn=False, relu=True, groups=1):
        super().__init__()
        pad = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(inp_dim, out_dim, kernel_size=kernel_size, stride=stride,
                              padding=pad, groups=groups, bias=True)
        self.relu = nn.ReLU() if relu else None
        self.bn = nn.BatchNorm2d(out_dim) if bn else None

    def forward(self, x):
        """Apply the convolution, then batch norm and ReLU when enabled."""
        out = self.conv(x)
        for layer in (self.bn, self.relu):
            if layer is not None:
                out = layer(out)
        return out
146
+
147
+
148
class ResBlock(nn.Module):
    """Pre-activation bottleneck: three BN -> ReLU -> conv stages plus a skip.

    ``mid_dim`` defaults to ``out_dim // 2``. A 1x1 convolution is used on the
    skip path only when the input and output widths differ.
    """

    def __init__(self, inp_dim, out_dim, mid_dim=None):
        super().__init__()
        if not mid_dim:
            mid_dim = out_dim // 2
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(inp_dim)
        self.conv1 = ConvBlock(inp_dim, mid_dim, 1, relu=False)
        self.bn2 = nn.BatchNorm2d(mid_dim)
        self.conv2 = ConvBlock(mid_dim, mid_dim, 3, relu=False)
        self.bn3 = nn.BatchNorm2d(mid_dim)
        self.conv3 = ConvBlock(mid_dim, out_dim, 1, relu=False)
        self.need_skip = inp_dim != out_dim
        if self.need_skip:
            self.skip_layer = ConvBlock(inp_dim, out_dim, 1, relu=False)
        else:
            self.skip_layer = None

    def forward(self, x):
        """Return the bottleneck output added to the (possibly projected) input."""
        shortcut = self.skip_layer(x) if self.need_skip else x
        out = x
        stages = (
            (self.bn1, self.conv1),
            (self.bn2, self.conv2),
            (self.bn3, self.conv3),
        )
        for norm, conv in stages:
            out = conv(self.relu(norm(out)))
        return out + shortcut
166
+
167
+
168
class Hourglass(nn.Module):
    """Recursive hourglass module with ``n`` down/up-sampling levels.

    Args:
        n: Number of levels; the innermost level uses a plain ``ResBlock``.
        f: Channel count at this level.
        increase: Extra channels added at each deeper level.
        up_mode: Interpolation mode for ``nn.Upsample``.
        add_coord: Prepend a ``CoordConvTh`` layer (outermost level only;
            nested levels are always built without it).
        first_one: Forwarded to ``CoordConvTh``.
        x_dim, y_dim: Grid size forwarded to ``CoordConvTh``.
    """

    def __init__(self, n, f, increase=0, up_mode='nearest',
                 add_coord=False, first_one=False, x_dim=64, y_dim=64):
        super().__init__()
        nf = f + increase

        if add_coord:
            self.coordconv = CoordConvTh(
                x_dim=x_dim, y_dim=y_dim, with_r=True, with_boundary=True,
                relu=False, bn=False, in_channels=f, out_channels=f,
                first_one=first_one, kernel_size=1, stride=1, padding=0)
        else:
            self.coordconv = None

        self.up1 = ResBlock(f, f)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.low1 = ResBlock(f, nf)
        self.n = n
        if n > 1:
            self.low2 = Hourglass(n=n - 1, f=nf, increase=increase,
                                  up_mode=up_mode, add_coord=False)
        else:
            self.low2 = ResBlock(nf, nf)
        self.low3 = ResBlock(nf, f)
        self.up2 = nn.Upsample(scale_factor=2, mode=up_mode)

    def forward(self, x, heatmap=None):
        """Sum the full-resolution branch with the pooled, upsampled branch."""
        if self.coordconv is not None:
            x = self.coordconv(x, heatmap)

        skip = self.up1(x)

        low = self.pool1(x)
        low = self.low1(low)
        low = self.low2(low)
        low = self.low3(low)

        return skip + self.up2(low)
190
+
191
+
192
class E2HTransform(nn.Module):
    """Fixed 1x1 convolution that spreads edge maps onto their landmarks.

    ``weight[p, e]`` is 1 when landmark ``p`` is listed under edge ``e``.
    ``bias[p]`` is 1 for landmarks that belong to no edge, so their output
    is 1 instead of 0. Both are buffers, not trainable parameters.
    """

    def __init__(self, edge_info, num_points, num_edges):
        super().__init__()
        membership = torch.zeros(num_points, num_edges)
        # The first element of every edge_info entry is not used here.
        for edge_id, (_, point_ids) in enumerate(edge_info):
            for point_id in point_ids:
                membership[point_id, edge_id] = 1.0

        self.register_buffer('weight', membership.view(num_points, num_edges, 1, 1))

        uncovered = membership.sum(dim=1) < 0.5
        self.register_buffer('bias', uncovered.to(membership))

    def forward(self, edgemaps):
        """Map (B, num_edges, H, W) edge maps to (B, num_points, H, W)."""
        return F.conv2d(edgemaps, weight=self.weight, bias=self.bias)
206
+
207
+
208
+ # ── StackedHGNetV1 ────────────────────────────────────────────────────────────
209
+
210
class StackedHGNetV1(nn.Module):
    """Stacked hourglass network that regresses landmark coordinates.

    Args:
        config: Object providing ``width``, ``height`` and ``use_AAM``.
        classes_num: ``[num_heats, num_edges, num_points]``.
        edge_info: Per-edge ``(flag, landmark indices)`` for ``E2HTransform``.
        nstack: Number of stacked hourglasses.
        nlevels: Depth of each hourglass.
        in_channel: Channel width of the hourglass features.
        increase: Extra channels per hourglass level.
        add_coord: Use coord-conv layers in the stem and in each hourglass.
        decoder_type: Name passed to ``get_decoder``.

    Note:
        The coord-conv layers build their grids as (x_dim, y_dim) from
        (width, height), so with ``add_coord`` the input must be square.
    """

    @staticmethod
    def _stem_output_size(size):
        """Spatial size after the stem: 7x7 stride-2 pad-3 conv, then 2x2 max-pool."""
        after_conv = (size + 2 * 3 - 7) // 2 + 1
        return after_conv // 2

    def __init__(self, config, classes_num, edge_info,
                 nstack=4, nlevels=4, in_channel=256, increase=0,
                 add_coord=True, decoder_type='default'):
        super().__init__()
        self.cfg = config
        self.decoder = get_decoder(decoder_type)
        self.nstack = nstack
        self.add_coord = add_coord
        self.num_heats = classes_num[0]

        first_conv = CoordConvTh(x_dim=config.width, y_dim=config.height,
                                 with_r=True, with_boundary=False,
                                 relu=True, bn=True, in_channels=3, out_channels=64,
                                 kernel_size=7, stride=2, padding=3
                                 ) if add_coord else ConvBlock(3, 64, 7, 2, bn=True, relu=True)

        self.pre = nn.Sequential(first_conv, ResBlock(64, 128),
                                 nn.MaxPool2d(2, 2), ResBlock(128, 128), ResBlock(128, in_channel))

        # The hourglass coord-conv grid must match the stem's output size.
        # That size depends only on the stem geometry, not on nstack, so
        # dividing by nstack was only right for nstack == 4.
        feat_x = self._stem_output_size(config.width)
        feat_y = self._stem_output_size(config.height)

        self.hgs = nn.ModuleList([
            Hourglass(n=nlevels, f=in_channel, increase=increase, add_coord=add_coord,
                      first_one=(i == 0),
                      x_dim=feat_x,
                      y_dim=feat_y)
            for i in range(nstack)])

        self.features = nn.ModuleList([
            nn.Sequential(ResBlock(in_channel, in_channel),
                          ConvBlock(in_channel, in_channel, 1, bn=True, relu=True))
            for _ in range(nstack)])

        self.out_heatmaps = nn.ModuleList([ConvBlock(in_channel, self.num_heats, 1, relu=False, bn=False) for _ in range(nstack)])
        # Merge layers feed one stack into the next, so there are nstack - 1.
        self.merge_features = nn.ModuleList([ConvBlock(in_channel, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
        self.merge_heatmaps = nn.ModuleList([ConvBlock(self.num_heats, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
        self.heatmap_act = Activation("in+relu", self.num_heats)

        if config.use_AAM:
            self.num_edges = classes_num[1]
            self.num_points = classes_num[2]
            self.e2h = E2HTransform(edge_info, self.num_points, self.num_edges)
            self.out_edgemaps = nn.ModuleList([ConvBlock(in_channel, self.num_edges, 1, relu=False, bn=False) for _ in range(nstack)])
            self.out_pointmaps = nn.ModuleList([ConvBlock(in_channel, self.num_points, 1, relu=False, bn=False) for _ in range(nstack)])
            self.merge_edgemaps = nn.ModuleList([ConvBlock(self.num_edges, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
            self.merge_pointmaps = nn.ModuleList([ConvBlock(self.num_points, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
            self.edgemap_act = Activation("sigmoid", self.num_edges)
            self.pointmap_act = Activation("sigmoid", self.num_points)

    def forward(self, x):
        """Run all stacks and return ``(y, fusionmaps, landmarks)``.

        ``y`` collects, per stack, the landmarks and (with ``use_AAM``) the
        point maps and edge maps. ``fusionmaps`` holds the fused heatmap of
        each stack. ``landmarks`` is the decoder output of the last stack.
        """
        x = self.pre(x)
        y, fusionmaps, heatmaps = [], [], None

        for i in range(self.nstack):
            # The previous stack's heatmaps drive the boundary coord channels.
            hg = self.hgs[i](x, heatmap=heatmaps)
            feature = self.features[i](hg)
            heatmaps = self.heatmap_act(self.out_heatmaps[i](feature))

            if self.cfg.use_AAM:
                pointmaps = self.pointmap_act(self.out_pointmaps[i](feature))
                edgemaps = self.edgemap_act(self.out_edgemaps[i](feature))
                fusion = self.e2h(edgemaps) * pointmaps * heatmaps
            else:
                fusion = heatmaps

            landmarks = self.decoder.get_coords_from_heatmap(fusion)

            if i < self.nstack - 1:
                x = x + self.merge_features[i](feature) + self.merge_heatmaps[i](heatmaps)
                if self.cfg.use_AAM:
                    x += self.merge_pointmaps[i](pointmaps) + self.merge_edgemaps[i](edgemaps)

            y.append(landmarks)
            if self.cfg.use_AAM:
                y.append(pointmaps)
                y.append(edgemaps)
            fusionmaps.append(fusion)

        return y, fusionmaps, landmarks
@@ -0,0 +1,93 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import timm
7
+
8
+
9
+ # ── Graph Neural Network (AU head) ────────────────────────────────────────────
10
+
11
+ def _normalize_digraph(A):
12
+ b, n, _ = A.shape
13
+ degs_inv_sqrt = A.detach().sum(dim=-1) ** -0.5
14
+ norm = torch.eye(n, device=A.device).view(1, n, n) * degs_inv_sqrt.view(b, n, 1)
15
+ return torch.bmm(torch.bmm(norm, A), norm)
16
+
17
+
18
class GNN(nn.Module):
    """Graph layer that links each node to its nearest neighbours.

    The adjacency is rebuilt on every forward pass from the (detached) node
    features using the configured similarity metric, then used to aggregate
    linearly projected features with a residual connection.

    Args:
        in_channels: Feature size of every node.
        num_classes: Channel count handed to ``BatchNorm1d``; the input is
            unpacked as ``(b, n, c)``, so presumably ``n == num_classes``.
        neighbor_num: ``k`` used for the top-k neighbour selection.
        metric: One of ``'dots'``, ``'cosine'`` or ``'l1'``.
    """

    def __init__(self, in_channels, num_classes, neighbor_num=4, metric='dots'):
        super().__init__()
        self.metric = metric
        self.neighbor_num = neighbor_num
        self.U = nn.Linear(in_channels, in_channels)
        self.V = nn.Linear(in_channels, in_channels)
        self.bnv = nn.BatchNorm1d(num_classes)
        self.relu = nn.ReLU()

        weight_std = math.sqrt(2. / in_channels)
        for projection in (self.U, self.V):
            nn.init.normal_(projection.weight, 0, weight_std)
        nn.init.ones_(self.bnv.weight)
        nn.init.zeros_(self.bnv.bias)

    def forward(self, x):
        """Aggregate neighbour features for ``x`` of shape ``(b, n, c)``.

        Raises:
            ValueError: If ``self.metric`` is not a supported metric name.
        """
        batch, nodes, _ = x.shape
        feats = x.detach()  # the graph structure itself receives no gradient

        metric = self.metric
        if metric == 'l1':
            sim = torch.abs(feats.unsqueeze(2) - feats.unsqueeze(1)).sum(dim=-1)
        elif metric == 'dots' or metric == 'cosine':
            if metric == 'cosine':
                feats = F.normalize(feats, p=2, dim=-1)
            sim = torch.einsum('bij,bjk->bik', feats, feats.transpose(1, 2))
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

        # For 'l1' the score is a distance, so the smallest values are the
        # neighbours; for the other metrics the largest values are.
        pick_largest = metric != 'l1'
        top_values = sim.topk(k=self.neighbor_num, dim=-1, largest=pick_largest)[0]
        kth_value = top_values[:, :, -1].view(batch, nodes, 1)
        if pick_largest:
            adj = (sim >= kth_value).float()
        else:
            adj = (sim <= kth_value).float()

        A = _normalize_digraph(adj)
        aggregated = torch.einsum('bij,bjk->bik', A, self.V(x))
        return self.relu(x + self.bnv(aggregated + self.U(x)))
51
+
52
+
53
class AUHead(nn.Module):
    """Per-class head: one linear projection per class, a GNN over the
    resulting class nodes, then a cosine-style score against a learned
    per-class vector.

    Args:
        in_channels: Size of the incoming feature vector.
        num_classes: Number of classes (one node and one output per class).
        neighbor_num: Forwarded to ``GNN``.
        metric: Forwarded to ``GNN``.
    """

    def __init__(self, in_channels, num_classes, neighbor_num=4, metric='dots'):
        super().__init__()
        projections = [nn.Linear(in_channels, in_channels) for _ in range(num_classes)]
        self.class_linears = nn.ModuleList(projections)
        self.gnn = GNN(in_channels, num_classes, neighbor_num=neighbor_num, metric=metric)
        self.sc = nn.Parameter(torch.zeros(num_classes, in_channels))
        self.relu = nn.ReLU()
        nn.init.xavier_uniform_(self.sc)

    def forward(self, x):
        """Return one score per class for every sample in ``x``."""
        # One node per class, stacked along a new dimension 1.
        per_class = [linear(x) for linear in self.class_linears]
        nodes = torch.stack(per_class, dim=1)
        refined = self.gnn(nodes)

        # Both sides are L2-normalised, so the summed product is the cosine
        # between each node and its (ReLU-ed) class vector.
        class_vectors = F.normalize(self.relu(self.sc), p=2, dim=-1)
        node_vectors = F.normalize(refined, p=2, dim=-1)
        return (node_vectors * class_vectors.unsqueeze(0)).sum(dim=-1)
68
+
69
+
70
+ # ── MTL Model ─────────────────────────────────────────────────────────────────
71
+
72
class MTL(nn.Module):
    """Multi-task network: a shared timm backbone with emotion, gaze and
    action-unit branches.

    Args:
        base_model_name: Model name passed to ``timm.create_model``.
        expr_classes: Number of emotion outputs.
        au_numbers: Number of action-unit outputs.
    """

    def __init__(self, base_model_name='tf_efficientnet_b0_ns', expr_classes=8, au_numbers=8):
        super().__init__()
        # The backbone is created without pretrained weights and its
        # classifier is replaced by an identity so it emits raw features.
        self.base_model = timm.create_model(base_model_name, pretrained=False)
        self.base_model.classifier = nn.Identity()
        feature_dim = self.base_model.num_features

        self.relu = nn.ReLU()
        # One task-specific projection of the shared features per branch.
        self.fc_emotion = nn.Linear(feature_dim, feature_dim)
        self.fc_gaze = nn.Linear(feature_dim, feature_dim)
        self.fc_au = nn.Linear(feature_dim, feature_dim)

        self.emotion_classifier = nn.Linear(feature_dim, expr_classes)
        self.gaze_regressor = nn.Linear(feature_dim, 2)
        self.au_regressor = AUHead(
            in_channels=feature_dim,
            num_classes=au_numbers,
            neighbor_num=4,
            metric='dots',
        )

    def forward(self, x):
        """Return ``(emotion, gaze, au)`` computed from the shared features."""
        shared = self.base_model(x)

        emotion_feat = self.relu(self.fc_emotion(shared))
        emotion = self.emotion_classifier(emotion_feat)

        gaze_feat = self.relu(self.fc_gaze(shared))
        gaze = self.gaze_regressor(gaze_feat)

        au_feat = self.relu(self.fc_au(shared))
        au = self.au_regressor(au_feat)

        return emotion, gaze, au
@@ -0,0 +1,121 @@
1
+ from typing import Dict, List, Tuple
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torchvision import models
7
+ from torchvision.models import _utils
8
+
9
+
10
+ # ── FPN / SSH building blocks ─────────────────────────────────────────────────
11
+
12
+ def _conv_bn(inp, oup, stride=1, leaky=0):
13
+ return nn.Sequential(
14
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
15
+ nn.BatchNorm2d(oup),
16
+ nn.LeakyReLU(negative_slope=leaky, inplace=True))
17
+
18
+ def _conv_bn_no_relu(inp, oup, stride):
19
+ return nn.Sequential(
20
+ nn.Conv2d(inp, oup, 3, stride, 1, bias=False),
21
+ nn.BatchNorm2d(oup))
22
+
23
+ def _conv_bn1x1(inp, oup, stride, leaky=0):
24
+ return nn.Sequential(
25
+ nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False),
26
+ nn.BatchNorm2d(oup),
27
+ nn.LeakyReLU(negative_slope=leaky, inplace=True))
28
+
29
+
30
class SSH(nn.Module):
    """Context module with three parallel 3x3 branches of increasing depth.

    The branches contribute ``out_channel // 2``, ``out_channel // 4`` and
    ``out_channel // 4`` channels respectively and are concatenated along
    the channel axis before a final ReLU.

    Raises:
        ValueError: If ``out_channel`` is not a multiple of 4.
    """

    def __init__(self, in_channel, out_channel):
        super().__init__()
        if out_channel % 4:
            raise ValueError(f"out_channel must be divisible by 4, got {out_channel}")

        # Narrow configurations (<= 64 channels) use a leaky slope of 0.1.
        leaky = 0 if out_channel > 64 else 0.1
        half = out_channel // 2
        quarter = out_channel // 4

        self.conv3X3 = _conv_bn_no_relu(in_channel, half, stride=1)
        self.conv5X5_1 = _conv_bn(in_channel, quarter, stride=1, leaky=leaky)
        self.conv5X5_2 = _conv_bn_no_relu(quarter, quarter, stride=1)
        self.conv7X7_2 = _conv_bn(quarter, quarter, stride=1, leaky=leaky)
        self.conv7x7_3 = _conv_bn_no_relu(quarter, quarter, stride=1)

    def forward(self, x):
        """Return the ReLU of the channel-wise concatenation of all branches."""
        branch_a = self.conv3X3(x)
        # The second and third branches share their first convolution.
        shared = self.conv5X5_1(x)
        branch_b = self.conv5X5_2(shared)
        branch_c = self.conv7x7_3(self.conv7X7_2(shared))
        merged = torch.cat((branch_a, branch_b, branch_c), dim=1)
        return F.relu(merged)
48
+
49
+
50
class FPN(nn.Module):
    """Three-level feature pyramid with top-down nearest-neighbour merging.

    Args:
        in_channels_list: Channel counts of the three input levels; only the
            first three entries are read.
        out_channels: Channel count of every output level.
    """

    def __init__(self, in_channels_list, out_channels):
        super().__init__()
        # Narrow configurations (<= 64 channels) use a leaky slope of 0.1.
        leaky = 0 if out_channels > 64 else 0.1

        # 1x1 lateral projections, one per input level.
        self.output1 = _conv_bn1x1(in_channels_list[0], out_channels, stride=1, leaky=leaky)
        self.output2 = _conv_bn1x1(in_channels_list[1], out_channels, stride=1, leaky=leaky)
        self.output3 = _conv_bn1x1(in_channels_list[2], out_channels, stride=1, leaky=leaky)

        # 3x3 smoothing applied after each top-down sum.
        self.merge1 = _conv_bn(out_channels, out_channels, leaky=leaky)
        self.merge2 = _conv_bn(out_channels, out_channels, leaky=leaky)

    def forward(self, x: Dict[str, torch.Tensor]) -> List[torch.Tensor]:
        """Merge the first three values of ``x`` (in dict order) top-down.

        Returns:
            ``[level1, level2, level3]`` where the third level is the plain
            lateral projection and the other two include upsampled context
            from the level above.
        """
        levels = list(x.values())
        lateral1 = self.output1(levels[0])
        lateral2 = self.output2(levels[1])
        lateral3 = self.output3(levels[2])

        upsampled3 = F.interpolate(lateral3, size=lateral2.shape[2:], mode='nearest')
        merged2 = self.merge2(lateral2 + upsampled3)

        upsampled2 = F.interpolate(merged2, size=lateral1.shape[2:], mode='nearest')
        merged1 = self.merge1(lateral1 + upsampled2)

        return [merged1, merged2, lateral3]
66
+
67
+
68
+ # ── Prediction heads ──────────────────────────────────────────────────────────
69
+
70
+ class _ClassHead(nn.Module):
71
+ def __init__(self, in_channels=512, num_anchors=2):
72
+ super().__init__()
73
+ self.conv1x1 = nn.Conv2d(in_channels, num_anchors * 2, 1)
74
+
75
+ def forward(self, x):
76
+ return self.conv1x1(x).permute(0, 2, 3, 1).contiguous().view(x.shape[0], -1, 2)
77
+
78
+
79
+ class _BboxHead(nn.Module):
80
+ def __init__(self, in_channels=512, num_anchors=2):
81
+ super().__init__()
82
+ self.conv1x1 = nn.Conv2d(in_channels, num_anchors * 4, 1)
83
+
84
+ def forward(self, x):
85
+ return self.conv1x1(x).permute(0, 2, 3, 1).contiguous().view(x.shape[0], -1, 4)
86
+
87
+
88
+ class _LandmarkHead(nn.Module):
89
+ def __init__(self, in_channels=512, num_anchors=2):
90
+ super().__init__()
91
+ self.conv1x1 = nn.Conv2d(in_channels, num_anchors * 10, 1)
92
+
93
+ def forward(self, x):
94
+ return self.conv1x1(x).permute(0, 2, 3, 1).contiguous().view(x.shape[0], -1, 10)
95
+
96
+
97
+ # ── RetinaFace ────────────────────────────────────────────────────────────────
98
+
99
class RetinaFace(nn.Module):
    """Face detector: ResNet-50 body, FPN neck, SSH context modules and
    per-level box / class / landmark heads.

    Args:
        name: Backbone name; only ``"Resnet50"`` is accepted.
        pretrained: Truthy to initialise the backbone with torchvision's
            ImageNet weights, falsy for random initialisation.
        in_channels: Base channel count ``c``; the FPN is built for inputs
            with ``2c``, ``4c`` and ``8c`` channels.
        return_layers: Mapping passed to ``IntermediateLayerGetter`` to pick
            the backbone layers that feed the FPN.
        out_channels: Channel count of the FPN/SSH outputs and head inputs.

    Raises:
        NotImplementedError: If ``name`` is not ``"Resnet50"``.
    """

    def __init__(self, name, pretrained, in_channels, return_layers, out_channels):
        super().__init__()
        if name != "Resnet50":
            raise NotImplementedError(f"Only Resnet50 supported, got {name}")

        # ``pretrained=`` is the deprecated torchvision interface; the
        # ``weights=`` argument below is what the legacy flag maps to.
        weights = models.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = models.resnet50(weights=weights)
        self.body = _utils.IntermediateLayerGetter(backbone, return_layers)

        c = in_channels
        self.fpn = FPN([c * 2, c * 4, c * 8], out_channels)
        self.ssh1 = SSH(out_channels, out_channels)
        self.ssh2 = SSH(out_channels, out_channels)
        self.ssh3 = SSH(out_channels, out_channels)

        # One head of each kind per pyramid level.
        self.ClassHead = nn.ModuleList([_ClassHead(out_channels) for _ in range(3)])
        self.BboxHead = nn.ModuleList([_BboxHead(out_channels) for _ in range(3)])
        self.LandmarkHead = nn.ModuleList([_LandmarkHead(out_channels) for _ in range(3)])

    def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(bbox, cls, ldm)`` with the three pyramid levels
        concatenated along dimension 1.

        The last dimension is 4 for ``bbox``, 2 for ``cls`` and 10 for
        ``ldm``.
        """
        fpn_out = self.fpn(self.body(x))
        features = [self.ssh1(fpn_out[0]), self.ssh2(fpn_out[1]), self.ssh3(fpn_out[2])]
        bbox = torch.cat([self.BboxHead[i](f) for i, f in enumerate(features)], dim=1)
        cls = torch.cat([self.ClassHead[i](f) for i, f in enumerate(features)], dim=1)
        ldm = torch.cat([self.LandmarkHead[i](f) for i, f in enumerate(features)], dim=1)
        return bbox, cls, ldm
@@ -0,0 +1,139 @@
1
+ import time
2
+
3
+ import numpy as np
4
+ import torch
5
+ from huggingface_hub import hf_hub_download
6
+
7
+ from .detection import Detector
8
+ from .alignment import Aligner
9
+ from .analysis import Analyzer
10
+
11
# Hugging Face Hub repository from which the model weight files are fetched.
HF_REPO_ID = "haneeshbyreddy/openface3-weights"

# Emotion class names; the entry at index i labels emotion output i.
EMOTION_LABELS = [
    'Neutral',
    'Happy',
    'Sad',
    'Surprise',
    'Fear',
    'Disgust',
    'Anger',
    'Contempt',
]

# Action-unit identifiers, paired positionally with the AU outputs.
AU_IDS = [
    'AU01',
    'AU02',
    'AU04',
    'AU06',
    'AU09',
    'AU12',
    'AU25',
    'AU26',
]
15
+
16
+
17
class Pipeline:
    """Face pipeline chaining a detector, a landmark aligner and an analyzer.

    Construction builds the three stages, fetches the aligner and analyzer
    weight files with ``hf_hub_download`` and runs one warm-up pass through
    every stage.

    Args:
        device: Device string forwarded to all three stages.
    """

    def __init__(self, device='cpu'):
        self.detector = Detector(device=device)
        self.detector.load_weights()
        self.detector.eval()

        landmark_path = hf_hub_download(HF_REPO_ID, 'aligner.pkl')
        multitask_path = hf_hub_download(HF_REPO_ID, 'analyzer.pth')
        self.aligner = Aligner(model_path=landmark_path, device=device)
        self.analyzer = Analyzer(model_path=multitask_path, device=device)
        self._warmup()

    def _warmup(self):
        """Push one all-zero input through each stage and discard the results."""
        dummy_det = torch.zeros(1, 3, self.detector.max_size, self.detector.max_size,
                                device=self.detector.device)
        self.detector.infer(dummy_det)

        dummy_crop = np.zeros((256, 256, 3), dtype=np.uint8)
        self.aligner.infer(dummy_crop)

        dummy_face = self.analyzer.preprocess(np.zeros((224, 224, 3), dtype=np.uint8))
        self.analyzer.infer(dummy_face)

    @staticmethod
    def _crop_face(image, bbox):
        """Slice ``bbox`` (x1, y1, x2, y2) out of ``image``.

        Negative coordinates are clamped to 0 first: NumPy treats a negative
        slice bound as an offset from the end, which would otherwise yield an
        empty or unrelated region for boxes that extend past the top/left
        image border. Bounds beyond the bottom/right edge are already
        truncated by slicing.
        """
        x1, y1, x2, y2 = (max(0, v) for v in bbox)
        return image[y1:y2, x1:x2]

    def run(self, image: np.ndarray) -> tuple:
        """Detect, align and analyze every face in one image.

        Args:
            image: (H,W,3) uint8 RGB.

        Returns:
            ``(results, timings)``. ``results`` has one dict per detection:
            the detection's own entries, a ``'landmarks'`` entry and, when the
            face crop is non-empty, the analyzer's outputs. ``timings`` maps
            stage names to elapsed milliseconds, summed over all faces for
            the align and analyze stages.
        """
        t_total = time.perf_counter()

        t0 = time.perf_counter()
        tensor, meta = self.detector.preprocess(image)
        t1 = time.perf_counter()
        raw_det = self.detector.infer(tensor)
        detect_infer_ms = (time.perf_counter() - t1) * 1000.0
        detections = self.detector.postprocess(raw_det, meta, confidence_threshold=0.95)
        detect_ms = (time.perf_counter() - t0) * 1000.0

        results = []
        align_ms = 0.0
        align_infer_ms = 0.0
        analyze_ms = 0.0
        analyze_infer_ms = 0.0

        for det in detections:
            t0 = time.perf_counter()
            crop, matrix = self.aligner.preprocess(image, det['bbox'])
            t1 = time.perf_counter()
            raw_lm = self.aligner.infer(crop)
            align_infer_ms += (time.perf_counter() - t1) * 1000.0
            landmarks = self.aligner.postprocess(raw_lm, matrix)
            align_ms += (time.perf_counter() - t0) * 1000.0

            # The reported bbox is left as the detector produced it; only the
            # crop handed to the analyzer is clamped to the image.
            face_crop = self._crop_face(image, det['bbox'])

            if face_crop.size > 0:
                t0 = time.perf_counter()
                tensor_face = self.analyzer.preprocess(face_crop)
                t1 = time.perf_counter()
                raw_ana = self.analyzer.infer(tensor_face)
                analyze_infer_ms += (time.perf_counter() - t1) * 1000.0
                analysis = self.analyzer.postprocess(raw_ana)
                analyze_ms += (time.perf_counter() - t0) * 1000.0
            else:
                # Degenerate box: keep the detection but attach no analysis.
                analysis = {}

            results.append({**det, 'landmarks': landmarks, **analysis})

        total_ms = (time.perf_counter() - t_total) * 1000.0
        timings = {
            'detect_ms': detect_ms,
            'detect_infer_ms': detect_infer_ms,
            'align_ms': align_ms,
            'align_infer_ms': align_infer_ms,
            'analyze_ms': analyze_ms,
            'analyze_infer_ms': analyze_infer_ms,
            'total_ms': total_ms,
        }
        return results, timings

    @staticmethod
    def serialize(results: list, timings: dict, frame_n: int = 0, ts: float = 0.0) -> dict:
        """
        Convert pipeline output to a JSON-ready dict.
        Both main.py (JSONL recording) and the backend API use this format.

        Args:
            results: Per-face dicts as returned by ``run``.
            timings: Timing dict as returned by ``run``.
            frame_n: Frame index stored under ``'frame'``.
            ts: Timestamp stored under ``'ts'`` (rounded to 4 decimals).

        Returns:
            Dict with ``'frame'``, ``'ts'``, ``'faces'`` and ``'timings'``.
            Faces without analysis get a ``None`` emotion label, empty logits,
            zero gaze and no action units.
        """
        faces = []
        for r in results:
            em_logits = r['emotion'].flatten().tolist() if r.get('emotion') is not None else []
            top_em = int(np.argmax(em_logits)) if em_logits else 0
            gaze = r['gaze'].flatten().tolist() if r.get('gaze') is not None else [0.0, 0.0]
            au_vals = r['au'].flatten().tolist() if r.get('au') is not None else []

            faces.append({
                'bbox': r['bbox'],
                'confidence': round(float(r['score']), 4),
                'landmarks': r['landmarks'].tolist(),
                'emotion': {
                    'label': EMOTION_LABELS[top_em] if em_logits else None,
                    'logits': [round(v, 4) for v in em_logits],
                },
                'gaze': {
                    'horizontal_deg': round(gaze[0], 2),
                    'vertical_deg': round(gaze[1], 2),
                },
                # AU values are clipped into [0, 1]; zip stops at the shorter
                # of the id list and the value list.
                'action_units': {
                    au_id: round(float(np.clip(v, 0.0, 1.0)), 4)
                    for au_id, v in zip(AU_IDS, au_vals)
                },
            })

        return {
            'frame': frame_n,
            'ts': round(ts, 4),
            'faces': faces,
            'timings': {
                'detect': {'total_ms': round(timings['detect_ms'], 2),
                           'infer_ms': round(timings['detect_infer_ms'], 2)},
                'align': {'total_ms': round(timings['align_ms'], 2),
                          'infer_ms': round(timings['align_infer_ms'], 2)},
                'analyze': {'total_ms': round(timings['analyze_ms'], 2),
                            'infer_ms': round(timings['analyze_infer_ms'], 2)},
                'total_ms': round(timings['total_ms'], 2),
            },
        }
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "openface3"
7
+ version = "0.1.0"
8
+ description = "Facial analysis pipeline: detection, 98-point alignment, and multitask emotion/gaze/AU inference"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = { file = "LICENSE" }
12
+ keywords = ["face", "emotion", "gaze", "action-units", "facial-analysis"]
13
+ classifiers = [
14
+ "Programming Language :: Python :: 3",
15
+ "License :: OSI Approved :: MIT License",
16
+ "Operating System :: OS Independent",
17
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
18
+ ]
19
+ dependencies = [
20
+ "torch>=2.0",
21
+ "torchvision>=0.15",
22
+ "numpy",
23
+ "opencv-python",
24
+ "albumentations",
25
+ "iglovikov-helper-functions",
26
+ "timm",
27
+ "huggingface-hub",
28
+ ]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/haneeshbyreddy/openface3"
32
+ Issues = "https://github.com/haneeshbyreddy/openface3/issues"
33
+ HuggingFace = "https://huggingface.co/haneeshbyreddy/openface3-weights"
34
+
35
+ [tool.hatch.build.targets.wheel]
36
+ packages = ["openface3"]
37
+
38
+ [tool.hatch.build.targets.sdist]
39
+ include = ["openface3/"]