openface3 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openface3/__init__.py +1 -0
- openface3/alignment.py +118 -0
- openface3/analysis.py +40 -0
- openface3/detection.py +130 -0
- openface3/models/__init__.py +0 -0
- openface3/models/landmark.py +287 -0
- openface3/models/multitask.py +93 -0
- openface3/models/retinaface.py +121 -0
- openface3/pipeline.py +139 -0
- openface3-0.1.0.dist-info/METADATA +92 -0
- openface3-0.1.0.dist-info/RECORD +13 -0
- openface3-0.1.0.dist-info/WHEEL +4 -0
- openface3-0.1.0.dist-info/licenses/LICENSE +21 -0
openface3/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .pipeline import Pipeline, EMOTION_LABELS, AU_IDS
|
openface3/alignment.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import cv2
|
|
2
|
+
import numpy as np
|
|
3
|
+
import torch
|
|
4
|
+
|
|
5
|
+
from .models.landmark import StackedHGNetV1
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# ── Model config ──────────────────────────────────────────────────────────────
|
|
9
|
+
|
|
10
|
+
class Config:
    """Static hyper-parameters used to build the landmark network.

    The attributes are read by ``Aligner`` and ``StackedHGNetV1``:
    ``width``/``height`` size the coordinate grids, ``use_AAM`` switches the
    edge-map / point-map branches on, and ``classes_num`` supplies the
    channel counts as ``[heatmaps, edge maps, point maps]``.
    """

    width = 256
    height = 256
    use_AAM = True
    classes_num = [98, 9, 98]
    nstack = 4
    add_coord = True
    decoder_type = 'default'
    # One ``(flag, landmark_indices)`` entry per edge map.  Only the indices
    # are consumed by E2HTransform; the boolean's meaning is not visible in
    # this file (presumably "closed contour" -- TODO confirm upstream).
    edge_info = (
        (False, tuple(range(0, 33))),
        (True, tuple(range(33, 42))),
        (True, tuple(range(42, 51))),
        (False, tuple(range(51, 55))),
        (False, tuple(range(55, 60))),
        (True, tuple(range(60, 68))),
        (True, tuple(range(68, 76))),
        (True, tuple(range(76, 88))),
        (True, tuple(range(88, 96))),
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ── Crop helpers (also used directly in eval scripts) ─────────────────────────
|
|
32
|
+
|
|
33
|
+
class GetCropMatrix:
    """Build the 3x3 matrix that maps a face region onto a square crop."""

    def __init__(self, image_size, target_face_scale, align_corners=False):
        """Store the crop side length, the face-scale factor and the corner mode."""
        self.image_size = image_size
        self.target_face_scale = target_face_scale
        self.align_corners = align_corners

    def process(self, scale, center_w, center_h):
        """Return the float32 source-to-crop transform.

        Args:
            scale: Face size divided by 200 (see ``Aligner.preprocess``).
            center_w: X coordinate of the face centre in the source image.
            center_h: Y coordinate of the face centre in the source image.

        Returns:
            ``np.ndarray`` of shape (3, 3), dtype float32.
        """
        # With align_corners the face centre lands on (size - 1) / 2,
        # otherwise on size / 2.
        extent = (self.image_size - 1) if self.align_corners else self.image_size
        out_cx = extent / 2.0
        out_cy = extent / 2.0

        zoom = self.image_size / (scale * self.target_face_scale * 200.0)
        # The rotation angle is fixed at zero (cos = 1, sin = 0).
        cos_term = zoom * 1.0
        sin_term = zoom * 0.0

        shift_x = out_cx - cos_term * center_w + sin_term * center_h
        shift_y = out_cy - sin_term * center_w - cos_term * center_h
        rows = [
            [cos_term, -sin_term, shift_x],
            [sin_term, cos_term, shift_y],
            [0.0, 0.0, 1.0],
        ]
        return np.array(rows, dtype=np.float32)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class TransformPerspective:
    """Warp an image into a square crop using a 3x3 matrix."""

    def __init__(self, image_size):
        """Remember the side length (pixels) of the square output."""
        self.image_size = image_size

    def process(self, image, matrix):
        """Apply ``matrix`` to ``image`` with bilinear sampling.

        Pixels that fall outside the source are filled with 0.
        """
        side = self.image_size
        return cv2.warpPerspective(
            image,
            matrix,
            dsize=(side, side),
            flags=cv2.INTER_LINEAR,
            borderValue=0,
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ── Aligner ───────────────────────────────────────────────────────────────────
|
|
64
|
+
|
|
65
|
+
class Aligner:
    """Predict 98 facial landmarks for a face box using ``StackedHGNetV1``."""

    def __init__(self, model_path: str, device='cpu'):
        """Build the network, load its weights and prepare the crop helpers.

        Args:
            model_path: Checkpoint readable by ``torch.load``; the weights are
                taken from its ``'net'`` entry.
            device: Anything accepted by ``torch.device``.
        """
        self.device = torch.device(device)
        self.input_size = 256

        cfg = Config()
        network = StackedHGNetV1(
            config=cfg,
            classes_num=cfg.classes_num,
            edge_info=cfg.edge_info,
            nstack=cfg.nstack,
            add_coord=cfg.add_coord,
            decoder_type=cfg.decoder_type,
        )
        checkpoint = torch.load(model_path, map_location=self.device)
        # NOTE: strict=False silently ignores missing/unexpected keys.
        network.load_state_dict(checkpoint['net'], strict=False)
        network = network.to(self.device)
        self.model = network.eval()

        self._crop_matrix = GetCropMatrix(
            image_size=self.input_size,
            target_face_scale=1.0,
            align_corners=True,
        )
        self._warp = TransformPerspective(image_size=self.input_size)

    # ── preprocess ────────────────────────────────────────────────────────────

    def preprocess(self, image: np.ndarray, bbox):
        """Crop the face given by ``bbox = (x1, y1, x2, y2)``.

        Returns:
            ``(crop, matrix)``: the warped crop and the 3x3 transform used.
        """
        left, top, right, bottom = bbox
        center_x = (left + right) / 2
        center_y = (top + bottom) / 2
        # The longer box side, expressed in units of 200 px.
        face_scale = max(right - left, bottom - top) / 200.0
        matrix = self._crop_matrix.process(face_scale, center_x, center_y)
        return self._warp.process(image, matrix), matrix

    # ── infer ─────────────────────────────────────────────────────────────────

    def infer(self, crop: np.ndarray) -> torch.Tensor:
        """Run the network on one (256,256,3) uint8 RGB crop.

        Returns:
            Raw landmark coordinates, shape (98, 2), in [-1, 1].
        """
        batch = torch.from_numpy(crop[np.newaxis]).float()
        batch = batch.permute(0, 3, 1, 2)  # NHWC -> NCHW
        batch = (batch / 255.0) * 2.0 - 1.0  # [0, 255] -> [-1, 1]
        with torch.inference_mode():
            outputs = self.model(batch.to(self.device))
        # Last element of the model output holds the final-stack landmarks.
        return outputs[-1][0]  # (98, 2)

    # ── postprocess ───────────────────────────────────────────────────────────

    def postprocess(self, raw_coords: torch.Tensor, matrix: np.ndarray) -> np.ndarray:
        """Convert raw [-1, 1] coordinates to original-image pixels, shape (98, 2)."""
        crop_pts = ((raw_coords + 1) * self.input_size - 1) / 2
        crop_pts = crop_pts.cpu().numpy()

        # Back-project through the inverse crop transform (affine rows only).
        inverse = np.linalg.inv(matrix)
        xs, ys = crop_pts[:, 0], crop_pts[:, 1]
        image_pts = np.empty_like(crop_pts)
        image_pts[:, 0] = inverse[0, 0]*xs + inverse[0, 1]*ys + inverse[0, 2]
        image_pts[:, 1] = inverse[1, 0]*xs + inverse[1, 1]*ys + inverse[1, 2]
        return image_pts

    def __call__(self, image: np.ndarray, bbox) -> np.ndarray:
        """Full pipeline: bbox -> (98, 2) landmarks in original image space."""
        crop, matrix = self.preprocess(image, bbox)
        raw = self.infer(crop)
        return self.postprocess(raw, matrix)
|
openface3/analysis.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch
|
|
3
|
+
from torchvision import transforms
|
|
4
|
+
|
|
5
|
+
from .models.multitask import MTL
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Analyzer:
    """Run the multi-task model (emotion, gaze, action units) on a face crop."""

    def __init__(self, model_path: str, device='cpu'):
        """Create the ``MTL`` model, load its state dict and build the transform.

        Args:
            model_path: File readable by ``torch.load`` that holds a state dict.
            device: Anything accepted by ``torch.device``.
        """
        self.device = torch.device(device)
        self.model = MTL().to(self.device)
        weights = torch.load(model_path, map_location=self.device)
        self.model.load_state_dict(weights)
        self.model.eval()

        normalize = transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        )
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((224, 224)),
            normalize,
        ])

    def preprocess(self, face_crop: np.ndarray) -> torch.Tensor:
        """Turn an (H,W,3) uint8 RGB crop into a normalised batch on the device."""
        tensor = self.transform(face_crop)
        return tensor.unsqueeze(0).to(self.device)

    def infer(self, tensor: torch.Tensor):
        """Forward ``tensor`` through the model under ``torch.inference_mode``."""
        with torch.inference_mode():
            outputs = self.model(tensor)
        return outputs

    def postprocess(self, raw) -> dict:
        """Convert the ``(emotion, gaze, au)`` tensors into NumPy arrays."""
        emotion_raw, gaze_raw, au_raw = raw
        result = {}
        result['emotion'] = emotion_raw.cpu().numpy()
        result['gaze'] = gaze_raw.cpu().numpy()
        result['au'] = au_raw.cpu().numpy()
        return result

    def __call__(self, face_crop: np.ndarray) -> dict:
        """Preprocess, infer and postprocess one face crop."""
        batch = self.preprocess(face_crop)
        return self.postprocess(self.infer(batch))
|
openface3/detection.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
from itertools import product
|
|
2
|
+
from math import ceil
|
|
3
|
+
import albumentations as A
|
|
4
|
+
import numpy as np
|
|
5
|
+
import torch
|
|
6
|
+
import torch.nn.functional as F
|
|
7
|
+
from iglovikov_helper_functions.utils.image_utils import pad_to_size, unpad_from_size
|
|
8
|
+
from torch.utils import model_zoo
|
|
9
|
+
from torchvision.ops import nms
|
|
10
|
+
|
|
11
|
+
from .models.retinaface import RetinaFace
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# ── Prior box / decode utilities ─────────────────────────────────────────────
|
|
15
|
+
|
|
16
|
+
def priorbox(min_sizes, steps, clip, image_size):
    """Generate anchor boxes as ``(cx, cy, w, h)`` rows, normalised to [0, 1].

    Args:
        min_sizes: Per feature level, the anchor side lengths in pixels.
        steps: Per feature level, the stride in pixels.
        clip: When truthy, clamp every value into [0, 1].
        image_size: ``(height, width)`` of the network input.

    Returns:
        Tensor of shape (num_anchors, 4).
    """
    img_h, img_w = image_size[0], image_size[1]
    grid_shapes = [(ceil(img_h / stride), ceil(img_w / stride)) for stride in steps]

    flat = []
    for level, (rows, cols) in enumerate(grid_shapes):
        stride = steps[level]
        for row, col in product(range(rows), range(cols)):
            # Anchor centre sits in the middle of the feature-map cell.
            center_x = (col + 0.5) * stride / img_w
            center_y = (row + 0.5) * stride / img_h
            for side in min_sizes[level]:
                flat.extend((center_x, center_y, side / img_w, side / img_h))

    anchors = torch.Tensor(flat).view(-1, 4)
    if clip:
        anchors.clamp_(max=1, min=0)
    return anchors
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def decode(loc, priors, variances):
    """Decode box regressions against anchors into corner-form boxes.

    Args:
        loc: (N, 4) predicted offsets ``(dx, dy, dw, dh)``.
        priors: (N, 4) anchors as ``(cx, cy, w, h)``.
        variances: Two scale factors, for the centre and size terms.

    Returns:
        (N, 4) tensor of ``(x1, y1, x2, y2)``.
    """
    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])
    # Centre/size form -> top-left / bottom-right corners.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def decode_landm(pre, priors, variances):
    """Decode five landmark regressions against anchors.

    Args:
        pre: (N, 10) predicted offsets, laid out as five ``(dx, dy)`` pairs.
        priors: (N, 4) anchors as ``(cx, cy, w, h)``.
        variances: Scale factors; only the first one is used.

    Returns:
        (N, 10) tensor of landmark coordinates in the anchors' units.
    """
    anchor_centers = priors[:, :2]
    anchor_sizes = priors[:, 2:]
    points = []
    for start in range(0, 10, 2):
        offset = pre[:, start:start + 2]
        points.append(anchor_centers + offset * variances[0] * anchor_sizes)
    return torch.cat(points, dim=1)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ── Detector ──────────────────────────────────────────────────────────────────
|
|
47
|
+
|
|
48
|
+
WEIGHTS_URL = "https://github.com/ternaus/retinaface/releases/download/0.01/retinaface_resnet50_2020-07-20-f168fae3c.zip"
|
|
49
|
+
|
|
50
|
+
class Detector:
    """RetinaFace (ResNet-50) face detector with pre- and post-processing."""

    def __init__(self, device='cpu', max_size=640):
        """Build the model, the image transform and the anchor set.

        Weights are not loaded here (call ``load_weights``) and the model is
        not switched to eval mode here (call ``eval``).

        Args:
            device: Device the model and anchor tensors are moved to.
            max_size: Side length of the square network input, in pixels.
        """
        self.device = device
        self.max_size = max_size
        self.model = RetinaFace(
            name='Resnet50',
            pretrained=False,
            return_layers={'layer2': 1, 'layer3': 2, 'layer4': 3},
            in_channels=256,
            out_channels=256,
        ).to(device)
        self.transform = A.Compose([
            A.LongestMaxSize(max_size=max_size, p=1),
            A.Normalize(p=1),
        ])
        anchors = priorbox(
            min_sizes=[[16, 32], [64, 128], [256, 512]],
            steps=[8, 16, 32],
            clip=False,
            image_size=(max_size, max_size),
        )
        self.prior_box = anchors.to(device)
        self.variance = [0.1, 0.2]
        # Factors that turn normalised predictions back into input pixels.
        self._scale_bbox = torch.tensor([max_size] * 4, dtype=torch.float32).to(device)
        self._scale_landm = torch.tensor([max_size, max_size] * 5,
                                         dtype=torch.float32).to(device)

    def load_weights(self):
        """Download (or reuse the cached) weights from ``WEIGHTS_URL`` and load them."""
        weights = model_zoo.load_url(WEIGHTS_URL, progress=True, map_location='cpu')
        self.model.load_state_dict(weights)

    def eval(self):
        """Put the underlying model into evaluation mode."""
        self.model.eval()

    def preprocess(self, image: np.ndarray):
        """Resize, normalise and pad ``image`` to the square network input.

        Returns:
            ``(tensor, meta)``: a (1, C, H, W) tensor on the device plus the
            padding, resize coefficient and original size needed to undo it.
        """
        orig_h, orig_w = image.shape[:2]
        resized = self.transform(image=image)['image']
        padded = pad_to_size(target_size=(self.max_size, self.max_size), image=resized)
        chw = np.transpose(padded['image'], (2, 0, 1))
        tensor = torch.from_numpy(chw).to(self.device)
        meta = {
            'pads': padded['pads'],
            'resize_coeff': max(orig_h, orig_w) / self.max_size,
            'original_size': (orig_h, orig_w),
        }
        return tensor.unsqueeze(0), meta

    def infer(self, tensor: torch.Tensor):
        """Forward ``tensor`` through the model under ``torch.inference_mode``."""
        with torch.inference_mode():
            outputs = self.model(tensor)
        return outputs

    def postprocess(self, raw, meta, confidence_threshold=0.7, nms_threshold=0.4):
        """Turn raw network output into detections in original-image pixels.

        Args:
            raw: ``(loc, conf, land)`` as returned by the model.
            meta: Dict produced by ``preprocess``.
            confidence_threshold: Detections scoring at or below this are dropped.
            nms_threshold: IoU threshold passed to ``nms``.

        Returns:
            List of dicts with ``'bbox'`` ``[x1, y1, x2, y2]``, ``'score'`` and
            ``'landmarks'`` (five ``[x, y]`` pairs); empty when nothing survives.
        """
        loc, conf, land = raw
        probs = F.softmax(conf, dim=-1)
        boxes = decode(loc[0], self.prior_box, self.variance) * self._scale_bbox
        scores = probs[0][:, 1]
        landmarks = decode_landm(land[0], self.prior_box, self.variance) * self._scale_landm

        # Keep confident candidates, ordered best-first.
        confident = torch.where(scores > confidence_threshold)[0]
        boxes = boxes[confident]
        landmarks = landmarks[confident]
        scores = scores[confident]
        ranking = scores.argsort(descending=True)
        boxes = boxes[ranking]
        landmarks = landmarks[ranking]
        scores = scores[ranking]

        keep = nms(boxes, scores, nms_threshold)
        boxes = boxes[keep].int()
        if boxes.shape[0] == 0:
            return []

        landmarks = landmarks[keep].cpu().numpy().reshape(-1, 2)
        scores = scores[keep].cpu().numpy().astype(np.float64)
        boxes = boxes.cpu().numpy()

        # Undo the padding, then the resize, to get original-image pixels.
        unpadded = unpad_from_size(meta['pads'], bboxes=boxes, keypoints=landmarks)
        coeff = meta['resize_coeff']
        boxes = (unpadded['bboxes'] * coeff).astype(int)
        landmarks = (unpadded['keypoints'].reshape(-1, 10) * coeff).astype(int)
        img_h, img_w = meta['original_size']

        detections = []
        for box, score, points in zip(boxes, scores, landmarks):
            left = int(np.clip(box[0], 0, img_w - 1))
            right = int(np.clip(box[2], left + 1, img_w - 1))
            top = int(np.clip(box[1], 0, img_h - 1))
            bottom = int(np.clip(box[3], top + 1, img_h - 1))
            # Skip boxes that collapsed to zero area after clipping.
            if left >= right or top >= bottom:
                continue
            detections.append({'bbox': [left, top, right, bottom],
                               'score': float(score),
                               'landmarks': points.reshape(-1, 2).tolist()})
        return detections

    def __call__(self, image: np.ndarray, confidence_threshold=0.7, nms_threshold=0.4):
        """Detect faces in ``image``; see ``postprocess`` for the result format."""
        tensor, meta = self.preprocess(image)
        raw = self.infer(tensor)
        return self.postprocess(raw, meta, confidence_threshold, nms_threshold)
|
|
File without changes
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import torch
|
|
3
|
+
import torch.nn as nn
|
|
4
|
+
import torch.nn.functional as F
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# ── CoordConv ─────────────────────────────────────────────────────────────────
|
|
8
|
+
|
|
9
|
+
class AddCoordsTh(nn.Module):
    """Append normalised coordinate channels (CoordConv) to a feature map.

    The input is treated as (batch, channels, x_dim, y_dim): the x
    coordinate varies along dimension 2 and the y coordinate along
    dimension 3.
    """

    def __init__(self, x_dim, y_dim, with_r=False, with_boundary=False):
        """Store the grid size and which optional channels to add."""
        super().__init__()
        self.x_dim = x_dim
        self.y_dim = y_dim
        self.with_r = with_r
        self.with_boundary = with_boundary

    def forward(self, input_tensor, heatmap=None):
        """Concatenate coordinate channels to ``input_tensor`` along dim 1.

        Channels are appended in this order: x, y, radius (if ``with_r``),
        then boundary-masked x and y (if ``with_boundary`` and a heatmap is
        given).
        """
        batch = input_tensor.shape[0]
        x_dim, y_dim = self.x_dim, self.y_dim

        # Index ramps on the input's device, rescaled to [-1, 1].
        x_ramp = torch.arange(x_dim, dtype=torch.int32).to(input_tensor).float()
        y_ramp = torch.arange(y_dim, dtype=torch.int32).to(input_tensor).float()
        x_ramp = x_ramp / (x_dim - 1) * 2 - 1
        y_ramp = y_ramp / (y_dim - 1) * 2 - 1

        xx_channel = x_ramp.view(1, 1, x_dim, 1).expand(1, 1, x_dim, y_dim)
        yy_channel = y_ramp.view(1, 1, 1, y_dim).expand(1, 1, x_dim, y_dim)
        xx_channel = xx_channel.repeat(batch, 1, 1, 1)
        yy_channel = yy_channel.repeat(batch, 1, 1, 1)

        pieces = [input_tensor, xx_channel, yy_channel]

        if self.with_r:
            radius = torch.sqrt(xx_channel ** 2 + yy_channel ** 2)
            pieces.append(radius / torch.max(radius))

        if self.with_boundary and heatmap is not None:
            # The last heatmap channel gates where coordinates are kept.
            boundary = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0)
            on_boundary = boundary > 0.05
            blank = torch.zeros_like(xx_channel)
            pieces.append(torch.where(on_boundary, xx_channel, blank))
            pieces.append(torch.where(on_boundary, yy_channel, blank))

        return torch.cat(pieces, dim=1)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class CoordConvTh(nn.Module):
    """``AddCoordsTh`` followed by a convolution, optional BN and ReLU."""

    def __init__(self, x_dim, y_dim, with_r, with_boundary,
                 in_channels, out_channels, first_one=False,
                 relu=False, bn=False, *args, **kwargs):
        """Create the coordinate layer and a conv sized for the extra channels.

        ``*args``/``**kwargs`` are forwarded to ``nn.Conv2d``.
        """
        super().__init__()
        self.addcoords = AddCoordsTh(x_dim=x_dim, y_dim=y_dim,
                                     with_r=with_r, with_boundary=with_boundary)

        # x and y are always added; radius and the boundary pair are optional.
        extra_channels = 2
        if with_r:
            extra_channels += 1
        if with_boundary and not first_one:
            extra_channels += 2
        in_channels = in_channels + extra_channels

        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, *args, **kwargs)
        self.relu = nn.ReLU() if relu else None
        self.bn = nn.BatchNorm2d(out_channels) if bn else None
        self.with_boundary = with_boundary
        self.first_one = first_one

    def forward(self, input_tensor, heatmap=None):
        """Add coordinates, convolve, then apply BN and ReLU when configured.

        A heatmap must be supplied exactly when boundary channels are
        expected (``with_boundary`` and not ``first_one``).
        """
        expects_heatmap = self.with_boundary and not self.first_one
        assert expects_heatmap == (heatmap is not None)
        out = self.conv(self.addcoords(input_tensor, heatmap))
        if self.bn is not None:
            out = self.bn(out)
        if self.relu is not None:
            out = self.relu(out)
        return out
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ── Decoder ───────────────────────────────────────────────────────────────────
|
|
82
|
+
|
|
83
|
+
class _DecoderDefault:
|
|
84
|
+
def __init__(self, weight=1, use_weight_map=False):
|
|
85
|
+
self.weight = weight
|
|
86
|
+
self.use_weight_map = use_weight_map
|
|
87
|
+
|
|
88
|
+
def _make_grid(self, h, w):
|
|
89
|
+
yy, xx = torch.meshgrid(
|
|
90
|
+
torch.arange(h).float() / (h - 1) * 2 - 1,
|
|
91
|
+
torch.arange(w).float() / (w - 1) * 2 - 1)
|
|
92
|
+
return yy, xx
|
|
93
|
+
|
|
94
|
+
def get_coords_from_heatmap(self, heatmap):
|
|
95
|
+
batch, npoints, h, w = heatmap.shape
|
|
96
|
+
if self.use_weight_map:
|
|
97
|
+
heatmap = heatmap * self.weight
|
|
98
|
+
yy, xx = self._make_grid(h, w)
|
|
99
|
+
yy = yy.view(1, 1, h, w).to(heatmap)
|
|
100
|
+
xx = xx.view(1, 1, h, w).to(heatmap)
|
|
101
|
+
heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6)
|
|
102
|
+
yy_coord = (yy * heatmap).sum([2, 3]) / heatmap_sum
|
|
103
|
+
xx_coord = (xx * heatmap).sum([2, 3]) / heatmap_sum
|
|
104
|
+
return torch.stack([xx_coord, yy_coord], dim=-1)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_decoder(decoder_type='default'):
    """Return a decoder instance for ``decoder_type``.

    Only ``'default'`` is supported.

    Raises:
        NotImplementedError: For any other ``decoder_type``.
    """
    if decoder_type != 'default':
        raise NotImplementedError(f"Unknown decoder: {decoder_type}")
    return _DecoderDefault()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# ── Building blocks ───────────────────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
class Activation(nn.Module):
    """Optional normalisation followed by an optional activation.

    ``kind`` is either ``'<act>'`` or ``'<norm>+<act>'``.  Supported norms
    are ``'in'``, ``'bn'``, ``'bn_noaffine'`` and ``'none'``; supported
    activations are ``'relu'``, ``'softplus'``, ``'exp'``, ``'sigmoid'``,
    ``'tanh'`` and ``'none'``.
    """

    def __init__(self, kind='relu', channel=None):
        """Resolve ``kind`` into a normalisation and an activation callable.

        Args:
            kind: Specification string, see the class docstring.
            channel: Feature count; only needed by the batch-norm variants.

        Raises:
            KeyError: If the norm or activation name is unknown.
        """
        super().__init__()
        self.kind = kind
        norm_str, act_str = kind.split('+') if '+' in kind else ('none', kind)

        # Factories instead of ready-made objects: only the selected layer is
        # built, so a kind without batch norm no longer needs ``channel``
        # (eagerly constructing BatchNorm2d(None) raised for every kind).
        norm_factories = {
            'in': lambda: F.instance_norm,
            'bn': lambda: nn.BatchNorm2d(channel),
            'bn_noaffine': lambda: nn.BatchNorm2d(channel, affine=False,
                                                  track_running_stats=True),
            'none': lambda: None,
        }
        act_factories = {
            'relu': lambda: F.relu,
            'softplus': lambda: nn.Softplus(),
            'exp': lambda: torch.exp,
            'sigmoid': lambda: torch.sigmoid,
            'tanh': lambda: torch.tanh,
            'none': lambda: None,
        }
        self.norm_fn = norm_factories[norm_str]()
        self.act_fn = act_factories[act_str]()

    def forward(self, x):
        """Apply the normalisation (if any), then the activation (if any)."""
        if self.norm_fn is not None:
            x = self.norm_fn(x)
        if self.act_fn is not None:
            x = self.act_fn(x)
        return x
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class ConvBlock(nn.Module):
    """Size-preserving convolution with optional batch norm and ReLU."""

    def __init__(self, inp_dim, out_dim, kernel_size=3, stride=1, bn=False, relu=True, groups=1):
        """Create the layers.

        Padding is ``(kernel_size - 1) // 2``, so odd kernels keep the
        spatial size at stride 1.
        """
        super().__init__()
        same_padding = (kernel_size - 1) // 2
        self.conv = nn.Conv2d(inp_dim, out_dim, kernel_size, stride,
                              padding=same_padding, groups=groups, bias=True)
        if relu:
            self.relu = nn.ReLU()
        else:
            self.relu = None
        if bn:
            self.bn = nn.BatchNorm2d(out_dim)
        else:
            self.bn = None

    def forward(self, x):
        """Apply conv, then batch norm and ReLU when they are enabled."""
        out = self.conv(x)
        if self.bn is not None:
            out = self.bn(out)
        if self.relu is not None:
            out = self.relu(out)
        return out
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class ResBlock(nn.Module):
    """Pre-activation bottleneck residual block (1x1 -> 3x3 -> 1x1)."""

    def __init__(self, inp_dim, out_dim, mid_dim=None):
        """Create the three BN/conv stages and, if needed, a 1x1 skip projection.

        Args:
            inp_dim: Input channels.
            out_dim: Output channels.
            mid_dim: Bottleneck channels; falls back to ``out_dim // 2`` when
                not given (or falsy).
        """
        super().__init__()
        mid_dim = mid_dim or out_dim // 2
        self.relu = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(inp_dim)
        self.conv1 = ConvBlock(inp_dim, mid_dim, 1, relu=False)
        self.bn2 = nn.BatchNorm2d(mid_dim)
        self.conv2 = ConvBlock(mid_dim, mid_dim, 3, relu=False)
        self.bn3 = nn.BatchNorm2d(mid_dim)
        self.conv3 = ConvBlock(mid_dim, out_dim, 1, relu=False)
        # A projection is only required when the channel count changes.
        self.need_skip = inp_dim != out_dim
        if self.need_skip:
            self.skip_layer = ConvBlock(inp_dim, out_dim, 1, relu=False)
        else:
            self.skip_layer = None

    def forward(self, x):
        """Return the bottleneck output plus the (optionally projected) input."""
        if self.need_skip:
            shortcut = self.skip_layer(x)
        else:
            shortcut = x

        out = self.conv1(self.relu(self.bn1(x)))
        out = self.conv2(self.relu(self.bn2(out)))
        out = self.conv3(self.relu(self.bn3(out)))
        return out + shortcut
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
class Hourglass(nn.Module):
    """Recursive hourglass module with an optional CoordConv front end."""

    def __init__(self, n, f, increase=0, up_mode='nearest',
                 add_coord=False, first_one=False, x_dim=64, y_dim=64):
        """Build one hourglass level and, recursively, the levels below it.

        Args:
            n: Remaining recursion depth; at ``n <= 1`` the inner module is a
                plain ``ResBlock``.
            f: Channel count at this level.
            increase: Channels added when going one level down.
            up_mode: Interpolation mode for the upsampling layer.
            add_coord: Whether to prepend a ``CoordConvTh`` at this level
                (nested levels never do).
            first_one: Forwarded to ``CoordConvTh``.
            x_dim: Grid width used by the CoordConv layer.
            y_dim: Grid height used by the CoordConv layer.
        """
        super().__init__()
        nf = f + increase
        if add_coord:
            self.coordconv = CoordConvTh(x_dim=x_dim, y_dim=y_dim, with_r=True, with_boundary=True,
                                         relu=False, bn=False, in_channels=f, out_channels=f,
                                         first_one=first_one, kernel_size=1, stride=1, padding=0)
        else:
            self.coordconv = None
        self.up1 = ResBlock(f, f)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
        self.low1 = ResBlock(f, nf)
        self.n = n
        if n > 1:
            self.low2 = Hourglass(n=n-1, f=nf, increase=increase, up_mode=up_mode,
                                  add_coord=False)
        else:
            self.low2 = ResBlock(nf, nf)
        self.low3 = ResBlock(nf, f)
        self.up2 = nn.Upsample(scale_factor=2, mode=up_mode)

    def forward(self, x, heatmap=None):
        """Sum the full-resolution branch with the pooled-and-upsampled branch."""
        if self.coordconv is not None:
            x = self.coordconv(x, heatmap)

        full_res = self.up1(x)
        low_res = self.pool1(x)
        low_res = self.low1(low_res)
        low_res = self.low2(low_res)
        low_res = self.low3(low_res)
        return full_res + self.up2(low_res)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class E2HTransform(nn.Module):
    """Fixed 1x1 convolution that spreads edge maps onto their landmarks."""

    def __init__(self, edge_info, num_points, num_edges):
        """Build the constant point-by-edge membership weights.

        Args:
            edge_info: Sequence of ``(flag, point_indices)``, one per edge;
                only the indices are used.
            num_points: Number of output (landmark) channels.
            num_edges: Number of input (edge-map) channels.
        """
        super().__init__()
        membership = torch.zeros(num_points, num_edges, dtype=torch.float32)
        for edge_id, (_, point_ids) in enumerate(edge_info):
            for point_id in point_ids:
                membership[point_id, edge_id] = 1

        kernel = membership.view(membership.size(0), membership.size(1), 1, 1)
        self.register_buffer('weight', kernel)
        # Landmarks that belong to no edge get a bias of 1, so their channel
        # is the constant 1.
        orphan = (membership.sum(dim=1) < 0.5).to(membership)
        self.register_buffer('bias', orphan)

    def forward(self, edgemaps):
        """Map (B, num_edges, H, W) edge maps to (B, num_points, H, W)."""
        return F.conv2d(edgemaps, weight=self.weight, bias=self.bias)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ── StackedHGNetV1 ────────────────────────────────────────────────────────────
|
|
209
|
+
|
|
210
|
+
class StackedHGNetV1(nn.Module):
    """Stacked-hourglass landmark network with optional edge/point branches."""

    def __init__(self, config, classes_num, edge_info,
                 nstack=4, nlevels=4, in_channel=256, increase=0,
                 add_coord=True, decoder_type='default'):
        """Assemble the stem, the hourglass stacks and the output heads.

        Args:
            config: Object exposing ``width``, ``height`` and ``use_AAM``.
            classes_num: ``[num_heatmaps, num_edges, num_points]``.
            edge_info: Edge definitions forwarded to ``E2HTransform``.
            nstack: Number of hourglass stacks.
            nlevels: Recursion depth of each hourglass.
            in_channel: Feature channels inside the stacks.
            increase: Channel increase per hourglass level.
            add_coord: Use CoordConv layers in the stem and the hourglasses.
            decoder_type: Name passed to ``get_decoder``.
        """
        super().__init__()
        self.cfg = config
        self.decoder = get_decoder(decoder_type)
        self.nstack = nstack
        self.add_coord = add_coord
        self.num_heats = classes_num[0]

        if add_coord:
            first_conv = CoordConvTh(x_dim=config.width, y_dim=config.height,
                                     with_r=True, with_boundary=False,
                                     relu=True, bn=True, in_channels=3, out_channels=64,
                                     kernel_size=7, stride=2, padding=3)
        else:
            first_conv = ConvBlock(3, 64, 7, 2, bn=True, relu=True)

        self.pre = nn.Sequential(first_conv, ResBlock(64, 128),
                                 nn.MaxPool2d(2, 2), ResBlock(128, 128), ResBlock(128, in_channel))

        # The stem halves the resolution twice (stride-2 conv and 2x2
        # max-pool), so the hourglass feature maps are input_size // 4.  The
        # divisor used to be ``nstack``, which only matched for nstack == 4
        # and gave the CoordConv layers a wrong grid size otherwise.
        stem_stride = 4
        self.hgs = nn.ModuleList([
            Hourglass(n=nlevels, f=in_channel, increase=increase, add_coord=add_coord,
                      first_one=(i == 0),
                      x_dim=config.width // stem_stride,
                      y_dim=config.height // stem_stride)
            for i in range(nstack)])

        self.features = nn.ModuleList([
            nn.Sequential(ResBlock(in_channel, in_channel),
                          ConvBlock(in_channel, in_channel, 1, bn=True, relu=True))
            for _ in range(nstack)])

        self.out_heatmaps = nn.ModuleList([ConvBlock(in_channel, self.num_heats, 1, relu=False, bn=False) for _ in range(nstack)])
        # Merge layers feed stack i's outputs into stack i + 1, hence nstack - 1.
        self.merge_features = nn.ModuleList([ConvBlock(in_channel, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
        self.merge_heatmaps = nn.ModuleList([ConvBlock(self.num_heats, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
        self.heatmap_act = Activation("in+relu", self.num_heats)

        if config.use_AAM:
            self.num_edges = classes_num[1]
            self.num_points = classes_num[2]
            self.e2h = E2HTransform(edge_info, self.num_points, self.num_edges)
            self.out_edgemaps = nn.ModuleList([ConvBlock(in_channel, self.num_edges, 1, relu=False, bn=False) for _ in range(nstack)])
            self.out_pointmaps = nn.ModuleList([ConvBlock(in_channel, self.num_points, 1, relu=False, bn=False) for _ in range(nstack)])
            self.merge_edgemaps = nn.ModuleList([ConvBlock(self.num_edges, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
            self.merge_pointmaps = nn.ModuleList([ConvBlock(self.num_points, in_channel, 1, relu=False, bn=False) for _ in range(nstack - 1)])
            self.edgemap_act = Activation("sigmoid", self.num_edges)
            self.pointmap_act = Activation("sigmoid", self.num_points)

    def forward(self, x):
        """Run every stack and decode landmarks from each one.

        Returns:
            ``(y, fusionmaps, landmarks)`` where ``y`` lists, per stack, the
            decoded landmarks (followed by the point maps and edge maps when
            ``use_AAM`` is set), ``fusionmaps`` lists the per-stack maps that
            were decoded, and ``landmarks`` is the last stack's result.
        """
        x = self.pre(x)
        y, fusionmaps, heatmaps = [], [], None

        for i in range(self.nstack):
            # The previous stack's heatmaps drive the boundary channels of
            # this stack's CoordConv (None for the first stack).
            hg = self.hgs[i](x, heatmap=heatmaps)
            feature = self.features[i](hg)
            heatmaps = self.heatmap_act(self.out_heatmaps[i](feature))

            if self.cfg.use_AAM:
                pointmaps = self.pointmap_act(self.out_pointmaps[i](feature))
                edgemaps = self.edgemap_act(self.out_edgemaps[i](feature))
                fusion = self.e2h(edgemaps) * pointmaps * heatmaps
            else:
                fusion = heatmaps

            landmarks = self.decoder.get_coords_from_heatmap(fusion)

            if i < self.nstack - 1:
                x = x + self.merge_features[i](feature) + self.merge_heatmaps[i](heatmaps)
                if self.cfg.use_AAM:
                    x += self.merge_pointmaps[i](pointmaps) + self.merge_edgemaps[i](edgemaps)

            y.append(landmarks)
            if self.cfg.use_AAM:
                y.append(pointmaps)
                y.append(edgemaps)
            fusionmaps.append(fusion)

        return y, fusionmaps, landmarks
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
import math
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
import torch.nn as nn
|
|
5
|
+
import torch.nn.functional as F
|
|
6
|
+
import timm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
# ── Graph Neural Network (AU head) ────────────────────────────────────────────
|
|
10
|
+
|
|
11
|
+
def _normalize_digraph(A):
|
|
12
|
+
b, n, _ = A.shape
|
|
13
|
+
degs_inv_sqrt = A.detach().sum(dim=-1) ** -0.5
|
|
14
|
+
norm = torch.eye(n, device=A.device).view(1, n, n) * degs_inv_sqrt.view(b, n, 1)
|
|
15
|
+
return torch.bmm(torch.bmm(norm, A), norm)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GNN(nn.Module):
    """Single graph layer over a k-nearest-neighbour graph built from the input."""

    def __init__(self, in_channels, num_classes, neighbor_num=4, metric='dots'):
        """Create the two linear maps and the batch norm.

        Args:
            in_channels: Feature size of every node.
            num_classes: Number of nodes (also the batch-norm channel count).
            neighbor_num: ``k`` used when thresholding the similarity matrix.
            metric: ``'dots'``, ``'cosine'`` or ``'l1'``.
        """
        super().__init__()
        self.metric = metric
        self.neighbor_num = neighbor_num
        self.U = nn.Linear(in_channels, in_channels)
        self.V = nn.Linear(in_channels, in_channels)
        self.bnv = nn.BatchNorm1d(num_classes)
        self.relu = nn.ReLU()

        std = math.sqrt(2. / in_channels)
        nn.init.normal_(self.U.weight, 0, std)
        nn.init.normal_(self.V.weight, 0, std)
        self.bnv.weight.data.fill_(1)
        self.bnv.bias.data.zero_()

    def forward(self, x):
        """Aggregate neighbour features; ``x`` has shape (batch, nodes, channels).

        Raises:
            ValueError: If ``self.metric`` is not a supported name.
        """
        b, n, c = x.shape
        # The graph is built from detached features: no gradient flows
        # through the adjacency.
        nodes = x.detach()
        metric = self.metric
        if metric == 'l1':
            si = torch.abs(nodes.unsqueeze(2) - nodes.unsqueeze(1)).sum(dim=-1)
        elif metric in ('dots', 'cosine'):
            if metric == 'cosine':
                nodes = F.normalize(nodes, p=2, dim=-1)
            si = torch.einsum('bij,bjk->bik', nodes, nodes.transpose(1, 2))
        else:
            raise ValueError(f"Unknown metric: {self.metric}")

        # 'l1' is a distance (smaller = closer); the others are similarities.
        largest = self.metric != 'l1'
        kth = si.topk(k=self.neighbor_num, dim=-1, largest=largest)[0][:, :, -1]
        threshold = kth.view(b, n, 1)
        if largest:
            adj = (si >= threshold).float()
        else:
            adj = (si <= threshold).float()

        A = _normalize_digraph(adj)
        agg = torch.einsum('bij,bjk->bik', A, self.V(x))
        return self.relu(x + self.bnv(agg + self.U(x)))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class AUHead(nn.Module):
    """Action-unit head: per-class projections, a GNN and cosine scoring."""

    def __init__(self, in_channels, num_classes, neighbor_num=4, metric='dots'):
        """Create one linear layer per class, the GNN and the class vectors."""
        super().__init__()
        per_class = [nn.Linear(in_channels, in_channels) for _ in range(num_classes)]
        self.class_linears = nn.ModuleList(per_class)
        self.gnn = GNN(in_channels, num_classes, neighbor_num=neighbor_num, metric=metric)
        self.sc = nn.Parameter(torch.zeros(num_classes, in_channels))
        self.relu = nn.ReLU()
        nn.init.xavier_uniform_(self.sc)

    def forward(self, x):
        """Score every class for ``x``; returns a (batch, num_classes) tensor.

        Each score is the dot product of the L2-normalised node feature and
        the L2-normalised, ReLU-ed class vector.
        """
        # One projected copy of the features per class: (batch, classes, channels).
        node_feats = torch.stack([linear(x) for linear in self.class_linears], dim=1)
        node_feats = self.gnn(node_feats)

        class_vectors = F.normalize(self.relu(self.sc), p=2, dim=-1)
        node_dirs = F.normalize(node_feats, p=2, dim=-1)
        return (node_dirs * class_vectors.unsqueeze(0)).sum(dim=-1)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ── MTL Model ─────────────────────────────────────────────────────────────────
|
|
71
|
+
|
|
72
|
+
class MTL(nn.Module):
    """Multi-task model: a shared timm backbone with emotion, gaze and AU heads."""

    def __init__(self, base_model_name='tf_efficientnet_b0_ns', expr_classes=8, au_numbers=8):
        """Create the backbone (without pretrained weights) and the three heads.

        Args:
            base_model_name: Model name passed to ``timm.create_model``.
            expr_classes: Number of emotion outputs.
            au_numbers: Number of action-unit outputs.
        """
        super().__init__()
        backbone = timm.create_model(base_model_name, pretrained=False)
        # Drop the classifier so the backbone returns its feature vector.
        backbone.classifier = nn.Identity()
        self.base_model = backbone
        d = backbone.num_features

        self.relu = nn.ReLU()
        self.fc_emotion = nn.Linear(d, d)
        self.fc_gaze = nn.Linear(d, d)
        self.fc_au = nn.Linear(d, d)

        self.emotion_classifier = nn.Linear(d, expr_classes)
        self.gaze_regressor = nn.Linear(d, 2)
        self.au_regressor = AUHead(in_channels=d, num_classes=au_numbers, neighbor_num=4, metric='dots')

    def forward(self, x):
        """Return ``(emotion, gaze, au)`` predictions for the image batch ``x``."""
        feat = self.base_model(x)

        emotion_hidden = self.relu(self.fc_emotion(feat))
        emotion = self.emotion_classifier(emotion_hidden)

        gaze_hidden = self.relu(self.fc_gaze(feat))
        gaze = self.gaze_regressor(gaze_hidden)

        au_hidden = self.relu(self.fc_au(feat))
        au = self.au_regressor(au_hidden)
        return emotion, gaze, au
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from typing import Dict, List, Tuple
|
|
2
|
+
|
|
3
|
+
import torch
|
|
4
|
+
import torch.nn as nn
|
|
5
|
+
import torch.nn.functional as F
|
|
6
|
+
from torchvision import models
|
|
7
|
+
from torchvision.models import _utils
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# ── FPN / SSH building blocks ─────────────────────────────────────────────────
|
|
11
|
+
|
|
12
|
+
def _conv_bn(inp, oup, stride=1, leaky=0):
    """Build a 3x3 conv (padding 1, no bias) -> BatchNorm -> LeakyReLU stack.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride of the convolution.
        leaky: Negative slope of the in-place LeakyReLU; the default of 0
            makes it behave as a plain ReLU.

    Returns:
        An ``nn.Sequential`` holding the three layers at indices 0, 1, 2.
    """
    layers = [
        nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False),
        nn.BatchNorm2d(oup),
        nn.LeakyReLU(negative_slope=leaky, inplace=True),
    ]
    return nn.Sequential(*layers)
|
|
17
|
+
|
|
18
|
+
def _conv_bn_no_relu(inp, oup, stride):
    """Build a 3x3 conv (padding 1, no bias) -> BatchNorm stack with no activation.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride of the convolution.

    Returns:
        An ``nn.Sequential`` holding the two layers at indices 0 and 1.
    """
    conv = nn.Conv2d(inp, oup, kernel_size=3, stride=stride, padding=1, bias=False)
    norm = nn.BatchNorm2d(oup)
    return nn.Sequential(conv, norm)
|
|
22
|
+
|
|
23
|
+
def _conv_bn1x1(inp, oup, stride, leaky=0):
    """Build a 1x1 conv (no padding, no bias) -> BatchNorm -> LeakyReLU stack.

    Args:
        inp: Number of input channels.
        oup: Number of output channels.
        stride: Stride of the convolution.
        leaky: Negative slope of the in-place LeakyReLU; the default of 0
            makes it behave as a plain ReLU.

    Returns:
        An ``nn.Sequential`` holding the three layers at indices 0, 1, 2.
    """
    layers = [
        nn.Conv2d(inp, oup, kernel_size=1, stride=stride, padding=0, bias=False),
        nn.BatchNorm2d(oup),
        nn.LeakyReLU(negative_slope=leaky, inplace=True),
    ]
    return nn.Sequential(*layers)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SSH(nn.Module):
    """Three-branch context module whose outputs are concatenated on channels.

    The first branch contributes ``out_channel // 2`` channels and the other
    two ``out_channel // 4`` each. All branches are built from the file's 3x3
    conv helpers; the second and third share their first conv block.

    Args:
        in_channel: Channels of the incoming feature map.
        out_channel: Total output channels; must be divisible by 4.

    Raises:
        ValueError: If ``out_channel`` is not divisible by 4.
    """

    def __init__(self, in_channel, out_channel):
        super().__init__()
        if out_channel % 4:
            raise ValueError(f"out_channel must be divisible by 4, got {out_channel}")

        half = out_channel // 2
        quarter = out_channel // 4
        # Narrow modules (<= 64 channels) use a leaky slope, wider ones plain ReLU.
        leaky = 0.1 if out_channel <= 64 else 0

        # Attribute names (including the mixed-case ``conv7x7_3``) are kept
        # exactly as they were: they are the module's state_dict keys.
        self.conv3X3 = _conv_bn_no_relu(in_channel, half, stride=1)
        self.conv5X5_1 = _conv_bn(in_channel, quarter, stride=1, leaky=leaky)
        self.conv5X5_2 = _conv_bn_no_relu(quarter, quarter, stride=1)
        self.conv7X7_2 = _conv_bn(quarter, quarter, stride=1, leaky=leaky)
        self.conv7x7_3 = _conv_bn_no_relu(quarter, quarter, stride=1)

    def forward(self, x):
        """Return the ReLU of the channel-wise concatenation of the three branches."""
        branch3 = self.conv3X3(x)

        # Shared stem for the two deeper branches.
        stem = self.conv5X5_1(x)
        branch5 = self.conv5X5_2(stem)
        branch7 = self.conv7x7_3(self.conv7X7_2(stem))

        merged = torch.cat((branch3, branch5, branch7), dim=1)
        return F.relu(merged)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class FPN(nn.Module):
    """Feature pyramid over three backbone feature maps.

    Args:
        in_channels_list: Channel counts of the three inputs; indices 0, 1
            and 2 are read.
        out_channels: Channel count shared by every output map.
    """

    def __init__(self, in_channels_list, out_channels):
        super().__init__()
        # Narrow pyramids (<= 64 channels) use a leaky slope, wider ones plain ReLU.
        leaky = 0.1 if out_channels <= 64 else 0

        # Lateral 1x1 projections, one per input level.
        self.output1 = _conv_bn1x1(in_channels_list[0], out_channels, stride=1, leaky=leaky)
        self.output2 = _conv_bn1x1(in_channels_list[1], out_channels, stride=1, leaky=leaky)
        self.output3 = _conv_bn1x1(in_channels_list[2], out_channels, stride=1, leaky=leaky)

        # 3x3 blocks applied after each merge.
        self.merge1 = _conv_bn(out_channels, out_channels, leaky=leaky)
        self.merge2 = _conv_bn(out_channels, out_channels, leaky=leaky)

    def forward(self, x: Dict[str, torch.Tensor]) -> List[torch.Tensor]:
        """Merge the three feature maps and return them as ``[o1, o2, o3]``.

        The maps are taken from ``x`` in dict iteration order.
        NOTE(review): presumably ordered from finest to coarsest resolution
        -- confirm against the ``return_layers`` passed by the caller.
        """
        feats = list(x.values())
        o1 = self.output1(feats[0])
        o2 = self.output2(feats[1])
        o3 = self.output3(feats[2])

        # Resize level 3 to level 2's spatial size, add, then smooth.
        up3 = F.interpolate(o3, size=o2.shape[2:], mode='nearest')
        o2 = self.merge2(o2 + up3)

        # Same step from the merged level 2 into level 1.
        up2 = F.interpolate(o2, size=o1.shape[2:], mode='nearest')
        o1 = self.merge1(o1 + up2)

        return [o1, o2, o3]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ── Prediction heads ──────────────────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
class _ClassHead(nn.Module):
    """1x1 conv head that emits two values per anchor at every spatial location.

    Args:
        in_channels: Channels of the incoming feature map.
        num_anchors: Anchors per spatial location.
    """

    def __init__(self, in_channels=512, num_anchors=2):
        super().__init__()
        self.conv1x1 = nn.Conv2d(in_channels, num_anchors * 2, kernel_size=1)

    def forward(self, x):
        """Return a ``(batch, -1, 2)`` view of the head output."""
        batch = x.shape[0]
        out = self.conv1x1(x)
        # Move channels last so each anchor's values are adjacent in memory
        # before the tensor is flattened to rows of 2.
        out = out.permute(0, 2, 3, 1).contiguous()
        return out.view(batch, -1, 2)
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class _BboxHead(nn.Module):
    """1x1 conv head that emits four values per anchor at every spatial location.

    Args:
        in_channels: Channels of the incoming feature map.
        num_anchors: Anchors per spatial location.
    """

    def __init__(self, in_channels=512, num_anchors=2):
        super().__init__()
        self.conv1x1 = nn.Conv2d(in_channels, num_anchors * 4, kernel_size=1)

    def forward(self, x):
        """Return a ``(batch, -1, 4)`` view of the head output."""
        batch = x.shape[0]
        out = self.conv1x1(x)
        # Move channels last so each anchor's values are adjacent in memory
        # before the tensor is flattened to rows of 4.
        out = out.permute(0, 2, 3, 1).contiguous()
        return out.view(batch, -1, 4)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class _LandmarkHead(nn.Module):
    """1x1 conv head that emits ten values per anchor at every spatial location.

    Args:
        in_channels: Channels of the incoming feature map.
        num_anchors: Anchors per spatial location.
    """

    def __init__(self, in_channels=512, num_anchors=2):
        super().__init__()
        self.conv1x1 = nn.Conv2d(in_channels, num_anchors * 10, kernel_size=1)

    def forward(self, x):
        """Return a ``(batch, -1, 10)`` view of the head output."""
        batch = x.shape[0]
        out = self.conv1x1(x)
        # Move channels last so each anchor's values are adjacent in memory
        # before the tensor is flattened to rows of 10.
        out = out.permute(0, 2, 3, 1).contiguous()
        return out.view(batch, -1, 10)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ── RetinaFace ────────────────────────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
class RetinaFace(nn.Module):
    """RetinaFace detector: ResNet-50 body, FPN, SSH context modules and heads.

    Args:
        name: Backbone name; only ``"Resnet50"`` is accepted.
        pretrained: When truthy, the torchvision ResNet-50 is created with
            its ImageNet (``IMAGENET1K_V1``) weights, otherwise uninitialised
            by any checkpoint.
        in_channels: Base channel count; the FPN inputs are built as
            ``in_channels * 2``, ``* 4`` and ``* 8``.
        return_layers: Mapping handed to ``IntermediateLayerGetter`` that
            selects which backbone layers are returned.
        out_channels: Channel count of the FPN/SSH feature maps.

    Raises:
        NotImplementedError: If ``name`` is not ``"Resnet50"``.
    """

    def __init__(self, name, pretrained, in_channels, return_layers, out_channels):
        super().__init__()
        if name != "Resnet50":
            raise NotImplementedError(f"Only Resnet50 supported, got {name}")

        # ``pretrained=`` has been deprecated in torchvision since 0.13; the
        # ``weights=`` API is the supported way to ask for the same
        # IMAGENET1K_V1 checkpoint that ``pretrained=True`` used to load.
        weights = models.ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = models.resnet50(weights=weights)
        self.body = _utils.IntermediateLayerGetter(backbone, return_layers)

        # NOTE(review): presumably ``in_channels`` is 256 so these match the
        # 512/1024/2048-wide ResNet-50 stages -- confirm against the caller.
        c = in_channels
        self.fpn = FPN([c * 2, c * 4, c * 8], out_channels)

        # One context module per pyramid level.
        self.ssh1 = SSH(out_channels, out_channels)
        self.ssh2 = SSH(out_channels, out_channels)
        self.ssh3 = SSH(out_channels, out_channels)

        # One head of each kind per pyramid level. The attribute names are
        # state_dict keys and must not change.
        self.ClassHead = nn.ModuleList([_ClassHead(out_channels) for _ in range(3)])
        self.BboxHead = nn.ModuleList([_BboxHead(out_channels) for _ in range(3)])
        self.LandmarkHead = nn.ModuleList([_LandmarkHead(out_channels) for _ in range(3)])

    def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(bbox, cls, ldm)`` with the three levels concatenated on dim 1.

        The last dimension is 4 for ``bbox``, 2 for ``cls`` and 10 for ``ldm``.
        """
        fpn_out = self.fpn(self.body(x))
        features = [self.ssh1(fpn_out[0]), self.ssh2(fpn_out[1]), self.ssh3(fpn_out[2])]

        bbox = torch.cat([head(f) for head, f in zip(self.BboxHead, features)], dim=1)
        cls = torch.cat([head(f) for head, f in zip(self.ClassHead, features)], dim=1)
        ldm = torch.cat([head(f) for head, f in zip(self.LandmarkHead, features)], dim=1)
        return bbox, cls, ldm
|
openface3/pipeline.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
import time
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import torch
|
|
5
|
+
from huggingface_hub import hf_hub_download
|
|
6
|
+
|
|
7
|
+
from .detection import Detector
|
|
8
|
+
from .alignment import Aligner
|
|
9
|
+
from .analysis import Analyzer
|
|
10
|
+
|
|
11
|
+
# Hugging Face repository the aligner and analyzer weights are downloaded from.
HF_REPO_ID = "haneeshbyreddy/openface3-weights"

# Emotion names; Pipeline.serialize indexes this list with the argmax of the
# emotion logits.
EMOTION_LABELS = [
    'Neutral',
    'Happy',
    'Sad',
    'Surprise',
    'Fear',
    'Disgust',
    'Anger',
    'Contempt',
]

# Action-unit identifiers; Pipeline.serialize pairs them positionally with the
# AU output values.
AU_IDS = [
    'AU01',
    'AU02',
    'AU04',
    'AU06',
    'AU09',
    'AU12',
    'AU25',
    'AU26',
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class Pipeline:
    """Run face detection, landmark alignment and multitask analysis on an image.

    Construction loads the detector weights, downloads ``aligner.pkl`` and
    ``analyzer.pth`` from the ``HF_REPO_ID`` Hugging Face repository, builds
    the three stages on ``device`` and runs each of them once on dummy input.
    """

    def __init__(self, device='cpu'):
        """Build the detector, aligner and analyzer on ``device`` and warm them up."""
        self.detector = Detector(device=device)
        self.detector.load_weights()
        self.detector.eval()

        landmark_path = hf_hub_download(HF_REPO_ID, 'aligner.pkl')
        multitask_path = hf_hub_download(HF_REPO_ID, 'analyzer.pth')
        self.aligner = Aligner(model_path=landmark_path, device=device)
        self.analyzer = Analyzer(model_path=multitask_path, device=device)
        self._warmup()

    def _warmup(self):
        """Run every stage once on all-zero input.

        NOTE(review): presumably this absorbs one-off first-call costs so
        they do not distort the timings reported by ``run`` -- confirm.
        """
        dummy_det = torch.zeros(1, 3, self.detector.max_size, self.detector.max_size,
                                device=self.detector.device)
        self.detector.infer(dummy_det)

        dummy_crop = np.zeros((256, 256, 3), dtype=np.uint8)
        self.aligner.infer(dummy_crop)

        dummy_face = self.analyzer.preprocess(np.zeros((224, 224, 3), dtype=np.uint8))
        self.analyzer.infer(dummy_face)

    @staticmethod
    def _clip_bbox(bbox, width, height):
        """Clamp ``(x1, y1, x2, y2)`` to the ``[0, width] x [0, height]`` rectangle."""
        x1, y1, x2, y2 = bbox
        return (
            min(max(x1, 0), width),
            min(max(y1, 0), height),
            min(max(x2, 0), width),
            min(max(y2, 0), height),
        )

    def run(self, image: np.ndarray) -> tuple:
        """Process one image and return ``(results, timings)``.

        Args:
            image: ``(H, W, 3)`` uint8 RGB array.

        Returns:
            ``results``: one dict per detection kept at a confidence
            threshold of 0.95, holding the detection's own fields plus
            ``'landmarks'`` and, when the face crop is non-empty, the fields
            returned by ``Analyzer.postprocess``.
            ``timings``: milliseconds per stage. The ``*_infer_ms`` entries
            cover only the ``infer`` calls, the ``*_ms`` entries also include
            pre- and post-processing; align/analyze values are summed over
            all faces.
        """
        t_total = time.perf_counter()

        t0 = time.perf_counter()
        tensor, meta = self.detector.preprocess(image)
        t1 = time.perf_counter()
        raw_det = self.detector.infer(tensor)
        detect_infer_ms = (time.perf_counter() - t1) * 1000.0
        detections = self.detector.postprocess(raw_det, meta, confidence_threshold=0.95)
        detect_ms = (time.perf_counter() - t0) * 1000.0

        results = []
        align_ms = 0.0
        align_infer_ms = 0.0
        analyze_ms = 0.0
        analyze_infer_ms = 0.0

        img_h, img_w = image.shape[:2]

        for det in detections:
            t0 = time.perf_counter()
            crop, matrix = self.aligner.preprocess(image, det['bbox'])
            t1 = time.perf_counter()
            raw_lm = self.aligner.infer(crop)
            align_infer_ms += (time.perf_counter() - t1) * 1000.0
            landmarks = self.aligner.postprocess(raw_lm, matrix)
            align_ms += (time.perf_counter() - t0) * 1000.0

            # A negative coordinate would wrap around under numpy slicing and
            # yield a wrong or empty crop, so clamp the box to the image.
            # Detector.postprocess is not visible here; if it already clips,
            # this is a no-op. The bbox reported in ``results`` is untouched.
            x1, y1, x2, y2 = self._clip_bbox(det['bbox'], img_w, img_h)
            face_crop = image[y1:y2, x1:x2]

            if face_crop.size > 0:
                t0 = time.perf_counter()
                tensor_face = self.analyzer.preprocess(face_crop)
                t1 = time.perf_counter()
                raw_ana = self.analyzer.infer(tensor_face)
                analyze_infer_ms += (time.perf_counter() - t1) * 1000.0
                analysis = self.analyzer.postprocess(raw_ana)
                analyze_ms += (time.perf_counter() - t0) * 1000.0
            else:
                # Degenerate box: keep the detection but attach no analysis.
                analysis = {}

            results.append({**det, 'landmarks': landmarks, **analysis})

        total_ms = (time.perf_counter() - t_total) * 1000.0
        timings = {
            'detect_ms': detect_ms,
            'detect_infer_ms': detect_infer_ms,
            'align_ms': align_ms,
            'align_infer_ms': align_infer_ms,
            'analyze_ms': analyze_ms,
            'analyze_infer_ms': analyze_infer_ms,
            'total_ms': total_ms,
        }
        return results, timings

    @staticmethod
    def serialize(results: list, timings: dict, frame_n: int = 0, ts: float = 0.0) -> dict:
        """
        Convert pipeline output to a JSON-ready dict.
        Both main.py (JSONL recording) and the backend API use this format.

        Args:
            results: Per-face dicts as returned by ``run``.
            timings: Timing dict as returned by ``run``.
            frame_n: Frame index stored under ``'frame'``.
            ts: Timestamp stored under ``'ts'``, rounded to 4 decimals.

        Returns:
            A dict with ``'frame'``, ``'ts'``, ``'faces'`` and ``'timings'``.
            Faces without analysis get a ``None`` emotion label, empty
            logits, zero gaze and no action units. Action-unit values are
            clipped to ``[0, 1]``.
        """
        faces = []
        for r in results:
            em_logits = r['emotion'].flatten().tolist() if r.get('emotion') is not None else []
            top_em = int(np.argmax(em_logits)) if em_logits else 0
            gaze = r['gaze'].flatten().tolist() if r.get('gaze') is not None else [0.0, 0.0]
            au_vals = r['au'].flatten().tolist() if r.get('au') is not None else []

            faces.append({
                # Going through numpy turns an ndarray or numpy scalars into
                # plain Python numbers, which json.dumps can encode.
                'bbox': np.asarray(r['bbox']).tolist(),
                'confidence': round(float(r['score']), 4),
                'landmarks': r['landmarks'].tolist(),
                'emotion': {
                    'label': EMOTION_LABELS[top_em] if em_logits else None,
                    'logits': [round(v, 4) for v in em_logits],
                },
                'gaze': {
                    'horizontal_deg': round(gaze[0], 2),
                    'vertical_deg': round(gaze[1], 2),
                },
                'action_units': {
                    au_id: round(float(np.clip(v, 0.0, 1.0)), 4)
                    for au_id, v in zip(AU_IDS, au_vals)
                },
            })

        return {
            'frame': frame_n,
            'ts': round(ts, 4),
            'faces': faces,
            'timings': {
                'detect': {'total_ms': round(timings['detect_ms'], 2),
                           'infer_ms': round(timings['detect_infer_ms'], 2)},
                'align': {'total_ms': round(timings['align_ms'], 2),
                          'infer_ms': round(timings['align_infer_ms'], 2)},
                'analyze': {'total_ms': round(timings['analyze_ms'], 2),
                            'infer_ms': round(timings['analyze_infer_ms'], 2)},
                'total_ms': round(timings['total_ms'], 2),
            },
        }
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: openface3
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Facial analysis pipeline: detection, 98-point alignment, and multitask emotion/gaze/AU inference
|
|
5
|
+
Project-URL: Homepage, https://github.com/haneeshbyreddy/openface3
|
|
6
|
+
Project-URL: Issues, https://github.com/haneeshbyreddy/openface3/issues
|
|
7
|
+
Project-URL: HuggingFace, https://huggingface.co/haneeshbyreddy/openface3-weights
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2026 Haneesh Byreddy
|
|
11
|
+
|
|
12
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
13
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
14
|
+
in the Software without restriction, including without limitation the rights
|
|
15
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
17
|
+
furnished to do so, subject to the following conditions:
|
|
18
|
+
|
|
19
|
+
The above copyright notice and this permission notice shall be included in all
|
|
20
|
+
copies or substantial portions of the Software.
|
|
21
|
+
|
|
22
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
23
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
24
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
25
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
26
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
27
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
28
|
+
SOFTWARE.
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Keywords: action-units,emotion,face,facial-analysis,gaze
|
|
31
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
32
|
+
Classifier: Operating System :: OS Independent
|
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
|
34
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
35
|
+
Requires-Python: >=3.9
|
|
36
|
+
Requires-Dist: albumentations
|
|
37
|
+
Requires-Dist: huggingface-hub
|
|
38
|
+
Requires-Dist: iglovikov-helper-functions
|
|
39
|
+
Requires-Dist: numpy
|
|
40
|
+
Requires-Dist: opencv-python
|
|
41
|
+
Requires-Dist: timm
|
|
42
|
+
Requires-Dist: torch>=2.0
|
|
43
|
+
Requires-Dist: torchvision>=0.15
|
|
44
|
+
Description-Content-Type: text/markdown
|
|
45
|
+
|
|
46
|
+
# openface3
|
|
47
|
+
|
|
48
|
+
Facial analysis pipeline: face detection, 98-point landmark alignment, and multitask emotion / gaze / action unit inference.
|
|
49
|
+
|
|
50
|
+
## Install
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install openface3
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Weights are downloaded automatically from [Hugging Face](https://huggingface.co/haneeshbyreddy/openface3-weights) on first use and cached via the standard HF cache (`~/.cache/huggingface/hub/`).
|
|
57
|
+
|
|
58
|
+
## Usage
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
import cv2
|
|
62
|
+
from openface3 import Pipeline
|
|
63
|
+
|
|
64
|
+
pipeline = Pipeline(device='cuda') # or 'cpu'
|
|
65
|
+
|
|
66
|
+
image = cv2.cvtColor(cv2.imread('image.jpg'), cv2.COLOR_BGR2RGB)
|
|
67
|
+
results, timings = pipeline.run(image)
|
|
68
|
+
|
|
69
|
+
for face in results:
|
|
70
|
+
print(face['emotion']) # (1, 8) logits — softmax to get probabilities
|
|
71
|
+
print(face['gaze']) # (1, 2) [horizontal_deg, vertical_deg]
|
|
72
|
+
print(face['au']) # (1, 8) AU activations in [0, 1]
|
|
73
|
+
print(face['landmarks']) # (98, 2) WFLW keypoints in image coords
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Serialize a frame to a JSON-ready dict:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
payload = Pipeline.serialize(results, timings, frame_n=0, ts=0.0)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Labels
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from openface3 import EMOTION_LABELS, AU_IDS
|
|
86
|
+
# EMOTION_LABELS: ['Neutral','Happy','Sad','Surprise','Fear','Disgust','Anger','Contempt']
|
|
87
|
+
# AU_IDS: ['AU01','AU02','AU04','AU06','AU09','AU12','AU25','AU26']
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## License
|
|
91
|
+
|
|
92
|
+
MIT
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
openface3/__init__.py,sha256=tFFij0ysa_9aXqFHCt9YNI5kFed7gXF8IMGDjuZQFbU,55
|
|
2
|
+
openface3/alignment.py,sha256=JviB2nErtLbt7jtrK_z7QRcS1mV0nVOgzPPnYclLGG0,5451
|
|
3
|
+
openface3/analysis.py,sha256=lfLu03JqRfJ9e4sh2k9rEf_YwPV6I6HuagxCil2RIBA,1367
|
|
4
|
+
openface3/detection.py,sha256=-6pFf6ZAE7Hy_9U4G79Rfw3b1Mj3wYLgRcCmRpR7jOY,5795
|
|
5
|
+
openface3/pipeline.py,sha256=FyMNCRfsidt8FIzoXPS2BCg5jty-B0FiB8wM_Gebq5Q,5738
|
|
6
|
+
openface3/models/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
openface3/models/landmark.py,sha256=f7DMmteI0HsdEhhnnM_Ie6EUIVX-aLCA6y_nGFdlaqo,13623
|
|
8
|
+
openface3/models/multitask.py,sha256=PdPRf-fNoil4OCm33pBdEfsFCVPeK0of7Cwe7BwgnRs,3996
|
|
9
|
+
openface3/models/retinaface.py,sha256=86_NipPze8iNkbUNwaxHAwPsOHCrV8poc-0HTEgAFWo,5559
|
|
10
|
+
openface3-0.1.0.dist-info/METADATA,sha256=HYl6zbPuG_4WxxoysR5UqM3WivilqttnZSUAFqvA5OQ,3480
|
|
11
|
+
openface3-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
12
|
+
openface3-0.1.0.dist-info/licenses/LICENSE,sha256=-vgb05Q13y-cnll7Xd55R5oFInwVK6HeIn__nJ7jhq8,1072
|
|
13
|
+
openface3-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Haneesh Byreddy
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|