@camstack/addon-detection-pipeline 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ """Postprocessor dispatch table for inference_pool.py.
2
+
3
+ Each postprocessor takes raw model output (dict of numpy arrays) and model config,
4
+ returns a structured dict matching the StepOutput JSON format.
5
+ """
6
+ from .yolo import postprocess_yolo
7
+ from .yolo_seg import postprocess_yolo_seg
8
+ from .scrfd import postprocess_scrfd
9
+ from .arcface import postprocess_arcface
10
+ from .softmax import postprocess_softmax
11
+ from .ctc import postprocess_ctc
12
+ from .saliency import postprocess_saliency
13
+ from .yamnet import postprocess_yamnet
14
+
15
# Dispatch table: postprocessor id → handler function.
# The handlers defined in this package all share the signature
# (predictions, config, orig_w, orig_h, scale, pad) -> dict; presumably the
# yolo / yolo-seg entries (defined in sibling modules not shown here) match
# that signature too — verify against their modules.
# NOTE(review): keys look like the model-config "postprocessor" ids consumed
# by inference_pool.py — confirm against the caller.
POSTPROCESSORS = {
    "yolo": postprocess_yolo,
    "yolo-seg": postprocess_yolo_seg,
    "scrfd": postprocess_scrfd,
    "arcface": postprocess_arcface,
    "softmax": postprocess_softmax,
    "ctc": postprocess_ctc,
    "saliency": postprocess_saliency,
    "yamnet": postprocess_yamnet,
}
@@ -0,0 +1,31 @@
1
+ """ArcFace face embedding postprocessor.
2
+
3
+ Input: raw embedding tensor from ArcFace model
4
+ Output: {"kind": "embedding", "embedding": [...], "embeddingNorm": [...]}
5
+ """
6
+ import numpy as np
7
+
8
+
9
def postprocess_arcface(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Return the raw ArcFace embedding alongside its L2-normalized form.

    The first output tensor (whatever its key — usually "fc1" or similar) is
    flattened to a 1-D float32 vector. A zero vector is returned unchanged as
    its own "normalized" form to avoid dividing by zero.

    Returns:
        {"kind": "embedding", "embedding": [...], "embeddingNorm": [...]}
    """
    first_output = next(iter(predictions.values()))
    embedding = np.array(first_output).flatten().astype(np.float32)

    magnitude = np.linalg.norm(embedding)
    if magnitude == 0:
        unit = np.zeros_like(embedding)
    else:
        unit = embedding / magnitude

    return {
        "kind": "embedding",
        "embedding": embedding.tolist(),
        "embeddingNorm": unit.tolist(),
    }
@@ -0,0 +1,68 @@
1
+ """CTC text recognition postprocessor.
2
+
3
+ Used for: vgg_english_g2 (plate OCR).
4
+
5
+ Input: [1, seqLen, numChars] logits
6
+ Output: {"kind": "text", "text": "ABC123", "confidence": 0.95}
7
+
8
+ Algorithm: argmax per timestep → collapse consecutive duplicates → remove blank (index 0) → join
9
+ """
10
+ import numpy as np
11
+
12
+
13
def postprocess_ctc(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Greedily decode CTC logits into text.

    Takes the first output tensor ([1, T, C] or [T, C] logits), softmaxes each
    timestep, picks the argmax character, collapses consecutive repeats, and
    drops the blank symbol (index 0). Characters resolve through
    config["charset"]; indices past the charset fall back to chr(idx).

    Returns:
        {"kind": "text", "text": ..., "confidence": mean prob of kept chars
        (0.0 when nothing was decoded), rounded to 4 places}
    """
    charset = config.get("charset", [])

    logits = np.array(list(predictions.values())[0]).astype(np.float32)
    if logits.ndim == 3:
        logits = logits[0]  # strip the batch dimension → [T, C]

    # Row-wise numerically-stable softmax over the character axis.
    shifted = logits - logits.max(axis=1, keepdims=True)
    exp = np.exp(shifted)
    probs = exp / exp.sum(axis=1, keepdims=True)

    best = probs.argmax(axis=1)                       # [T]
    best_probs = probs[np.arange(probs.shape[0]), best]  # [T]

    # Single pass: keep the first of each run of repeats, skip blanks.
    pieces: list[str] = []
    piece_scores: list[float] = []
    previous = None
    for idx, prob in zip(best, best_probs):
        idx = int(idx)
        if idx != previous:
            if idx != 0:
                pieces.append(charset[idx] if idx < len(charset) else chr(idx))
                piece_scores.append(float(prob))
        previous = idx

    confidence = float(np.mean(piece_scores)) if piece_scores else 0.0

    return {
        "kind": "text",
        "text": "".join(pieces),
        "confidence": round(confidence, 4),
    }
@@ -0,0 +1,44 @@
1
+ """Saliency / segmentation refiner postprocessor.
2
+
3
+ Used for: u2netp.
4
+
5
+ Input: raw saliency map tensor
6
+ Output: {"kind": "mask", "mask": "<base64>", "maskWidth": N, "maskHeight": N}
7
+ """
8
+ import base64
9
+ import numpy as np
10
+
11
+
12
def postprocess_saliency(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Turn a raw saliency map into a base64-encoded binary mask.

    The first output tensor (U2-Net emits several maps; the first is used) is
    squeezed down to a 2-D [H, W] array, passed through a sigmoid, and
    thresholded at config["maskThreshold"] (default 0.5). Mask pixels are
    0 or 255 (uint8), serialized row-major.

    Returns:
        {"kind": "mask", "mask": "<base64>", "maskWidth": W, "maskHeight": H}
    """
    threshold = config.get("maskThreshold", 0.5)

    saliency_map = np.array(list(predictions.values())[0]).astype(np.float32)
    # Peel leading batch/channel axes until only [H, W] remains.
    while saliency_map.ndim > 2:
        saliency_map = saliency_map[0]

    probabilities = 1.0 / (1.0 + np.exp(-saliency_map))
    mask = np.where(probabilities > threshold, 255, 0).astype(np.uint8)

    height, width = mask.shape
    encoded = base64.b64encode(mask.tobytes()).decode("ascii")

    return {
        "kind": "mask",
        "mask": encoded,
        "maskWidth": int(width),
        "maskHeight": int(height),
    }
@@ -0,0 +1,212 @@
1
+ """SCRFD face detection postprocessor.
2
+
3
+ Multi-stride anchor-based face detector (strides 8, 16, 32).
4
+ Ported from packages/addon-vision/src/shared/postprocess/scrfd.ts.
5
+
6
+ Output: {"kind": "detections", "detections": [{"class": "face", "score", "bbox": [x1,y1,x2,y2], "landmarks?"}]}
7
+ """
8
+ import numpy as np
9
+
10
+
11
# Default feature-map strides for the SCRFD detection heads (overridable
# via config["strides"] in postprocess_scrfd).
STRIDES = [8, 16, 32]
# Anchors per feature-map cell; _generate_anchors emits this many duplicate
# centers for each (x, y) position.
NUM_ANCHORS_PER_STRIDE = 2
13
+
14
+
15
def _generate_anchors(stride: int, input_size: int) -> list[tuple[float, float]]:
    """Return anchor-center (x, y) coordinates for one detection stride.

    Centers are laid out row-major over a ceil(input_size / stride) square
    grid, with NUM_ANCHORS_PER_STRIDE duplicates per cell, each offset half a
    cell so it sits at the cell's center.
    """
    cells = int(np.ceil(input_size / stride))
    return [
        ((col + 0.5) * stride, (row + 0.5) * stride)
        for row in range(cells)
        for col in range(cells)
        for _ in range(NUM_ANCHORS_PER_STRIDE)
    ]
24
+
25
+
26
+ def _iou(a: tuple, b: tuple) -> float:
27
+ ax1, ay1, ax2, ay2 = a
28
+ bx1, by1, bx2, by2 = b
29
+ ix1, iy1 = max(ax1, bx1), max(ay1, by1)
30
+ ix2, iy2 = min(ax2, bx2), min(ay2, by2)
31
+ inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
32
+ if inter == 0:
33
+ return 0.0
34
+ union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
35
+ return inter / union if union > 0 else 0.0
36
+
37
+
38
+ def _nms(boxes: list[dict], iou_threshold: float = 0.45) -> list[dict]:
39
+ if not boxes:
40
+ return []
41
+ sorted_boxes = sorted(boxes, key=lambda b: b["score"], reverse=True)
42
+ kept = []
43
+ suppressed = set()
44
+ for i, box in enumerate(sorted_boxes):
45
+ if i in suppressed:
46
+ continue
47
+ kept.append(box)
48
+ for j in range(i + 1, len(sorted_boxes)):
49
+ if j in suppressed:
50
+ continue
51
+ if _iou(box["_xyxy"], sorted_boxes[j]["_xyxy"]) > iou_threshold:
52
+ suppressed.add(j)
53
+ return kept
54
+
55
+
56
def _match_outputs_to_strides(
    raw_outputs: dict[str, np.ndarray],
    strides: list[int],
    input_size: int,
) -> dict[int, dict]:
    """Match model output tensors to strides.

    Handles two naming conventions:
    1. Named keys: 'score_8', 'bbox_8', 'kps_8' (ONNX/OpenVINO)
    2. Generic keys: 'var_732' etc. — matched by shape (CoreML)

    For CoreML, each stride has 3 tensors with num_anchors matching the stride:
    stride 8 → ceil(input/8)^2 * 2 anchors → shapes (1, N, 1), (1, N, 4), (1, N, 10)
    stride 16 → ceil(input/16)^2 * 2
    stride 32 → ceil(input/32)^2 * 2

    Returns:
        Mapping of stride → {"scores": 1-D array, "bboxes": 1-D array,
        "kps": 1-D array or None}. Strides with no matching tensors are
        simply absent from the result.
    """
    result: dict[int, dict] = {}

    # Try named keys first
    flat = {k: v.flatten() for k, v in raw_outputs.items()}
    for stride in strides:
        score_key = bbox_key = kps_key = None
        for k in flat:
            # Substring match covers both 'score_8' and e.g. 'head_8_score'.
            if f"score_{stride}" in k or f"_{stride}_score" in k:
                score_key = k
            elif f"bbox_{stride}" in k or f"_{stride}_bbox" in k:
                bbox_key = k
            elif f"kps_{stride}" in k or f"_{stride}_kps" in k:
                kps_key = k
        if score_key and bbox_key:
            result[stride] = {
                "scores": flat[score_key],
                "bboxes": flat[bbox_key],
                # kps_key may still be None; .get(None) yields None (no landmarks).
                "kps": flat.get(kps_key),
            }

    # Only accept the named-key pass when EVERY stride matched; a partial
    # match is discarded and rebuilt from scratch by the shape fallback.
    if len(result) == len(strides):
        return result

    # Fallback: match by shape (CoreML generic names)
    # Group outputs by last dimension: 1=scores, 4=bboxes, 10=landmarks
    scores_by_n: dict[int, np.ndarray] = {}
    bboxes_by_n: dict[int, np.ndarray] = {}
    kps_by_n: dict[int, np.ndarray] = {}

    for _k, arr in raw_outputs.items():
        if arr.ndim == 3:
            arr = arr[0]  # remove batch dim → (N, C)
        elif arr.ndim == 1:
            # 1-D tensors carry no channel axis to classify by; skip them.
            continue
        n, c = arr.shape
        if c == 1:
            scores_by_n[n] = arr.flatten()
        elif c == 4:
            bboxes_by_n[n] = arr.flatten()
        elif c == 10:
            kps_by_n[n] = arr.flatten()

    result = {}
    for stride in strides:
        # A stride matches the tensor group whose anchor count equals the
        # expected flattened feature-map size for that stride.
        feat = int(np.ceil(input_size / stride))
        expected_n = feat * feat * NUM_ANCHORS_PER_STRIDE
        if expected_n in scores_by_n and expected_n in bboxes_by_n:
            result[stride] = {
                "scores": scores_by_n[expected_n],
                "bboxes": bboxes_by_n[expected_n],
                "kps": kps_by_n.get(expected_n),
            }

    return result
126
+
127
+
128
def postprocess_scrfd(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Decode SCRFD outputs into face detections in original-image coords.

    For each stride, per-anchor side distances are decoded into boxes, scores
    are thresholded against config["confidence"], coordinates are mapped back
    through the letterbox transform (scale + pad) and clamped to the original
    image, and greedy NMS removes duplicates.

    Returns:
        {"kind": "detections", "detections": [...]} where each entry carries
        "class" ("face"), "score", "bbox" [x1, y1, x2, y2] and, when the
        model emits keypoints, "landmarks" (five {x, y} points).
    """
    threshold = float(config.get("confidence", 0.5) or 0.5)
    input_size = config.get("inputSize", 640)
    strides = config.get("strides", STRIDES)

    # Letterbox transform parameters come from inference_pool:
    #   scale = input_size / max(orig_w, orig_h)
    #   pad   = centered offsets of the resized image inside the square input.

    # Keep full dimensionality — shape-based stride matching needs it.
    tensors = {key: np.array(value) for key, value in predictions.items()}
    per_stride = _match_outputs_to_strides(tensors, strides, input_size)

    proposals = []
    for stride in strides:
        entry = per_stride.get(stride)
        if entry is None:
            continue

        scores = entry["scores"]
        bboxes = entry["bboxes"]
        kps = entry.get("kps")
        anchors = _generate_anchors(stride, input_size)

        for i, (cx, cy) in enumerate(anchors):
            if i >= len(scores):
                break
            score = float(scores[i])
            if score < threshold:
                continue

            # Box sides are distances from the anchor center in stride units.
            base = i * 4
            x1 = cx - float(bboxes[base]) * stride
            y1 = cy - float(bboxes[base + 1]) * stride
            x2 = cx + float(bboxes[base + 2]) * stride
            y2 = cy + float(bboxes[base + 3]) * stride

            # Undo the letterbox: remove padding, unscale, clamp to the image.
            ox1 = max(0, min(orig_w, (x1 - pad[0]) / scale))
            oy1 = max(0, min(orig_h, (y1 - pad[1]) / scale))
            ox2 = max(0, min(orig_w, (x2 - pad[0]) / scale))
            oy2 = max(0, min(orig_h, (y2 - pad[1]) / scale))

            # Drop boxes that collapsed to under a pixel after clamping.
            if (ox2 - ox1) < 1 or (oy2 - oy1) < 1:
                continue

            detection = {
                "class": "face",
                "score": round(score, 4),
                "bbox": [round(ox1, 1), round(oy1, 1), round(ox2, 1), round(oy2, 1)],
                # Unrounded coords kept for NMS; stripped before returning.
                "_xyxy": (ox1, oy1, ox2, oy2),
            }

            # Five facial landmarks (x, y interleaved), when present.
            kbase = i * 10
            if kps is not None and (kbase + 9) < len(kps):
                detection["landmarks"] = [
                    {
                        "x": round((cx + float(kps[kbase + 2 * p]) * stride - pad[0]) / scale, 1),
                        "y": round((cy + float(kps[kbase + 2 * p + 1]) * stride - pad[1]) / scale, 1),
                    }
                    for p in range(5)
                ]

            proposals.append(detection)

    final = _nms(proposals)
    for det in final:
        del det["_xyxy"]

    return {"kind": "detections", "detections": final}
@@ -0,0 +1,43 @@
1
+ """Softmax classifier postprocessor.
2
+
3
+ Used for: animals-10, bird-nabirds-404, vehicle-type-efficientnet.
4
+
5
+ Input: raw logits tensor
6
+ Output: {"kind": "classifications", "classifications": [{"class", "score"}]}
7
+ """
8
+ import numpy as np
9
+
10
+
11
def postprocess_softmax(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Softmax the logits and return up to the top-5 labeled classes.

    The first output tensor is flattened and softmaxed; classes come out in
    descending probability and the list stops at the first entry below 0.01.
    Final normalization (top-1 + alternates) happens downstream in TypeScript.
    Indices beyond config["labels"] fall back to the stringified index.
    """
    labels = config.get("labels", [])

    logits = np.array(list(predictions.values())[0]).flatten().astype(np.float32)

    # Numerically-stable softmax.
    exps = np.exp(logits - logits.max())
    probs = exps / exps.sum()

    ranked = np.argsort(probs)[::-1][: min(5, len(probs))]

    classifications = []
    for class_idx in ranked:
        class_idx = int(class_idx)
        p = float(probs[class_idx])
        if p < 0.01:
            break
        name = labels[class_idx] if class_idx < len(labels) else str(class_idx)
        classifications.append({"class": name, "score": round(p, 4)})

    return {"kind": "classifications", "classifications": classifications}
@@ -0,0 +1,41 @@
1
+ """YAMNet audio classification postprocessor.
2
+
3
+ Input: [numFrames, 521] score matrix
4
+ Output: {"kind": "classifications", "classifications": [{"class", "score"}]}
5
+ """
6
+ import numpy as np
7
+
8
+
9
+ def postprocess_yamnet(
10
+ predictions: dict,
11
+ config: dict,
12
+ orig_w: int,
13
+ orig_h: int,
14
+ scale: float,
15
+ pad: tuple[int, int],
16
+ ) -> dict:
17
+ """Average scores across frames, return top classes above threshold."""
18
+ labels = config.get("labels", [])
19
+ min_score = config.get("confidence", 0.1)
20
+
21
+ # Get output_0 (scores) — shape [numFrames, 521]
22
+ raw = np.array(list(predictions.values())[0]).astype(np.float32)
23
+ if raw.ndim == 1:
24
+ raw = raw.reshape(1, -1)
25
+
26
+ num_frames, num_classes = raw.shape
27
+
28
+ # Average across frames
29
+ avg_scores = np.mean(raw, axis=0)
30
+
31
+ # Collect classes above threshold, sorted by score
32
+ results = []
33
+ for c in range(num_classes):
34
+ score = float(avg_scores[c])
35
+ if score >= min_score:
36
+ label = labels[c] if c < len(labels) else str(c)
37
+ results.append({"class": label, "score": round(score, 4)})
38
+
39
+ results.sort(key=lambda x: x["score"], reverse=True)
40
+
41
+ return {"kind": "classifications", "classifications": results}