@camstack/addon-detection-pipeline 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +638 -0
- package/dist/index.d.ts +638 -0
- package/dist/index.js +5826 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +5801 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +84 -0
- package/python/inference_pool.py +1088 -0
- package/python/postprocessors/__init__.py +24 -0
- package/python/postprocessors/arcface.py +31 -0
- package/python/postprocessors/ctc.py +68 -0
- package/python/postprocessors/saliency.py +44 -0
- package/python/postprocessors/scrfd.py +212 -0
- package/python/postprocessors/softmax.py +43 -0
- package/python/postprocessors/yamnet.py +41 -0
- package/python/postprocessors/yolo.py +278 -0
- package/python/postprocessors/yolo_seg.py +247 -0
- package/python/requirements-coreml.txt +4 -0
- package/python/requirements-onnxruntime.txt +3 -0
- package/python/requirements-openvino.txt +3 -0
- package/python/requirements.txt +9 -0
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""YOLO v8/v9/v11 postprocessor.
|
|
2
|
+
|
|
3
|
+
Input: raw predictions dict from CoreML/OpenVINO/ONNX
|
|
4
|
+
Output: {"kind": "detections", "detections": [{"class", "score", "bbox": [x1,y1,x2,y2]}]}
|
|
5
|
+
|
|
6
|
+
Handles both standard and built-in NMS outputs.
|
|
7
|
+
"""
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
COCO_80 = [
|
|
11
|
+
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
|
|
12
|
+
"boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
|
|
13
|
+
"bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
|
|
14
|
+
"giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
|
|
15
|
+
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
|
|
16
|
+
"skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
|
|
17
|
+
"fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
|
|
18
|
+
"broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
|
|
19
|
+
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
|
|
20
|
+
"remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
|
|
21
|
+
"refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
|
|
22
|
+
"toothbrush",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _iou(a: tuple, b: tuple) -> float:
|
|
27
|
+
"""IoU between two boxes (x1, y1, x2, y2)."""
|
|
28
|
+
ax1, ay1, ax2, ay2 = a
|
|
29
|
+
bx1, by1, bx2, by2 = b
|
|
30
|
+
ix1 = max(ax1, bx1)
|
|
31
|
+
iy1 = max(ay1, by1)
|
|
32
|
+
ix2 = min(ax2, bx2)
|
|
33
|
+
iy2 = min(ay2, by2)
|
|
34
|
+
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
|
35
|
+
if inter == 0:
|
|
36
|
+
return 0.0
|
|
37
|
+
area_a = (ax2 - ax1) * (ay2 - ay1)
|
|
38
|
+
area_b = (bx2 - bx1) * (by2 - by1)
|
|
39
|
+
union = area_a + area_b - inter
|
|
40
|
+
return inter / union if union > 0 else 0.0
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _nms(boxes: list[dict], iou_threshold: float = 0.45) -> list[dict]:
|
|
44
|
+
"""Non-maximum suppression. Input: list of {bbox, score, class}."""
|
|
45
|
+
if not boxes:
|
|
46
|
+
return []
|
|
47
|
+
sorted_boxes = sorted(boxes, key=lambda b: b["score"], reverse=True)
|
|
48
|
+
kept = []
|
|
49
|
+
suppressed = set()
|
|
50
|
+
for i, box in enumerate(sorted_boxes):
|
|
51
|
+
if i in suppressed:
|
|
52
|
+
continue
|
|
53
|
+
kept.append(box)
|
|
54
|
+
for j in range(i + 1, len(sorted_boxes)):
|
|
55
|
+
if j in suppressed:
|
|
56
|
+
continue
|
|
57
|
+
if _iou(box["_xyxy"], sorted_boxes[j]["_xyxy"]) > iou_threshold:
|
|
58
|
+
suppressed.add(j)
|
|
59
|
+
return kept
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _postprocess_coreml_nms(
|
|
63
|
+
predictions: dict,
|
|
64
|
+
config: dict,
|
|
65
|
+
orig_w: int,
|
|
66
|
+
orig_h: int,
|
|
67
|
+
input_size: int,
|
|
68
|
+
) -> dict:
|
|
69
|
+
"""Handle CoreML models with built-in NMS.
|
|
70
|
+
|
|
71
|
+
CoreML NMS output has two tensors:
|
|
72
|
+
- 'confidence': (N, numClasses) — class scores post-NMS
|
|
73
|
+
- 'coordinates': (N, 4) — [cx, cy, w, h] normalized to input image size
|
|
74
|
+
"""
|
|
75
|
+
conf_threshold = float(config.get("confidence", 0))
|
|
76
|
+
labels = config.get("labels", COCO_80)
|
|
77
|
+
|
|
78
|
+
confidence = np.array(predictions["confidence"])
|
|
79
|
+
coordinates = np.array(predictions["coordinates"])
|
|
80
|
+
|
|
81
|
+
if confidence.ndim == 1:
|
|
82
|
+
confidence = confidence.reshape(1, -1)
|
|
83
|
+
if coordinates.ndim == 1:
|
|
84
|
+
coordinates = coordinates.reshape(1, -1)
|
|
85
|
+
|
|
86
|
+
detections = []
|
|
87
|
+
for i in range(confidence.shape[0]):
|
|
88
|
+
best_class = int(np.argmax(confidence[i]))
|
|
89
|
+
best_score = float(confidence[i, best_class])
|
|
90
|
+
|
|
91
|
+
if best_score < conf_threshold:
|
|
92
|
+
continue
|
|
93
|
+
|
|
94
|
+
# CoreML coordinates: [cx, cy, w, h] normalized to input_size
|
|
95
|
+
cx, cy, bw, bh = coordinates[i]
|
|
96
|
+
# Scale to input pixel coords
|
|
97
|
+
cx_px = cx * input_size
|
|
98
|
+
cy_px = cy * input_size
|
|
99
|
+
bw_px = bw * input_size
|
|
100
|
+
bh_px = bh * input_size
|
|
101
|
+
|
|
102
|
+
x1 = cx_px - bw_px / 2
|
|
103
|
+
y1 = cy_px - bh_px / 2
|
|
104
|
+
x2 = cx_px + bw_px / 2
|
|
105
|
+
y2 = cy_px + bh_px / 2
|
|
106
|
+
|
|
107
|
+
# Map back to original image coords (undo letterbox)
|
|
108
|
+
# CoreML imageType with letterbox: the model sees the padded image
|
|
109
|
+
# so we need to figure out the letterbox transform
|
|
110
|
+
scale = input_size / max(orig_w, orig_h)
|
|
111
|
+
new_w = int(orig_w * scale)
|
|
112
|
+
new_h = int(orig_h * scale)
|
|
113
|
+
pad_x = (input_size - new_w) / 2
|
|
114
|
+
pad_y = (input_size - new_h) / 2
|
|
115
|
+
|
|
116
|
+
ox1 = max(0, min(orig_w, (x1 - pad_x) / scale))
|
|
117
|
+
oy1 = max(0, min(orig_h, (y1 - pad_y) / scale))
|
|
118
|
+
ox2 = max(0, min(orig_w, (x2 - pad_x) / scale))
|
|
119
|
+
oy2 = max(0, min(orig_h, (y2 - pad_y) / scale))
|
|
120
|
+
|
|
121
|
+
label = labels[best_class] if best_class < len(labels) else str(best_class)
|
|
122
|
+
detections.append({
|
|
123
|
+
"class": label,
|
|
124
|
+
"score": round(best_score, 4),
|
|
125
|
+
"bbox": [round(ox1, 1), round(oy1, 1), round(ox2, 1), round(oy2, 1)],
|
|
126
|
+
})
|
|
127
|
+
|
|
128
|
+
return {"kind": "detections", "detections": detections}
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _postprocess_yolo11_nms(
|
|
132
|
+
output: np.ndarray,
|
|
133
|
+
config: dict,
|
|
134
|
+
orig_w: int,
|
|
135
|
+
orig_h: int,
|
|
136
|
+
scale: float,
|
|
137
|
+
pad: tuple[int, int],
|
|
138
|
+
) -> dict:
|
|
139
|
+
"""Handle YOLO11/v26 post-NMS output: [N, 6] — (x1, y1, x2, y2, score, class_id).
|
|
140
|
+
|
|
141
|
+
Coordinates are in input pixel space (0..inputSize). Already post-NMS.
|
|
142
|
+
"""
|
|
143
|
+
conf_threshold = float(config.get("confidence", 0))
|
|
144
|
+
labels = config.get("labels", COCO_80)
|
|
145
|
+
|
|
146
|
+
detections = []
|
|
147
|
+
for i in range(output.shape[0]):
|
|
148
|
+
x1, y1, x2, y2, score, class_id = output[i]
|
|
149
|
+
score = float(score)
|
|
150
|
+
if score < conf_threshold:
|
|
151
|
+
continue
|
|
152
|
+
class_id = int(class_id)
|
|
153
|
+
if class_id < 0:
|
|
154
|
+
continue
|
|
155
|
+
|
|
156
|
+
# Transform from letterbox pixel coords to original image
|
|
157
|
+
ox1 = max(0, min(orig_w, (float(x1) - pad[0]) / scale))
|
|
158
|
+
oy1 = max(0, min(orig_h, (float(y1) - pad[1]) / scale))
|
|
159
|
+
ox2 = max(0, min(orig_w, (float(x2) - pad[0]) / scale))
|
|
160
|
+
oy2 = max(0, min(orig_h, (float(y2) - pad[1]) / scale))
|
|
161
|
+
|
|
162
|
+
label = labels[class_id] if class_id < len(labels) else str(class_id)
|
|
163
|
+
detections.append({
|
|
164
|
+
"class": label,
|
|
165
|
+
"score": round(score, 4),
|
|
166
|
+
"bbox": [round(ox1, 1), round(oy1, 1), round(ox2, 1), round(oy2, 1)],
|
|
167
|
+
})
|
|
168
|
+
|
|
169
|
+
return {"kind": "detections", "detections": detections}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def postprocess_yolo(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Postprocess YOLO output to structured detections.

    Handles two formats:
    1. Raw tensor (ONNX/OpenVINO): [4+numClasses, numBoxes] — needs NMS
    2. CoreML built-in NMS: 'confidence' + 'coordinates' tensors — already post-NMS

    Args:
        predictions: output-name -> tensor dict from the inference backend.
        config: reads 'confidence', 'labels', 'numClasses', 'inputSize'.
        orig_w: original image width in pixels.
        orig_h: original image height in pixels.
        scale: letterbox scale factor applied during preprocessing.
        pad: (pad_x, pad_y) letterbox padding in input pixels.

    Returns:
        {"kind": "detections", "detections": [...]} in original image coords.
    """
    conf_threshold = float(config.get("confidence", 0))
    labels = config.get("labels", COCO_80)
    num_classes = config.get("numClasses", len(labels))

    # Dispatch 1: CoreML NMS output format (both named tensors present).
    if "confidence" in predictions and "coordinates" in predictions:
        input_size = config.get("inputSize", 640)
        return _postprocess_coreml_nms(predictions, config, orig_w, orig_h, input_size)

    # Dispatch 2: YOLO11/v26 post-NMS format: [batch, N, 6] — (x1, y1, x2, y2, score, class_id).
    # NOTE(review): only the FIRST output tensor is inspected here; assumes
    # single-output post-NMS exports — confirm for multi-output models.
    first_arr = np.array(list(predictions.values())[0])
    if first_arr.ndim == 3:
        first_arr = first_arr[0]  # remove batch dim
    if first_arr.ndim == 2 and first_arr.shape[1] == 6:
        return _postprocess_yolo11_nms(first_arr, config, orig_w, orig_h, scale, pad)

    # Raw tensor path (ONNX / OpenVINO / CoreML without NMS).
    # Pick the tensor whose shape contains a plausible channel count:
    # 4+num_classes, or the common fixed values 84 (v8), 116/144 (seg/pose heads).
    output = None
    for key, val in predictions.items():
        arr = np.array(val)
        if arr.ndim >= 2 and any(d in (4 + num_classes, 84, 116, 144) for d in arr.shape):
            output = arr
            break
    if output is None:
        # Fallback: no shape matched — take the first output and hope for the best.
        output = np.array(list(predictions.values())[0])

    # Handle various shapes.
    if output.ndim == 3:
        output = output[0]  # remove batch dim

    # YOLO output: [4+numClasses, numBoxes] or [numBoxes, 4+numClasses].
    # Order of checks matters: exact channel match wins over the heuristic below.
    if output.shape[0] == 4 + num_classes:
        pass  # already [C, N]
    elif output.shape[1] == 4 + num_classes:
        output = output.T  # transpose to [C, N]
    else:
        # Try to infer: YOLO has far more boxes than channels, so the
        # smaller dimension is assumed to be the channel axis.
        if output.shape[0] < output.shape[1]:
            pass  # likely [C, N]
        else:
            output = output.T

    num_boxes = output.shape[1]

    # Detect if scores are raw logits (YOLO11/v26) vs sigmoid-activated (YOLOv8/v9).
    # Sigmoid-activated scores are in [0, 1]. If any class score exceeds 1.0,
    # the model outputs raw logits and we need to apply sigmoid.
    # NOTE(review): only the first 100 boxes are sampled; logits that happen
    # to fall in [0, 1] for those boxes would skip the sigmoid — heuristic.
    sample_scores = output[4:4 + num_classes, :min(100, num_boxes)]
    needs_sigmoid = float(np.max(sample_scores)) > 1.0 or float(np.min(sample_scores)) < 0.0
    if needs_sigmoid:
        # Apply sigmoid to all class scores in-place (rows 4..4+numClasses).
        # The clip to [-50, 50] prevents exp overflow; output is a private
        # copy made by np.array() above, so mutating it is safe.
        output[4:4 + num_classes, :] = 1.0 / (1.0 + np.exp(-np.clip(output[4:4 + num_classes, :], -50, 50)))

    # Vectorised candidate extraction — replaces the per-box Python loop
    # that took ~20ms on 8400 boxes. Pure numpy: ~0.5ms.
    class_scores = output[4:4 + num_classes, :]  # (C, N)
    best_classes = np.argmax(class_scores, axis=0)  # (N,)
    best_scores = class_scores[best_classes, np.arange(num_boxes)]  # (N,)
    mask = best_scores >= conf_threshold
    if not np.any(mask):
        return {"kind": "detections", "detections": []}

    # Gather only above-threshold boxes.
    idxs = np.nonzero(mask)[0]
    cx = output[0, idxs]
    cy = output[1, idxs]
    bw = output[2, idxs]
    bh = output[3, idxs]
    sc = best_scores[idxs]
    cl = best_classes[idxs]

    # Centre → corner, letterbox → original coords, clamped to image bounds.
    x1 = np.clip((cx - bw / 2 - pad[0]) / scale, 0, orig_w)
    y1 = np.clip((cy - bh / 2 - pad[1]) / scale, 0, orig_h)
    x2 = np.clip((cx + bw / 2 - pad[0]) / scale, 0, orig_w)
    y2 = np.clip((cy + bh / 2 - pad[1]) / scale, 0, orig_h)

    # Build candidate dicts; _xyxy is a private key consumed by _nms below.
    candidates = []
    for k in range(len(idxs)):
        ci = int(cl[k])
        candidates.append({
            "class": labels[ci] if ci < len(labels) else str(ci),
            "score": round(float(sc[k]), 4),
            "bbox": [round(float(x1[k]), 1), round(float(y1[k]), 1), round(float(x2[k]), 1), round(float(y2[k]), 1)],
            "_xyxy": (float(x1[k]), float(y1[k]), float(x2[k]), float(y2[k])),
        })

    kept = _nms(candidates)
    # Remove internal _xyxy field before returning to callers.
    for d in kept:
        del d["_xyxy"]

    return {"kind": "detections", "detections": kept}
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""YOLO-seg postprocessor (YOLO26-seg instance segmentation).
|
|
2
|
+
|
|
3
|
+
Input: raw predictions dict with two outputs:
|
|
4
|
+
- output0: [1, 300, 38] — 300 NMS-filtered detections, each [x1,y1,x2,y2,conf,class_id,coeff_0..coeff_31]
|
|
5
|
+
- output1: [1, 32, 160, 160] — 32 prototype masks at 160x160
|
|
6
|
+
|
|
7
|
+
For raw (no NMS) models:
|
|
8
|
+
- output0: [1, 4+numClasses+32, numBoxes] — standard YOLO format with mask coefficients appended
|
|
9
|
+
|
|
10
|
+
Output: {"kind": "detections", "detections": [{..., "mask": "<b64>", "maskWidth": N, "maskHeight": N}]}
|
|
11
|
+
"""
|
|
12
|
+
import base64
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
COCO_80 = [
|
|
18
|
+
"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck",
|
|
19
|
+
"boat", "traffic light", "fire hydrant", "stop sign", "parking meter", "bench",
|
|
20
|
+
"bird", "cat", "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra",
|
|
21
|
+
"giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
|
|
22
|
+
"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove",
|
|
23
|
+
"skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
|
|
24
|
+
"fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange",
|
|
25
|
+
"broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
|
|
26
|
+
"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
|
|
27
|
+
"remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
|
|
28
|
+
"refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier",
|
|
29
|
+
"toothbrush",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
NUM_MASK_COEFFS = 32
|
|
33
|
+
PROTO_SIZE = 160
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _iou(a: tuple, b: tuple) -> float:
|
|
37
|
+
"""IoU between two boxes (x1, y1, x2, y2)."""
|
|
38
|
+
ax1, ay1, ax2, ay2 = a
|
|
39
|
+
bx1, by1, bx2, by2 = b
|
|
40
|
+
ix1 = max(ax1, bx1)
|
|
41
|
+
iy1 = max(ay1, by1)
|
|
42
|
+
ix2 = min(ax2, bx2)
|
|
43
|
+
iy2 = min(ay2, by2)
|
|
44
|
+
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
|
45
|
+
if inter == 0:
|
|
46
|
+
return 0.0
|
|
47
|
+
area_a = (ax2 - ax1) * (ay2 - ay1)
|
|
48
|
+
area_b = (bx2 - bx1) * (by2 - by1)
|
|
49
|
+
union = area_a + area_b - inter
|
|
50
|
+
return inter / union if union > 0 else 0.0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _nms(boxes: list[dict], iou_threshold: float = 0.45) -> list[dict]:
|
|
54
|
+
"""Non-maximum suppression. Input: list of dicts with _xyxy, score, class."""
|
|
55
|
+
if not boxes:
|
|
56
|
+
return []
|
|
57
|
+
sorted_boxes = sorted(boxes, key=lambda b: b["score"], reverse=True)
|
|
58
|
+
kept = []
|
|
59
|
+
suppressed = set()
|
|
60
|
+
for i, box in enumerate(sorted_boxes):
|
|
61
|
+
if i in suppressed:
|
|
62
|
+
continue
|
|
63
|
+
kept.append(box)
|
|
64
|
+
for j in range(i + 1, len(sorted_boxes)):
|
|
65
|
+
if j in suppressed:
|
|
66
|
+
continue
|
|
67
|
+
if _iou(box["_xyxy"], sorted_boxes[j]["_xyxy"]) > iou_threshold:
|
|
68
|
+
suppressed.add(j)
|
|
69
|
+
return kept
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _sigmoid(x: np.ndarray) -> np.ndarray:
|
|
73
|
+
"""Numerically stable sigmoid."""
|
|
74
|
+
return np.where(x >= 0, 1 / (1 + np.exp(-x)), np.exp(x) / (1 + np.exp(x)))
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _find_tensors(predictions: dict, num_classes: int) -> tuple[np.ndarray, np.ndarray]:
    """Locate the detection tensor and prototype tensor from model outputs.

    Returns (detections, prototypes) as numpy arrays.
    detections: [num_dets, channels] row-major
    prototypes: [32, 160, 160]

    Args:
        predictions: output-name -> tensor dict from the inference backend.
        num_classes: class count, used to recognise the raw-format channel width.

    Raises:
        ValueError: if either tensor cannot be identified among the outputs.
    """
    det_tensor = None
    proto_tensor = None

    for key, val in predictions.items():
        arr = np.array(val, dtype=np.float32)

        # Prototype tensor is identified purely by element count:
        # 32 * 160 * 160 = 819200. Any shape with that size is reshaped.
        if arr.size == NUM_MASK_COEFFS * PROTO_SIZE * PROTO_SIZE:
            proto_tensor = arr.reshape(NUM_MASK_COEFFS, PROTO_SIZE, PROTO_SIZE)
            continue

        # Skip tiny tensors (e.g. scalar counts emitted by some exporters).
        if arr.size < 10:
            continue

        # Detection tensor: the first remaining large tensor wins.
        # NOTE(review): if several large non-prototype outputs exist, dict
        # iteration order decides which one is used — confirm for such models.
        if det_tensor is None:
            det_tensor = arr

    if det_tensor is None:
        raise ValueError("YOLO-seg postprocessor: could not find detection tensor")
    if proto_tensor is None:
        raise ValueError("YOLO-seg postprocessor: could not find prototype tensor")

    # Normalize detection tensor to [num_dets, channels].
    while det_tensor.ndim > 2:
        det_tensor = det_tensor[0]  # strip batch dim

    # NMS format: [300, 38] — rows are detections
    # Raw format: [4+nc+32, N] — cols are detections, needs transpose
    nms_cols = 6 + NUM_MASK_COEFFS  # 38
    raw_cols = 4 + num_classes + NUM_MASK_COEFFS

    # Order of these checks matters: an exact column-count match (either
    # orientation, NMS first) beats the smaller-dimension heuristic below.
    if det_tensor.shape[1] == nms_cols:
        pass  # already [N, 38]
    elif det_tensor.shape[0] == nms_cols:
        det_tensor = det_tensor.T
    elif det_tensor.shape[1] == raw_cols:
        pass  # already [N, raw_cols]
    elif det_tensor.shape[0] == raw_cols:
        det_tensor = det_tensor.T
    elif det_tensor.shape[0] < det_tensor.shape[1]:
        det_tensor = det_tensor.T  # likely [C, N] -> [N, C]

    return det_tensor, proto_tensor
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _crop_mask_to_bbox(
|
|
132
|
+
mask_full: np.ndarray,
|
|
133
|
+
x1: float,
|
|
134
|
+
y1: float,
|
|
135
|
+
x2: float,
|
|
136
|
+
y2: float,
|
|
137
|
+
input_size: int,
|
|
138
|
+
mask_threshold: float = 0.5,
|
|
139
|
+
) -> tuple[np.ndarray, int, int]:
|
|
140
|
+
"""Crop sigmoid mask to bbox region, threshold to binary, return (binary, w, h)."""
|
|
141
|
+
proto_scale = PROTO_SIZE / input_size
|
|
142
|
+
px1 = max(0, int(np.floor(x1 * proto_scale)))
|
|
143
|
+
py1 = max(0, int(np.floor(y1 * proto_scale)))
|
|
144
|
+
px2 = min(PROTO_SIZE, int(np.ceil(x2 * proto_scale)))
|
|
145
|
+
py2 = min(PROTO_SIZE, int(np.ceil(y2 * proto_scale)))
|
|
146
|
+
|
|
147
|
+
crop_w = max(1, px2 - px1)
|
|
148
|
+
crop_h = max(1, py2 - py1)
|
|
149
|
+
|
|
150
|
+
cropped = mask_full[py1:py1 + crop_h, px1:px1 + crop_w]
|
|
151
|
+
binary = (cropped > mask_threshold).astype(np.uint8) * 255
|
|
152
|
+
return binary, crop_w, crop_h
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def postprocess_yolo_seg(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Postprocess YOLO-seg output to detections with per-instance masks.

    Handles:
    1. NMS-enabled export: [N, 6+32] with [x1,y1,x2,y2,conf,class_id,...coeffs]
    2. Raw export: [4+numClasses+32, N] standard YOLO transposed format

    Args:
        predictions: output-name -> tensor dict from the inference backend.
        config: reads 'confidence', 'labels', 'numClasses', 'inputSize',
            'maskThreshold'.
        orig_w: original image width in pixels.
        orig_h: original image height in pixels.
        scale: letterbox scale factor applied during preprocessing.
        pad: (pad_x, pad_y) letterbox padding in input pixels.

    Returns:
        {"kind": "detections", "detections": [...]} where each detection
        carries a base64-encoded binary mask plus its width/height.
    """
    conf_threshold = float(config.get("confidence", 0))
    labels = config.get("labels", COCO_80)
    num_classes = config.get("numClasses", len(labels))
    input_size = config.get("inputSize", 640)
    mask_threshold = config.get("maskThreshold", 0.5)

    det_tensor, proto_tensor = _find_tensors(predictions, num_classes)

    num_dets = det_tensor.shape[0]
    num_cols = det_tensor.shape[1]
    # 38 columns == 6 bbox/score/class fields + 32 mask coefficients.
    is_nms_format = num_cols == 6 + NUM_MASK_COEFFS

    candidates = []

    for i in range(num_dets):
        row = det_tensor[i]

        if is_nms_format:
            # [x1, y1, x2, y2, confidence, class_id, coeff_0..coeff_31]
            x1, y1, x2, y2 = float(row[0]), float(row[1]), float(row[2]), float(row[3])
            best_score = float(row[4])
            best_class = int(row[5])
            coeffs = row[6:6 + NUM_MASK_COEFFS]

            # Filter BEFORE the expensive mask computation below.
            if best_score < conf_threshold:
                continue
            if best_class < 0:
                continue  # padding row
        else:
            # [cx, cy, w, h, class_scores..., coeff_0..coeff_31]
            cx, cy, w, h = float(row[0]), float(row[1]), float(row[2]), float(row[3])
            class_scores = row[4:4 + num_classes]
            best_class = int(np.argmax(class_scores))
            best_score = float(class_scores[best_class])
            coeffs = row[4 + num_classes:4 + num_classes + NUM_MASK_COEFFS]

            if best_score < conf_threshold:
                continue

            # Center format -> corner format (still in input pixel coords).
            x1 = cx - w / 2
            y1 = cy - h / 2
            x2 = cx + w / 2
            y2 = cy + h / 2

        # Compute instance mask: sigmoid(coefficients @ prototypes)
        # coeffs: [32], proto_tensor: [32, 160, 160]
        mask_raw = _sigmoid(coeffs @ proto_tensor.reshape(NUM_MASK_COEFFS, -1)).reshape(
            PROTO_SIZE, PROTO_SIZE
        )

        # Crop mask to bbox, threshold to binary.
        binary, crop_w, crop_h = _crop_mask_to_bbox(
            mask_raw, x1, y1, x2, y2, input_size, mask_threshold
        )
        # Raw row-major 0/255 bytes, base64-encoded for JSON transport.
        mask_b64 = base64.b64encode(binary.tobytes()).decode("ascii")

        # Transform bbox from model coords to original image coords, clamped.
        ox1 = max(0, min(orig_w, (x1 - pad[0]) / scale))
        oy1 = max(0, min(orig_h, (y1 - pad[1]) / scale))
        ox2 = max(0, min(orig_w, (x2 - pad[0]) / scale))
        oy2 = max(0, min(orig_h, (y2 - pad[1]) / scale))

        label = labels[best_class] if best_class < len(labels) else str(best_class)
        candidates.append({
            "class": label,
            "score": round(best_score, 4),
            "bbox": [round(ox1, 1), round(oy1, 1), round(ox2, 1), round(oy2, 1)],
            "mask": mask_b64,
            "maskWidth": crop_w,
            "maskHeight": crop_h,
            # _xyxy is a private key consumed by _nms and stripped below.
            "_xyxy": (ox1, oy1, ox2, oy2),
        })

    # NMS is applied even in the "NMS format" path: fixed-size exports can
    # still contain overlapping rows when the in-graph NMS is permissive.
    kept = _nms(candidates)
    # Remove internal _xyxy field.
    for d in kept:
        del d["_xyxy"]

    return {"kind": "detections", "detections": kept}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# Base requirements for inference_pool.py — installed by camstack
|
|
2
|
+
# at addon boot via ctx.deps.installPythonRequirements(...).
|
|
3
|
+
#
|
|
4
|
+
# Backend-specific deps (coremltools / onnxruntime / openvino) are NOT
|
|
5
|
+
# listed here — they are installed lazily from requirements-<backend>.txt
|
|
6
|
+
# the first time the user selects that runtime, to keep first-boot
|
|
7
|
+
# install footprint small.
|
|
8
|
+
numpy>=1.26,<3
|
|
9
|
+
Pillow>=10.0,<12
|