@camstack/addon-detection-pipeline 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.mts +638 -0
- package/dist/index.d.ts +638 -0
- package/dist/index.js +5826 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +5801 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +84 -0
- package/python/inference_pool.py +1088 -0
- package/python/postprocessors/__init__.py +24 -0
- package/python/postprocessors/arcface.py +31 -0
- package/python/postprocessors/ctc.py +68 -0
- package/python/postprocessors/saliency.py +44 -0
- package/python/postprocessors/scrfd.py +212 -0
- package/python/postprocessors/softmax.py +43 -0
- package/python/postprocessors/yamnet.py +41 -0
- package/python/postprocessors/yolo.py +278 -0
- package/python/postprocessors/yolo_seg.py +247 -0
- package/python/requirements-coreml.txt +4 -0
- package/python/requirements-onnxruntime.txt +3 -0
- package/python/requirements-openvino.txt +3 -0
- package/python/requirements.txt +9 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Postprocessor dispatch table for inference_pool.py.
|
|
2
|
+
|
|
3
|
+
Each postprocessor takes raw model output (dict of numpy arrays) and model config,
|
|
4
|
+
returns a structured dict matching the StepOutput JSON format.
|
|
5
|
+
"""
|
|
6
|
+
from .yolo import postprocess_yolo
|
|
7
|
+
from .yolo_seg import postprocess_yolo_seg
|
|
8
|
+
from .scrfd import postprocess_scrfd
|
|
9
|
+
from .arcface import postprocess_arcface
|
|
10
|
+
from .softmax import postprocess_softmax
|
|
11
|
+
from .ctc import postprocess_ctc
|
|
12
|
+
from .saliency import postprocess_saliency
|
|
13
|
+
from .yamnet import postprocess_yamnet
|
|
14
|
+
|
|
15
|
+
# Dispatch table: postprocessor name (as referenced by model config) -> callable.
# The postprocessors defined in this package's sibling modules share the
# signature (predictions, config, orig_w, orig_h, scale, pad) -> dict and
# return a StepOutput-shaped dict (see module docstring above).
POSTPROCESSORS = {
    "yolo": postprocess_yolo,
    "yolo-seg": postprocess_yolo_seg,
    "scrfd": postprocess_scrfd,
    "arcface": postprocess_arcface,
    "softmax": postprocess_softmax,
    "ctc": postprocess_ctc,
    "saliency": postprocess_saliency,
    "yamnet": postprocess_yamnet,
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""ArcFace face embedding postprocessor.
|
|
2
|
+
|
|
3
|
+
Input: raw embedding tensor from ArcFace model
|
|
4
|
+
Output: {"kind": "embedding", "embedding": [...], "embeddingNorm": [...]}
|
|
5
|
+
"""
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def postprocess_arcface(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Return the raw face embedding alongside its L2-normalized form.

    The model's first output tensor is taken as the embedding (its key
    varies by backend, e.g. "fc1") and flattened to 1-D float32. A
    zero-norm vector maps to an all-zero normalized embedding rather
    than dividing by zero. The image-geometry parameters (orig_w,
    orig_h, scale, pad) are unused here but kept for the shared
    postprocessor signature.
    """
    first_output = next(iter(predictions.values()))
    vector = np.asarray(first_output, dtype=np.float32).reshape(-1)

    magnitude = np.linalg.norm(vector)
    unit = vector / magnitude if magnitude != 0 else np.zeros_like(vector)

    return {
        "kind": "embedding",
        "embedding": vector.tolist(),
        "embeddingNorm": unit.tolist(),
    }
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""CTC text recognition postprocessor.
|
|
2
|
+
|
|
3
|
+
Used for: vgg_english_g2 (plate OCR).
|
|
4
|
+
|
|
5
|
+
Input: [1, seqLen, numChars] logits
|
|
6
|
+
Output: {"kind": "text", "text": "ABC123", "confidence": 0.95}
|
|
7
|
+
|
|
8
|
+
Algorithm: argmax per timestep → collapse consecutive duplicates → remove blank (index 0) → join
|
|
9
|
+
"""
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def postprocess_ctc(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Greedy CTC decoding of recognition logits.

    Takes the first output tensor ([1, T, C] or [T, C] logits), applies
    a per-timestep softmax, picks the argmax per timestep, collapses
    consecutive duplicate indices, drops the blank symbol (index 0),
    and maps the surviving indices through config["charset"] (falling
    back to chr(idx) for out-of-range indices). Confidence is the mean
    probability of the kept symbols, 0.0 if nothing survived.
    """
    charset = config.get("charset", [])

    logits = np.array(list(predictions.values())[0]).astype(np.float32)
    if logits.ndim == 3:
        logits = logits[0]  # drop batch dim -> [T, C]
    timesteps = logits.shape[0]

    # Numerically-stable softmax over the class axis.
    shifted = logits - logits.max(axis=1, keepdims=True)
    expd = np.exp(shifted)
    probs = expd / expd.sum(axis=1, keepdims=True)

    winners = np.argmax(probs, axis=1)
    winner_probs = probs[np.arange(timesteps), winners]

    chars: list[str] = []
    char_probs: list[float] = []
    previous = None
    for raw_idx, prob in zip(winners, winner_probs):
        idx = int(raw_idx)
        starts_run = idx != previous
        previous = idx
        # Keep only the first timestep of each run, and never the blank.
        if not starts_run or idx == 0:
            continue
        chars.append(charset[idx] if idx < len(charset) else chr(idx))
        char_probs.append(float(prob))

    text = "".join(chars)
    confidence = float(np.mean(char_probs)) if char_probs else 0.0

    return {
        "kind": "text",
        "text": text,
        "confidence": round(confidence, 4),
    }
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Saliency / segmentation refiner postprocessor.
|
|
2
|
+
|
|
3
|
+
Used for: u2netp.
|
|
4
|
+
|
|
5
|
+
Input: raw saliency map tensor
|
|
6
|
+
Output: {"kind": "mask", "mask": "<base64>", "maskWidth": N, "maskHeight": N}
|
|
7
|
+
"""
|
|
8
|
+
import base64
|
|
9
|
+
import numpy as np
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def postprocess_saliency(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Turn the first saliency map into a base64-encoded binary mask.

    The first output tensor is squeezed down to [H, W], passed through
    a sigmoid, and thresholded at config["maskThreshold"] (default
    0.5): pixels above the threshold become 255, the rest 0. The raw
    uint8 mask bytes are returned base64-encoded with its dimensions.
    """
    threshold = config.get("maskThreshold", 0.5)

    # U2-Net emits several saliency maps; the first is the highest-resolution one.
    saliency_map = np.array(list(predictions.values())[0]).astype(np.float32)
    while saliency_map.ndim > 2:
        saliency_map = saliency_map[0]  # strip batch/channel dims

    probabilities = 1.0 / (1.0 + np.exp(-saliency_map))
    mask = np.where(probabilities > threshold, 255, 0).astype(np.uint8)

    height, width = mask.shape
    encoded = base64.b64encode(mask.tobytes()).decode("ascii")

    return {
        "kind": "mask",
        "mask": encoded,
        "maskWidth": int(width),
        "maskHeight": int(height),
    }
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""SCRFD face detection postprocessor.
|
|
2
|
+
|
|
3
|
+
Multi-stride anchor-based face detector (strides 8, 16, 32).
|
|
4
|
+
Ported from packages/addon-vision/src/shared/postprocess/scrfd.ts.
|
|
5
|
+
|
|
6
|
+
Output: {"kind": "detections", "detections": [{"class": "face", "score", "bbox": [x1,y1,x2,y2], "landmarks?"}]}
|
|
7
|
+
"""
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# Feature-map strides of the SCRFD detection heads (see module docstring).
STRIDES = [8, 16, 32]
# Number of anchors generated per feature-map cell at every stride.
NUM_ANCHORS_PER_STRIDE = 2
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _generate_anchors(stride: int, input_size: int) -> list[tuple[float, float]]:
    """Return anchor center points (x, y) for one detection stride.

    Centers are laid out row-major over a ceil(input_size / stride)
    square feature grid, each duplicated NUM_ANCHORS_PER_STRIDE times
    consecutively (one per anchor sharing the cell).
    """
    grid = int(np.ceil(input_size / stride))
    cell_centers = [
        ((col + 0.5) * stride, (row + 0.5) * stride)
        for row in range(grid)
        for col in range(grid)
    ]
    # Repeat each center once per anchor in that cell, preserving order.
    return [center for center in cell_centers for _ in range(NUM_ANCHORS_PER_STRIDE)]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _iou(a: tuple, b: tuple) -> float:
|
|
27
|
+
ax1, ay1, ax2, ay2 = a
|
|
28
|
+
bx1, by1, bx2, by2 = b
|
|
29
|
+
ix1, iy1 = max(ax1, bx1), max(ay1, by1)
|
|
30
|
+
ix2, iy2 = min(ax2, bx2), min(ay2, by2)
|
|
31
|
+
inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
|
|
32
|
+
if inter == 0:
|
|
33
|
+
return 0.0
|
|
34
|
+
union = (ax2 - ax1) * (ay2 - ay1) + (bx2 - bx1) * (by2 - by1) - inter
|
|
35
|
+
return inter / union if union > 0 else 0.0
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _nms(boxes: list[dict], iou_threshold: float = 0.45) -> list[dict]:
    """Greedy non-maximum suppression over score-sorted boxes.

    Each box dict must carry a "score" and an "_xyxy" coordinate tuple.
    The highest-scoring box of each overlapping cluster (IoU strictly
    above iou_threshold) is kept; input order of survivors follows
    descending score.
    """
    if not boxes:
        return []

    ordered = sorted(boxes, key=lambda b: b["score"], reverse=True)
    removed = [False] * len(ordered)
    survivors = []
    for i, candidate in enumerate(ordered):
        if removed[i]:
            continue
        survivors.append(candidate)
        # Suppress every lower-scored box that overlaps too much.
        for j in range(i + 1, len(ordered)):
            if removed[j]:
                continue
            if _iou(candidate["_xyxy"], ordered[j]["_xyxy"]) > iou_threshold:
                removed[j] = True
    return survivors
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _match_outputs_to_strides(
|
|
57
|
+
raw_outputs: dict[str, np.ndarray],
|
|
58
|
+
strides: list[int],
|
|
59
|
+
input_size: int,
|
|
60
|
+
) -> dict[int, dict]:
|
|
61
|
+
"""Match model output tensors to strides.
|
|
62
|
+
|
|
63
|
+
Handles two naming conventions:
|
|
64
|
+
1. Named keys: 'score_8', 'bbox_8', 'kps_8' (ONNX/OpenVINO)
|
|
65
|
+
2. Generic keys: 'var_732' etc. — matched by shape (CoreML)
|
|
66
|
+
|
|
67
|
+
For CoreML, each stride has 3 tensors with num_anchors matching the stride:
|
|
68
|
+
stride 8 → ceil(input/8)^2 * 2 anchors → shapes (1, N, 1), (1, N, 4), (1, N, 10)
|
|
69
|
+
stride 16 → ceil(input/16)^2 * 2
|
|
70
|
+
stride 32 → ceil(input/32)^2 * 2
|
|
71
|
+
"""
|
|
72
|
+
result: dict[int, dict] = {}
|
|
73
|
+
|
|
74
|
+
# Try named keys first
|
|
75
|
+
flat = {k: v.flatten() for k, v in raw_outputs.items()}
|
|
76
|
+
for stride in strides:
|
|
77
|
+
score_key = bbox_key = kps_key = None
|
|
78
|
+
for k in flat:
|
|
79
|
+
if f"score_{stride}" in k or f"_{stride}_score" in k:
|
|
80
|
+
score_key = k
|
|
81
|
+
elif f"bbox_{stride}" in k or f"_{stride}_bbox" in k:
|
|
82
|
+
bbox_key = k
|
|
83
|
+
elif f"kps_{stride}" in k or f"_{stride}_kps" in k:
|
|
84
|
+
kps_key = k
|
|
85
|
+
if score_key and bbox_key:
|
|
86
|
+
result[stride] = {
|
|
87
|
+
"scores": flat[score_key],
|
|
88
|
+
"bboxes": flat[bbox_key],
|
|
89
|
+
"kps": flat.get(kps_key),
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
if len(result) == len(strides):
|
|
93
|
+
return result
|
|
94
|
+
|
|
95
|
+
# Fallback: match by shape (CoreML generic names)
|
|
96
|
+
# Group outputs by last dimension: 1=scores, 4=bboxes, 10=landmarks
|
|
97
|
+
scores_by_n: dict[int, np.ndarray] = {}
|
|
98
|
+
bboxes_by_n: dict[int, np.ndarray] = {}
|
|
99
|
+
kps_by_n: dict[int, np.ndarray] = {}
|
|
100
|
+
|
|
101
|
+
for _k, arr in raw_outputs.items():
|
|
102
|
+
if arr.ndim == 3:
|
|
103
|
+
arr = arr[0] # remove batch dim → (N, C)
|
|
104
|
+
elif arr.ndim == 1:
|
|
105
|
+
continue
|
|
106
|
+
n, c = arr.shape
|
|
107
|
+
if c == 1:
|
|
108
|
+
scores_by_n[n] = arr.flatten()
|
|
109
|
+
elif c == 4:
|
|
110
|
+
bboxes_by_n[n] = arr.flatten()
|
|
111
|
+
elif c == 10:
|
|
112
|
+
kps_by_n[n] = arr.flatten()
|
|
113
|
+
|
|
114
|
+
result = {}
|
|
115
|
+
for stride in strides:
|
|
116
|
+
feat = int(np.ceil(input_size / stride))
|
|
117
|
+
expected_n = feat * feat * NUM_ANCHORS_PER_STRIDE
|
|
118
|
+
if expected_n in scores_by_n and expected_n in bboxes_by_n:
|
|
119
|
+
result[stride] = {
|
|
120
|
+
"scores": scores_by_n[expected_n],
|
|
121
|
+
"bboxes": bboxes_by_n[expected_n],
|
|
122
|
+
"kps": kps_by_n.get(expected_n),
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
return result
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def postprocess_scrfd(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Postprocess SCRFD output to structured face detections.

    Decodes anchor-relative box (and optional 5-point landmark) offsets
    for every configured stride, filters by confidence, maps letterbox
    coordinates back to the original image, and applies NMS.

    Args:
        predictions: raw output tensors keyed by output name.
        config: may supply "confidence", "inputSize", "strides".
        orig_w / orig_h: original image size for clamping boxes.
        scale / pad: letterbox transform passed from inference_pool.

    Returns:
        {"kind": "detections", "detections": [...]} where each detection
        has "class" ("face"), "score", "bbox" [x1, y1, x2, y2] and,
        when landmark output is present, "landmarks" [{x, y} x 5].
    """
    # NOTE(review): `or 0.5` coerces any falsy confidence (including an
    # explicit 0.0) back to the 0.5 default — confirm that is intended.
    conf_threshold = float(config.get("confidence", 0.5) or 0.5)
    input_size = config.get("inputSize", 640)
    strides = config.get("strides", STRIDES)

    # Letterbox transform: scale and pad are passed from inference_pool
    # scale = input_size / max(orig_w, orig_h)
    # pad = ((input_size - orig_w*scale)//2, (input_size - orig_h*scale)//2)

    # Convert all outputs to numpy arrays (keep ndim)
    raw_outputs = {k: np.array(v) for k, v in predictions.items()}

    # Match outputs to strides — by named key or by shape
    stride_data = _match_outputs_to_strides(raw_outputs, strides, input_size)

    candidates = []

    for stride in strides:
        data = stride_data.get(stride)
        if data is None:
            continue  # stride could not be matched to any output tensor

        scores = data["scores"]
        bboxes = data["bboxes"]
        kps = data.get("kps")
        anchors = _generate_anchors(stride, input_size)

        n = len(anchors)
        for i in range(n):
            # Guard against output tensors shorter than the anchor grid.
            if i >= len(scores):
                break
            score = float(scores[i])
            if score < conf_threshold:
                continue

            cx, cy = anchors[i]

            # Bbox: offsets from anchor center, scaled by stride
            x1 = cx - float(bboxes[i * 4]) * stride
            y1 = cy - float(bboxes[i * 4 + 1]) * stride
            x2 = cx + float(bboxes[i * 4 + 2]) * stride
            y2 = cy + float(bboxes[i * 4 + 3]) * stride

            # Transform from letterbox coords to original image
            # (undo padding, undo scaling, clamp to image bounds).
            ox1 = max(0, min(orig_w, (x1 - pad[0]) / scale))
            oy1 = max(0, min(orig_h, (y1 - pad[1]) / scale))
            ox2 = max(0, min(orig_w, (x2 - pad[0]) / scale))
            oy2 = max(0, min(orig_h, (y2 - pad[1]) / scale))

            # Skip degenerate boxes (under 1px in either dimension).
            if (ox2 - ox1) < 1 or (oy2 - oy1) < 1:
                continue

            det = {
                "class": "face",
                "score": round(score, 4),
                "bbox": [round(ox1, 1), round(oy1, 1), round(ox2, 1), round(oy2, 1)],
                # "_xyxy" holds unrounded coords for NMS; removed below.
                "_xyxy": (ox1, oy1, ox2, oy2),
            }

            # Landmarks (5 points × 2 coords); only decoded when the
            # landmark tensor is long enough for this anchor.
            if kps is not None and (i * 10 + 9) < len(kps):
                landmarks = []
                for p in range(5):
                    raw_lx = cx + float(kps[i * 10 + p * 2]) * stride
                    raw_ly = cy + float(kps[i * 10 + p * 2 + 1]) * stride
                    lx = (raw_lx - pad[0]) / scale
                    ly = (raw_ly - pad[1]) / scale
                    landmarks.append({"x": round(lx, 1), "y": round(ly, 1)})
                det["landmarks"] = landmarks

            candidates.append(det)

    kept = _nms(candidates)
    # Strip the NMS-only helper key before returning.
    for d in kept:
        del d["_xyxy"]

    return {"kind": "detections", "detections": kept}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Softmax classifier postprocessor.
|
|
2
|
+
|
|
3
|
+
Used for: animals-10, bird-nabirds-404, vehicle-type-efficientnet.
|
|
4
|
+
|
|
5
|
+
Input: raw logits tensor
|
|
6
|
+
Output: {"kind": "classifications", "classifications": [{"class", "score"}]}
|
|
7
|
+
"""
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def postprocess_softmax(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Softmax the first output tensor and return up to 5 top labels.

    Entries with probability below 0.01 are dropped; indices without a
    configured label fall back to the stringified index. Normalization
    into top-1 + alternates happens on the TypeScript side.
    """
    labels = config.get("labels", [])

    logits = np.array(list(predictions.values())[0]).flatten().astype(np.float32)

    # Numerically-stable softmax.
    exps = np.exp(logits - np.max(logits))
    probs = exps / np.sum(exps)

    ranked = np.argsort(probs)[::-1][: min(5, len(probs))]

    results = []
    for rank_idx in ranked:
        rank_idx = int(rank_idx)
        prob = float(probs[rank_idx])
        if prob < 0.01:
            break  # ranked descending, nothing further can qualify
        name = labels[rank_idx] if rank_idx < len(labels) else str(rank_idx)
        results.append({"class": name, "score": round(prob, 4)})

    return {"kind": "classifications", "classifications": results}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""YAMNet audio classification postprocessor.
|
|
2
|
+
|
|
3
|
+
Input: [numFrames, 521] score matrix
|
|
4
|
+
Output: {"kind": "classifications", "classifications": [{"class", "score"}]}
|
|
5
|
+
"""
|
|
6
|
+
import numpy as np
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def postprocess_yamnet(
    predictions: dict,
    config: dict,
    orig_w: int,
    orig_h: int,
    scale: float,
    pad: tuple[int, int],
) -> dict:
    """Average per-frame scores and keep classes above the threshold.

    The first output tensor is treated as a [numFrames, numClasses]
    score matrix (a 1-D tensor counts as a single frame). Classes whose
    frame-averaged score reaches config["confidence"] (default 0.1) are
    returned sorted by descending score; indices without a configured
    label fall back to the stringified index.
    """
    labels = config.get("labels", [])
    threshold = config.get("confidence", 0.1)

    frame_scores = np.array(list(predictions.values())[0]).astype(np.float32)
    if frame_scores.ndim == 1:
        frame_scores = frame_scores.reshape(1, -1)  # single frame

    averaged = np.mean(frame_scores, axis=0)

    picks = [
        {
            "class": labels[c] if c < len(labels) else str(c),
            "score": round(float(s), 4),
        }
        for c, s in enumerate(averaged)
        if float(s) >= threshold
    ]
    picks.sort(key=lambda entry: entry["score"], reverse=True)

    return {"kind": "classifications", "classifications": picks}
|