@camstack/addon-vision 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/addons/animal-classifier/index.d.mts +25 -0
- package/dist/addons/animal-classifier/index.d.ts +25 -0
- package/dist/addons/animal-classifier/index.js +652 -0
- package/dist/addons/animal-classifier/index.js.map +1 -0
- package/dist/addons/animal-classifier/index.mjs +10 -0
- package/dist/addons/animal-classifier/index.mjs.map +1 -0
- package/dist/addons/audio-classification/index.d.mts +31 -0
- package/dist/addons/audio-classification/index.d.ts +31 -0
- package/dist/addons/audio-classification/index.js +572 -0
- package/dist/addons/audio-classification/index.js.map +1 -0
- package/dist/addons/audio-classification/index.mjs +8 -0
- package/dist/addons/audio-classification/index.mjs.map +1 -0
- package/dist/addons/bird-global-classifier/index.d.mts +26 -0
- package/dist/addons/bird-global-classifier/index.d.ts +26 -0
- package/dist/addons/bird-global-classifier/index.js +658 -0
- package/dist/addons/bird-global-classifier/index.js.map +1 -0
- package/dist/addons/bird-global-classifier/index.mjs +10 -0
- package/dist/addons/bird-global-classifier/index.mjs.map +1 -0
- package/dist/addons/bird-nabirds-classifier/index.d.mts +28 -0
- package/dist/addons/bird-nabirds-classifier/index.d.ts +28 -0
- package/dist/addons/bird-nabirds-classifier/index.js +700 -0
- package/dist/addons/bird-nabirds-classifier/index.js.map +1 -0
- package/dist/addons/bird-nabirds-classifier/index.mjs +10 -0
- package/dist/addons/bird-nabirds-classifier/index.mjs.map +1 -0
- package/dist/addons/camera-native-detection/index.d.mts +32 -0
- package/dist/addons/camera-native-detection/index.d.ts +32 -0
- package/dist/addons/camera-native-detection/index.js +99 -0
- package/dist/addons/camera-native-detection/index.js.map +1 -0
- package/dist/addons/camera-native-detection/index.mjs +7 -0
- package/dist/addons/camera-native-detection/index.mjs.map +1 -0
- package/dist/addons/face-detection/index.d.mts +24 -0
- package/dist/addons/face-detection/index.d.ts +24 -0
- package/dist/addons/face-detection/index.js +720 -0
- package/dist/addons/face-detection/index.js.map +1 -0
- package/dist/addons/face-detection/index.mjs +10 -0
- package/dist/addons/face-detection/index.mjs.map +1 -0
- package/dist/addons/face-recognition/index.d.mts +24 -0
- package/dist/addons/face-recognition/index.d.ts +24 -0
- package/dist/addons/face-recognition/index.js +603 -0
- package/dist/addons/face-recognition/index.js.map +1 -0
- package/dist/addons/face-recognition/index.mjs +9 -0
- package/dist/addons/face-recognition/index.mjs.map +1 -0
- package/dist/addons/motion-detection/index.d.mts +26 -0
- package/dist/addons/motion-detection/index.d.ts +26 -0
- package/dist/addons/motion-detection/index.js +273 -0
- package/dist/addons/motion-detection/index.js.map +1 -0
- package/dist/addons/motion-detection/index.mjs +8 -0
- package/dist/addons/motion-detection/index.mjs.map +1 -0
- package/dist/addons/object-detection/index.d.mts +26 -0
- package/dist/addons/object-detection/index.d.ts +26 -0
- package/dist/addons/object-detection/index.js +1214 -0
- package/dist/addons/object-detection/index.js.map +1 -0
- package/dist/addons/object-detection/index.mjs +10 -0
- package/dist/addons/object-detection/index.mjs.map +1 -0
- package/dist/addons/plate-detection/index.d.mts +25 -0
- package/dist/addons/plate-detection/index.d.ts +25 -0
- package/dist/addons/plate-detection/index.js +646 -0
- package/dist/addons/plate-detection/index.js.map +1 -0
- package/dist/addons/plate-detection/index.mjs +10 -0
- package/dist/addons/plate-detection/index.mjs.map +1 -0
- package/dist/addons/plate-recognition/index.d.mts +25 -0
- package/dist/addons/plate-recognition/index.d.ts +25 -0
- package/dist/addons/plate-recognition/index.js +648 -0
- package/dist/addons/plate-recognition/index.js.map +1 -0
- package/dist/addons/plate-recognition/index.mjs +9 -0
- package/dist/addons/plate-recognition/index.mjs.map +1 -0
- package/dist/chunk-3MQFUDRU.mjs +260 -0
- package/dist/chunk-3MQFUDRU.mjs.map +1 -0
- package/dist/chunk-5AIQSN32.mjs +227 -0
- package/dist/chunk-5AIQSN32.mjs.map +1 -0
- package/dist/chunk-5JJZGKL7.mjs +186 -0
- package/dist/chunk-5JJZGKL7.mjs.map +1 -0
- package/dist/chunk-6OR5TE7A.mjs +101 -0
- package/dist/chunk-6OR5TE7A.mjs.map +1 -0
- package/dist/chunk-AYBFB7ID.mjs +763 -0
- package/dist/chunk-AYBFB7ID.mjs.map +1 -0
- package/dist/chunk-B3R66MPF.mjs +219 -0
- package/dist/chunk-B3R66MPF.mjs.map +1 -0
- package/dist/chunk-DTOAB2CE.mjs +79 -0
- package/dist/chunk-DTOAB2CE.mjs.map +1 -0
- package/dist/chunk-ISOIDU4U.mjs +54 -0
- package/dist/chunk-ISOIDU4U.mjs.map +1 -0
- package/dist/chunk-J4WRYHHY.mjs +212 -0
- package/dist/chunk-J4WRYHHY.mjs.map +1 -0
- package/dist/chunk-KUO2BVFY.mjs +90 -0
- package/dist/chunk-KUO2BVFY.mjs.map +1 -0
- package/dist/chunk-LPI42WL6.mjs +324 -0
- package/dist/chunk-LPI42WL6.mjs.map +1 -0
- package/dist/chunk-MEVASN3P.mjs +305 -0
- package/dist/chunk-MEVASN3P.mjs.map +1 -0
- package/dist/chunk-PDSHDDPV.mjs +255 -0
- package/dist/chunk-PDSHDDPV.mjs.map +1 -0
- package/dist/chunk-Q3SQOYG6.mjs +218 -0
- package/dist/chunk-Q3SQOYG6.mjs.map +1 -0
- package/dist/chunk-QIMDG34B.mjs +229 -0
- package/dist/chunk-QIMDG34B.mjs.map +1 -0
- package/dist/index.d.mts +171 -0
- package/dist/index.d.ts +171 -0
- package/dist/index.js +3463 -0
- package/dist/index.js.map +1 -0
- package/dist/index.mjs +111 -0
- package/dist/index.mjs.map +1 -0
- package/package.json +49 -0
- package/python/__pycache__/coreml_inference.cpython-313.pyc +0 -0
- package/python/__pycache__/openvino_inference.cpython-313.pyc +0 -0
- package/python/__pycache__/pytorch_inference.cpython-313.pyc +0 -0
- package/python/coreml_inference.py +319 -0
- package/python/openvino_inference.py +247 -0
- package/python/pytorch_inference.py +255 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""OpenVINO inference for YOLO object detection.
|
|
3
|
+
|
|
4
|
+
Binary IPC protocol over stdin/stdout:
|
|
5
|
+
Input: [4 bytes LE uint32 length][JPEG bytes]
|
|
6
|
+
Output: [4 bytes LE uint32 length][JSON bytes]
|
|
7
|
+
|
|
8
|
+
JSON output format:
|
|
9
|
+
{
|
|
10
|
+
"detections": [
|
|
11
|
+
{"className": "person", "score": 0.92, "bbox": [x1, y1, x2, y2]},
|
|
12
|
+
...
|
|
13
|
+
],
|
|
14
|
+
"inferenceMs": 12.5
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
Bounding boxes are NORMALIZED (0-1 range).
|
|
18
|
+
|
|
19
|
+
Usage:
|
|
20
|
+
python openvino_inference.py <model_path> [--device CPU|GPU|AUTO] [--input-size 640] [--confidence 0.25]
|
|
21
|
+
"""
|
|
22
|
+
import sys
|
|
23
|
+
import struct
|
|
24
|
+
import json
|
|
25
|
+
import argparse
|
|
26
|
+
import time
|
|
27
|
+
import io
|
|
28
|
+
import numpy as np
|
|
29
|
+
from PIL import Image
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Binary IPC helpers
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
def _read_exact(stream, n: int) -> bytes:
    """Read exactly *n* bytes, looping over short reads; returns fewer only at EOF."""
    chunks = []
    remaining = n
    while remaining > 0:
        chunk = stream.read(remaining)
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def read_frame(stream) -> bytes:
    """Read one length-prefixed JPEG frame from the binary IPC stream.

    Frame layout: [4-byte little-endian uint32 length][payload bytes].

    Returns b"" on EOF or a truncated frame, which the caller treats as
    "stop reading".  A plain ``stream.read(length)`` may return fewer bytes
    than requested on an unbuffered pipe, which would permanently desync the
    framing protocol, so we loop until the full payload arrives.
    """
    header = _read_exact(stream, 4)
    if len(header) < 4:
        return b""
    length = struct.unpack("<I", header)[0]
    payload = _read_exact(stream, length)
    # A short payload means the writer died mid-frame; signal EOF rather
    # than handing back a truncated JPEG.
    return payload if len(payload) == length else b""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_result(stream, result: dict) -> None:
    """Serialise *result* as JSON and emit it as one length-prefixed IPC frame.

    Frame layout mirrors the input side: [4-byte LE uint32 length][JSON bytes].
    Flushes so the consumer sees the frame immediately.
    """
    body = json.dumps(result).encode("utf-8")
    header = struct.pack("<I", len(body))
    stream.write(header + body)
    stream.flush()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
# Image preprocessing
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
def letterbox(img: Image.Image, size: int) -> tuple[np.ndarray, float, tuple[int, int]]:
    """Fit *img* into a size x size square with grey padding, keeping aspect ratio.

    Returns (HWC float32 array scaled to 0-1, resize scale, (pad_x, pad_y)).
    The caller is expected to transpose to CHW before feeding the model.
    """
    src_w, src_h = img.size
    ratio = min(size / src_w, size / src_h)
    dst_w = int(src_w * ratio)
    dst_h = int(src_h * ratio)
    scaled = img.resize((dst_w, dst_h), Image.BILINEAR)

    # Centre the scaled image on a grey (114,114,114) square canvas —
    # the standard YOLO letterbox fill colour.
    offset_x = (size - dst_w) // 2
    offset_y = (size - dst_h) // 2
    board = Image.new("RGB", (size, size), (114, 114, 114))
    board.paste(scaled, (offset_x, offset_y))

    pixels = np.array(board, dtype=np.float32) / 255.0
    return pixels, ratio, (offset_x, offset_y)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ---------------------------------------------------------------------------
|
|
75
|
+
# COCO 80 class names
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
|
|
78
|
+
# Class names for models trained on the COCO dataset, in the canonical
# 80-class order: detection-head class index i maps to COCO_CLASSES[i].
# Do not reorder — the indices are baked into the exported models.
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# YOLO output parsing
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
def compute_iou(x1a: float, y1a: float, x2a: float, y2a: float,
                x1b: float, y1b: float, x2b: float, y2b: float) -> float:
    """Intersection-over-union of two axis-aligned boxes in corner form.

    Returns 0.0 for disjoint boxes or a degenerate (zero-area) union.
    """
    overlap_w = min(x2a, x2b) - max(x1a, x1b)
    overlap_h = min(y2a, y2b) - max(y1a, y1b)
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0
    intersection = overlap_w * overlap_h
    union = ((x2a - x1a) * (y2a - y1a)
             + (x2b - x1b) * (y2b - y1b)
             - intersection)
    if union <= 0:
        return 0.0
    return intersection / union
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def parse_yolo_output(output: np.ndarray, conf_threshold: float,
                      img_w: int, img_h: int, input_size: int,
                      scale: float, pad: tuple[int, int],
                      iou_threshold: float = 0.45,
                      max_detections: int = 100,
                      class_names: "list[str] | None" = None) -> list[dict]:
    """Parse a YOLO output tensor into normalised detections.

    Args:
        output: raw model output, [1, C, N], [C, N] or [N, C] where the
            first 4 channels are cx, cy, w, h in letterboxed-input pixels
            and the rest are per-class scores (C == 84 for COCO).
        conf_threshold: minimum class score to keep a candidate.
        img_w, img_h: original image size, used to normalise boxes to 0-1.
        input_size: model input size; kept for interface compatibility —
            the letterbox geometry is fully described by scale/pad.
        scale, pad: letterbox scale factor and (pad_x, pad_y) offsets.
        iou_threshold: class-agnostic NMS suppression threshold.
        max_detections: cap on the number of kept detections.
        class_names: label table indexed by class id; defaults to COCO_CLASSES.

    Returns:
        List of {"className", "score", "bbox"} dicts, bbox as [x1, y1, x2, y2]
        normalised to the 0-1 range of the original image.
    """
    if class_names is None:
        class_names = COCO_CLASSES

    # Normalise layout to [N, C] (rows = candidate boxes).
    if output.ndim == 3 and output.shape[0] == 1:
        output = output[0]
    if output.shape[0] == 84:
        output = output.T

    cx, cy, w, h = output[:, 0], output[:, 1], output[:, 2], output[:, 3]
    class_scores = output[:, 4:]

    # Best class per candidate, then drop everything below the threshold.
    class_ids = np.argmax(class_scores, axis=1)
    scores = class_scores[np.arange(len(class_ids)), class_ids]

    mask = scores > conf_threshold
    cx, cy, w, h = cx[mask], cy[mask], w[mask], h[mask]
    scores, class_ids = scores[mask], class_ids[mask]

    # xywh -> xyxy, then undo the letterbox (remove padding, rescale).
    pad_x, pad_y = pad
    x1 = (cx - w / 2 - pad_x) / scale
    y1 = (cy - h / 2 - pad_y) / scale
    x2 = (cx + w / 2 - pad_x) / scale
    y2 = (cy + h / 2 - pad_y) / scale

    # Normalise to 0-1 relative to the original image, clamped to bounds.
    x1 = np.clip(x1 / img_w, 0, 1)
    y1 = np.clip(y1 / img_h, 0, 1)
    x2 = np.clip(x2 / img_w, 0, 1)
    y2 = np.clip(y2 / img_h, 0, 1)

    # Greedy class-agnostic NMS over candidates in descending score order.
    detections: list[dict] = []
    indices = np.argsort(-scores)
    suppressed: set[int] = set()

    for i in indices[:max_detections]:
        idx = int(i)
        if idx in suppressed:
            continue
        cid = int(class_ids[idx])
        detections.append({
            "className": class_names[cid] if cid < len(class_names) else f"class_{cid}",
            "score": round(float(scores[idx]), 4),
            "bbox": [round(float(x1[idx]), 4), round(float(y1[idx]), 4),
                     round(float(x2[idx]), 4), round(float(y2[idx]), 4)],
        })
        suppressed.add(idx)
        # Suppress all remaining candidates that overlap the kept box.
        for j in indices:
            jdx = int(j)
            if jdx in suppressed:
                continue
            iou = compute_iou(float(x1[idx]), float(y1[idx]), float(x2[idx]), float(y2[idx]),
                              float(x1[jdx]), float(y1[jdx]), float(x2[jdx]), float(y2[jdx]))
            if iou > iou_threshold:
                suppressed.add(jdx)

    return detections
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
# ---------------------------------------------------------------------------
|
|
173
|
+
# Main
|
|
174
|
+
# ---------------------------------------------------------------------------
|
|
175
|
+
|
|
176
|
+
def main() -> None:
    """Run the OpenVINO detection loop over binary stdin/stdout IPC.

    Protocol (see module docstring):
        stdin:  [4-byte LE uint32 length][JPEG bytes] per frame
        stdout: [4-byte LE uint32 length][JSON bytes] per result

    All diagnostics go to stderr so stdout remains a pure binary channel.
    The loop exits when stdin hits EOF (read_frame returns b"").
    """
    parser = argparse.ArgumentParser(description="OpenVINO inference via binary IPC")
    parser.add_argument("model_path", help="Path to .xml model (OpenVINO IR format)")
    parser.add_argument("--device", default="AUTO", choices=["CPU", "GPU", "AUTO"],
                        help="OpenVINO compute device")
    parser.add_argument("--input-size", type=int, default=640,
                        help="Model input size (square)")
    parser.add_argument("--confidence", type=float, default=0.25,
                        help="Confidence threshold")
    args = parser.parse_args()

    # Imported after arg parsing so --help/argument errors don't require
    # openvino to be installed.
    from openvino.runtime import Core

    ie = Core()

    sys.stderr.write(f"[openvino] Loading model: {args.model_path}\n")
    sys.stderr.write(f"[openvino] Device: {args.device}\n")
    sys.stderr.write(f"[openvino] Available devices: {ie.available_devices}\n")
    sys.stderr.flush()

    model = ie.read_model(args.model_path)
    compiled = ie.compile_model(model, args.device)
    infer_request = compiled.create_infer_request()

    # Resolve input/output tensor info
    input_layer = compiled.input(0)
    output_layer = compiled.output(0)

    sys.stderr.write(f"[openvino] Input shape: {input_layer.shape}, dtype: {input_layer.element_type}\n")
    sys.stderr.write(f"[openvino] Output shape: {output_layer.shape}\n")
    sys.stderr.write("[openvino] Model loaded, ready for inference\n")
    sys.stderr.flush()

    stdin_binary = sys.stdin.buffer
    stdout_binary = sys.stdout.buffer

    while True:
        jpeg = read_frame(stdin_binary)
        if not jpeg:
            break  # EOF: the parent process closed the pipe

        try:
            start = time.perf_counter()

            img = Image.open(io.BytesIO(jpeg)).convert("RGB")
            orig_w, orig_h = img.size

            arr, scale, pad = letterbox(img, args.input_size)

            # OpenVINO expects [1, 3, H, W] float32
            input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)

            infer_request.infer({0: input_arr})
            # .copy(): defensive — .data presumably aliases runtime-owned
            # tensor memory reused by the next infer(); TODO confirm.
            output = infer_request.get_output_tensor(0).data.copy()

            detections = parse_yolo_output(
                output, args.confidence, orig_w, orig_h,
                args.input_size, scale, pad,
            )

            elapsed = (time.perf_counter() - start) * 1000
            result = {"detections": detections, "inferenceMs": round(elapsed, 2)}
        except Exception as exc:
            # Keep the loop alive on a bad frame; report the error in-band so
            # the consumer still gets exactly one response per request.
            sys.stderr.write(f"[openvino] Inference error: {exc}\n")
            sys.stderr.flush()
            result = {"detections": [], "error": str(exc), "inferenceMs": 0}

        write_result(stdout_binary, result)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""ONNX Runtime (Python) inference for YOLO object detection.
|
|
3
|
+
|
|
4
|
+
Uses onnxruntime-python which can leverage CoreMLExecutionProvider on macOS,
|
|
5
|
+
CUDAExecutionProvider on Linux/Windows with NVIDIA GPUs, or CPUExecutionProvider
|
|
6
|
+
everywhere. This is simpler and more portable than requiring PyTorch directly.
|
|
7
|
+
|
|
8
|
+
Binary IPC protocol over stdin/stdout:
|
|
9
|
+
Input: [4 bytes LE uint32 length][JPEG bytes]
|
|
10
|
+
Output: [4 bytes LE uint32 length][JSON bytes]
|
|
11
|
+
|
|
12
|
+
JSON output format:
|
|
13
|
+
{
|
|
14
|
+
"detections": [
|
|
15
|
+
{"className": "person", "score": 0.92, "bbox": [x1, y1, x2, y2]},
|
|
16
|
+
...
|
|
17
|
+
],
|
|
18
|
+
"inferenceMs": 12.5
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
Bounding boxes are NORMALIZED (0-1 range).
|
|
22
|
+
|
|
23
|
+
Usage:
|
|
24
|
+
python pytorch_inference.py <model_path> [--device cpu|cuda|mps] [--input-size 640] [--confidence 0.25]
|
|
25
|
+
"""
|
|
26
|
+
import sys
|
|
27
|
+
import struct
|
|
28
|
+
import json
|
|
29
|
+
import argparse
|
|
30
|
+
import time
|
|
31
|
+
import io
|
|
32
|
+
import numpy as np
|
|
33
|
+
from PIL import Image
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Binary IPC helpers
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
def _read_exact(stream, n: int) -> bytes:
    """Read exactly *n* bytes, looping over short reads; returns fewer only at EOF."""
    chunks = []
    remaining = n
    while remaining > 0:
        chunk = stream.read(remaining)
        if not chunk:
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def read_frame(stream) -> bytes:
    """Read one length-prefixed JPEG frame from the binary IPC stream.

    Frame layout: [4-byte little-endian uint32 length][payload bytes].

    Returns b"" on EOF or a truncated frame, which the caller treats as
    "stop reading".  A plain ``stream.read(length)`` may return fewer bytes
    than requested on an unbuffered pipe, which would permanently desync the
    framing protocol, so we loop until the full payload arrives.
    """
    header = _read_exact(stream, 4)
    if len(header) < 4:
        return b""
    length = struct.unpack("<I", header)[0]
    payload = _read_exact(stream, length)
    # A short payload means the writer died mid-frame; signal EOF rather
    # than handing back a truncated JPEG.
    return payload if len(payload) == length else b""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def write_result(stream, result: dict) -> None:
    """Serialise *result* as JSON and emit it as one length-prefixed IPC frame.

    Frame layout mirrors the input side: [4-byte LE uint32 length][JSON bytes].
    Flushes so the consumer sees the frame immediately.
    """
    body = json.dumps(result).encode("utf-8")
    header = struct.pack("<I", len(body))
    stream.write(header + body)
    stream.flush()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Image preprocessing
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
def letterbox(img: Image.Image, size: int) -> tuple[np.ndarray, float, tuple[int, int]]:
    """Fit *img* into a size x size square with grey padding, keeping aspect ratio.

    Returns (HWC float32 array scaled to 0-1, resize scale, (pad_x, pad_y)).
    The caller is expected to transpose to CHW before feeding the model.
    """
    src_w, src_h = img.size
    ratio = min(size / src_w, size / src_h)
    dst_w = int(src_w * ratio)
    dst_h = int(src_h * ratio)
    scaled = img.resize((dst_w, dst_h), Image.BILINEAR)

    # Centre the scaled image on a grey (114,114,114) square canvas —
    # the standard YOLO letterbox fill colour.
    offset_x = (size - dst_w) // 2
    offset_y = (size - dst_h) // 2
    board = Image.new("RGB", (size, size), (114, 114, 114))
    board.paste(scaled, (offset_x, offset_y))

    pixels = np.array(board, dtype=np.float32) / 255.0
    return pixels, ratio, (offset_x, offset_y)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
# COCO 80 class names
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
|
|
82
|
+
# Class names for models trained on the COCO dataset, in the canonical
# 80-class order: detection-head class index i maps to COCO_CLASSES[i].
# Do not reorder — the indices are baked into the exported models.
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ---------------------------------------------------------------------------
|
|
97
|
+
# YOLO output parsing
|
|
98
|
+
# ---------------------------------------------------------------------------
|
|
99
|
+
|
|
100
|
+
def compute_iou(x1a: float, y1a: float, x2a: float, y2a: float,
                x1b: float, y1b: float, x2b: float, y2b: float) -> float:
    """Intersection-over-union of two axis-aligned boxes in corner form.

    Returns 0.0 for disjoint boxes or a degenerate (zero-area) union.
    """
    overlap_w = min(x2a, x2b) - max(x1a, x1b)
    overlap_h = min(y2a, y2b) - max(y1a, y1b)
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0
    intersection = overlap_w * overlap_h
    union = ((x2a - x1a) * (y2a - y1a)
             + (x2b - x1b) * (y2b - y1b)
             - intersection)
    if union <= 0:
        return 0.0
    return intersection / union
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def parse_yolo_output(output: np.ndarray, conf_threshold: float,
                      img_w: int, img_h: int, input_size: int,
                      scale: float, pad: tuple[int, int],
                      iou_threshold: float = 0.45,
                      max_detections: int = 100,
                      class_names: "list[str] | None" = None) -> list[dict]:
    """Parse a YOLO output tensor into normalised detections.

    Args:
        output: raw model output, [1, C, N], [C, N] or [N, C] where the
            first 4 channels are cx, cy, w, h in letterboxed-input pixels
            and the rest are per-class scores (C == 84 for COCO).
        conf_threshold: minimum class score to keep a candidate.
        img_w, img_h: original image size, used to normalise boxes to 0-1.
        input_size: model input size; kept for interface compatibility —
            the letterbox geometry is fully described by scale/pad.
        scale, pad: letterbox scale factor and (pad_x, pad_y) offsets.
        iou_threshold: class-agnostic NMS suppression threshold.
        max_detections: cap on the number of kept detections.
        class_names: label table indexed by class id; defaults to COCO_CLASSES.

    Returns:
        List of {"className", "score", "bbox"} dicts, bbox as [x1, y1, x2, y2]
        normalised to the 0-1 range of the original image.
    """
    if class_names is None:
        class_names = COCO_CLASSES

    # Normalise layout to [N, C] (rows = candidate boxes).
    if output.ndim == 3 and output.shape[0] == 1:
        output = output[0]
    if output.shape[0] == 84:
        output = output.T

    cx, cy, w, h = output[:, 0], output[:, 1], output[:, 2], output[:, 3]
    class_scores = output[:, 4:]

    # Best class per candidate, then drop everything below the threshold.
    class_ids = np.argmax(class_scores, axis=1)
    scores = class_scores[np.arange(len(class_ids)), class_ids]

    mask = scores > conf_threshold
    cx, cy, w, h = cx[mask], cy[mask], w[mask], h[mask]
    scores, class_ids = scores[mask], class_ids[mask]

    # xywh -> xyxy, then undo the letterbox (remove padding, rescale).
    pad_x, pad_y = pad
    x1 = (cx - w / 2 - pad_x) / scale
    y1 = (cy - h / 2 - pad_y) / scale
    x2 = (cx + w / 2 - pad_x) / scale
    y2 = (cy + h / 2 - pad_y) / scale

    # Normalise to 0-1 relative to the original image, clamped to bounds.
    x1 = np.clip(x1 / img_w, 0, 1)
    y1 = np.clip(y1 / img_h, 0, 1)
    x2 = np.clip(x2 / img_w, 0, 1)
    y2 = np.clip(y2 / img_h, 0, 1)

    # Greedy class-agnostic NMS over candidates in descending score order.
    detections: list[dict] = []
    indices = np.argsort(-scores)
    suppressed: set[int] = set()

    for i in indices[:max_detections]:
        idx = int(i)
        if idx in suppressed:
            continue
        cid = int(class_ids[idx])
        detections.append({
            "className": class_names[cid] if cid < len(class_names) else f"class_{cid}",
            "score": round(float(scores[idx]), 4),
            "bbox": [round(float(x1[idx]), 4), round(float(y1[idx]), 4),
                     round(float(x2[idx]), 4), round(float(y2[idx]), 4)],
        })
        suppressed.add(idx)
        # Suppress all remaining candidates that overlap the kept box.
        for j in indices:
            jdx = int(j)
            if jdx in suppressed:
                continue
            iou = compute_iou(float(x1[idx]), float(y1[idx]), float(x2[idx]), float(y2[idx]),
                              float(x1[jdx]), float(y1[jdx]), float(x2[jdx]), float(y2[jdx]))
            if iou > iou_threshold:
                suppressed.add(jdx)

    return detections
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
# ---------------------------------------------------------------------------
|
|
177
|
+
# Main
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
|
|
180
|
+
def main() -> None:
    """Run the ONNX Runtime detection loop over binary stdin/stdout IPC.

    Protocol (see module docstring):
        stdin:  [4-byte LE uint32 length][JPEG bytes] per frame
        stdout: [4-byte LE uint32 length][JSON bytes] per result

    All diagnostics go to stderr so stdout remains a pure binary channel.
    The loop exits when stdin hits EOF (read_frame returns b"").
    """
    parser = argparse.ArgumentParser(description="ONNX Runtime (Python) inference via binary IPC")
    parser.add_argument("model_path", help="Path to .onnx model")
    parser.add_argument("--device", default="cpu", choices=["cpu", "cuda", "mps"],
                        help="Compute device (selects execution provider)")
    parser.add_argument("--input-size", type=int, default=640,
                        help="Model input size (square)")
    parser.add_argument("--confidence", type=float, default=0.25,
                        help="Confidence threshold")
    args = parser.parse_args()

    # Imported after arg parsing so --help/argument errors don't require
    # onnxruntime to be installed.
    import onnxruntime as ort

    # Select execution providers based on requested device
    if args.device == "cuda":
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    elif args.device == "mps" and sys.platform == "darwin":
        providers = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
    elif sys.platform == "darwin":
        # On macOS, prefer CoreML even for "cpu" — it may dispatch to ANE
        providers = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
    else:
        providers = ["CPUExecutionProvider"]

    sys.stderr.write(f"[pytorch/ort] Loading model: {args.model_path}\n")
    sys.stderr.write(f"[pytorch/ort] Requested device: {args.device}\n")
    sys.stderr.write(f"[pytorch/ort] Providers: {providers}\n")
    sys.stderr.flush()

    # ORT silently falls back to later providers in the list if the first
    # is unavailable; the active list is logged below for diagnosis.
    session = ort.InferenceSession(args.model_path, providers=providers)
    input_name = session.get_inputs()[0].name
    active_providers = session.get_providers()

    sys.stderr.write(f"[pytorch/ort] Active providers: {active_providers}\n")
    sys.stderr.write("[pytorch/ort] Model loaded, ready for inference\n")
    sys.stderr.flush()

    stdin_binary = sys.stdin.buffer
    stdout_binary = sys.stdout.buffer

    while True:
        jpeg = read_frame(stdin_binary)
        if not jpeg:
            break  # EOF: the parent process closed the pipe

        try:
            start = time.perf_counter()

            img = Image.open(io.BytesIO(jpeg)).convert("RGB")
            orig_w, orig_h = img.size

            arr, scale, pad = letterbox(img, args.input_size)

            # ONNX expects [1, 3, H, W] float32
            input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)

            outputs = session.run(None, {input_name: input_arr})
            output = outputs[0]

            detections = parse_yolo_output(
                output, args.confidence, orig_w, orig_h,
                args.input_size, scale, pad,
            )

            elapsed = (time.perf_counter() - start) * 1000
            result = {"detections": detections, "inferenceMs": round(elapsed, 2)}
        except Exception as exc:
            # Keep the loop alive on a bad frame; report the error in-band so
            # the consumer still gets exactly one response per request.
            sys.stderr.write(f"[pytorch/ort] Inference error: {exc}\n")
            sys.stderr.flush()
            result = {"detections": [], "error": str(exc), "inferenceMs": 0}

        write_result(stdout_binary, result)


if __name__ == "__main__":
    main()
|