@camstack/addon-vision 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/dist/addons/animal-classifier/index.d.mts +25 -0
  2. package/dist/addons/animal-classifier/index.d.ts +25 -0
  3. package/dist/addons/animal-classifier/index.js +652 -0
  4. package/dist/addons/animal-classifier/index.js.map +1 -0
  5. package/dist/addons/animal-classifier/index.mjs +10 -0
  6. package/dist/addons/animal-classifier/index.mjs.map +1 -0
  7. package/dist/addons/audio-classification/index.d.mts +31 -0
  8. package/dist/addons/audio-classification/index.d.ts +31 -0
  9. package/dist/addons/audio-classification/index.js +572 -0
  10. package/dist/addons/audio-classification/index.js.map +1 -0
  11. package/dist/addons/audio-classification/index.mjs +8 -0
  12. package/dist/addons/audio-classification/index.mjs.map +1 -0
  13. package/dist/addons/bird-global-classifier/index.d.mts +26 -0
  14. package/dist/addons/bird-global-classifier/index.d.ts +26 -0
  15. package/dist/addons/bird-global-classifier/index.js +658 -0
  16. package/dist/addons/bird-global-classifier/index.js.map +1 -0
  17. package/dist/addons/bird-global-classifier/index.mjs +10 -0
  18. package/dist/addons/bird-global-classifier/index.mjs.map +1 -0
  19. package/dist/addons/bird-nabirds-classifier/index.d.mts +28 -0
  20. package/dist/addons/bird-nabirds-classifier/index.d.ts +28 -0
  21. package/dist/addons/bird-nabirds-classifier/index.js +700 -0
  22. package/dist/addons/bird-nabirds-classifier/index.js.map +1 -0
  23. package/dist/addons/bird-nabirds-classifier/index.mjs +10 -0
  24. package/dist/addons/bird-nabirds-classifier/index.mjs.map +1 -0
  25. package/dist/addons/camera-native-detection/index.d.mts +32 -0
  26. package/dist/addons/camera-native-detection/index.d.ts +32 -0
  27. package/dist/addons/camera-native-detection/index.js +99 -0
  28. package/dist/addons/camera-native-detection/index.js.map +1 -0
  29. package/dist/addons/camera-native-detection/index.mjs +7 -0
  30. package/dist/addons/camera-native-detection/index.mjs.map +1 -0
  31. package/dist/addons/face-detection/index.d.mts +24 -0
  32. package/dist/addons/face-detection/index.d.ts +24 -0
  33. package/dist/addons/face-detection/index.js +720 -0
  34. package/dist/addons/face-detection/index.js.map +1 -0
  35. package/dist/addons/face-detection/index.mjs +10 -0
  36. package/dist/addons/face-detection/index.mjs.map +1 -0
  37. package/dist/addons/face-recognition/index.d.mts +24 -0
  38. package/dist/addons/face-recognition/index.d.ts +24 -0
  39. package/dist/addons/face-recognition/index.js +603 -0
  40. package/dist/addons/face-recognition/index.js.map +1 -0
  41. package/dist/addons/face-recognition/index.mjs +9 -0
  42. package/dist/addons/face-recognition/index.mjs.map +1 -0
  43. package/dist/addons/motion-detection/index.d.mts +26 -0
  44. package/dist/addons/motion-detection/index.d.ts +26 -0
  45. package/dist/addons/motion-detection/index.js +273 -0
  46. package/dist/addons/motion-detection/index.js.map +1 -0
  47. package/dist/addons/motion-detection/index.mjs +8 -0
  48. package/dist/addons/motion-detection/index.mjs.map +1 -0
  49. package/dist/addons/object-detection/index.d.mts +26 -0
  50. package/dist/addons/object-detection/index.d.ts +26 -0
  51. package/dist/addons/object-detection/index.js +1214 -0
  52. package/dist/addons/object-detection/index.js.map +1 -0
  53. package/dist/addons/object-detection/index.mjs +10 -0
  54. package/dist/addons/object-detection/index.mjs.map +1 -0
  55. package/dist/addons/plate-detection/index.d.mts +25 -0
  56. package/dist/addons/plate-detection/index.d.ts +25 -0
  57. package/dist/addons/plate-detection/index.js +646 -0
  58. package/dist/addons/plate-detection/index.js.map +1 -0
  59. package/dist/addons/plate-detection/index.mjs +10 -0
  60. package/dist/addons/plate-detection/index.mjs.map +1 -0
  61. package/dist/addons/plate-recognition/index.d.mts +25 -0
  62. package/dist/addons/plate-recognition/index.d.ts +25 -0
  63. package/dist/addons/plate-recognition/index.js +648 -0
  64. package/dist/addons/plate-recognition/index.js.map +1 -0
  65. package/dist/addons/plate-recognition/index.mjs +9 -0
  66. package/dist/addons/plate-recognition/index.mjs.map +1 -0
  67. package/dist/chunk-3MQFUDRU.mjs +260 -0
  68. package/dist/chunk-3MQFUDRU.mjs.map +1 -0
  69. package/dist/chunk-5AIQSN32.mjs +227 -0
  70. package/dist/chunk-5AIQSN32.mjs.map +1 -0
  71. package/dist/chunk-5JJZGKL7.mjs +186 -0
  72. package/dist/chunk-5JJZGKL7.mjs.map +1 -0
  73. package/dist/chunk-6OR5TE7A.mjs +101 -0
  74. package/dist/chunk-6OR5TE7A.mjs.map +1 -0
  75. package/dist/chunk-AYBFB7ID.mjs +763 -0
  76. package/dist/chunk-AYBFB7ID.mjs.map +1 -0
  77. package/dist/chunk-B3R66MPF.mjs +219 -0
  78. package/dist/chunk-B3R66MPF.mjs.map +1 -0
  79. package/dist/chunk-DTOAB2CE.mjs +79 -0
  80. package/dist/chunk-DTOAB2CE.mjs.map +1 -0
  81. package/dist/chunk-ISOIDU4U.mjs +54 -0
  82. package/dist/chunk-ISOIDU4U.mjs.map +1 -0
  83. package/dist/chunk-J4WRYHHY.mjs +212 -0
  84. package/dist/chunk-J4WRYHHY.mjs.map +1 -0
  85. package/dist/chunk-KUO2BVFY.mjs +90 -0
  86. package/dist/chunk-KUO2BVFY.mjs.map +1 -0
  87. package/dist/chunk-LPI42WL6.mjs +324 -0
  88. package/dist/chunk-LPI42WL6.mjs.map +1 -0
  89. package/dist/chunk-MEVASN3P.mjs +305 -0
  90. package/dist/chunk-MEVASN3P.mjs.map +1 -0
  91. package/dist/chunk-PDSHDDPV.mjs +255 -0
  92. package/dist/chunk-PDSHDDPV.mjs.map +1 -0
  93. package/dist/chunk-Q3SQOYG6.mjs +218 -0
  94. package/dist/chunk-Q3SQOYG6.mjs.map +1 -0
  95. package/dist/chunk-QIMDG34B.mjs +229 -0
  96. package/dist/chunk-QIMDG34B.mjs.map +1 -0
  97. package/dist/index.d.mts +171 -0
  98. package/dist/index.d.ts +171 -0
  99. package/dist/index.js +3463 -0
  100. package/dist/index.js.map +1 -0
  101. package/dist/index.mjs +111 -0
  102. package/dist/index.mjs.map +1 -0
  103. package/package.json +49 -0
  104. package/python/__pycache__/coreml_inference.cpython-313.pyc +0 -0
  105. package/python/__pycache__/openvino_inference.cpython-313.pyc +0 -0
  106. package/python/__pycache__/pytorch_inference.cpython-313.pyc +0 -0
  107. package/python/coreml_inference.py +319 -0
  108. package/python/openvino_inference.py +247 -0
  109. package/python/pytorch_inference.py +255 -0
@@ -0,0 +1,247 @@
1
+ #!/usr/bin/env python3
2
+ """OpenVINO inference for YOLO object detection.
3
+
4
+ Binary IPC protocol over stdin/stdout:
5
+ Input: [4 bytes LE uint32 length][JPEG bytes]
6
+ Output: [4 bytes LE uint32 length][JSON bytes]
7
+
8
+ JSON output format:
9
+ {
10
+ "detections": [
11
+ {"className": "person", "score": 0.92, "bbox": [x1, y1, x2, y2]},
12
+ ...
13
+ ],
14
+ "inferenceMs": 12.5
15
+ }
16
+
17
+ Bounding boxes are NORMALIZED (0-1 range).
18
+
19
+ Usage:
20
+ python openvino_inference.py <model_path> [--device CPU|GPU|AUTO] [--input-size 640] [--confidence 0.25]
21
+ """
22
+ import sys
23
+ import struct
24
+ import json
25
+ import argparse
26
+ import time
27
+ import io
28
+ import numpy as np
29
+ from PIL import Image
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Binary IPC helpers
34
+ # ---------------------------------------------------------------------------
35
+
36
+ def read_frame(stream) -> bytes:
37
+ """Read one JPEG frame from binary IPC stream."""
38
+ header = stream.read(4)
39
+ if len(header) < 4:
40
+ return b""
41
+ length = struct.unpack("<I", header)[0]
42
+ return stream.read(length)
43
+
44
+
45
def write_result(stream, result: dict) -> None:
    """Serialise *result* as UTF-8 JSON and emit one length-prefixed IPC message."""
    body = json.dumps(result).encode("utf-8")
    header = struct.pack("<I", len(body))
    stream.write(header + body)
    stream.flush()
50
+
51
+
52
+ # ---------------------------------------------------------------------------
53
+ # Image preprocessing
54
+ # ---------------------------------------------------------------------------
55
+
56
def letterbox(img: Image.Image, size: int) -> tuple[np.ndarray, float, tuple[int, int]]:
    """Resize with letterbox padding (maintain aspect ratio).

    Returns (HWC float32 array normalised 0-1, scale, (pad_x, pad_y)).
    Note: the array is height x width x channel; the caller transposes
    to CHW before feeding the model.
    """
    w, h = img.size
    # Scale so the longer side fits exactly into the square canvas.
    scale = min(size / w, size / h)
    nw, nh = int(w * scale), int(h * scale)
    img_resized = img.resize((nw, nh), Image.BILINEAR)

    # (114, 114, 114) grey is the conventional YOLO letterbox fill colour.
    canvas = Image.new("RGB", (size, size), (114, 114, 114))
    # Centre the resized image; integer division may leave a 1px asymmetry.
    pad_x, pad_y = (size - nw) // 2, (size - nh) // 2
    canvas.paste(img_resized, (pad_x, pad_y))

    arr = np.array(canvas, dtype=np.float32) / 255.0
    return arr, scale, (pad_x, pad_y)
72
+
73
+
74
+ # ---------------------------------------------------------------------------
75
+ # COCO 80 class names
76
+ # ---------------------------------------------------------------------------
77
+
78
# Index position in this list IS the class id emitted by the model, so the
# order must match the COCO-80 training label order — do not reorder or edit.
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # YOLO output parsing
94
+ # ---------------------------------------------------------------------------
95
+
96
def compute_iou(x1a: float, y1a: float, x2a: float, y2a: float,
                x1b: float, y1b: float, x2b: float, y2b: float) -> float:
    """Return the intersection-over-union of two corner-format boxes.

    Yields 0.0 for disjoint boxes or a non-positive union.
    """
    overlap_w = min(x2a, x2b) - max(x1a, x1b)
    overlap_h = min(y2a, y2b) - max(y1a, y1b)
    inter = max(0, overlap_w) * max(0, overlap_h)
    union = (x2a - x1a) * (y2a - y1a) + (x2b - x1b) * (y2b - y1b) - inter
    return inter / union if union > 0 else 0.0
108
+
109
+
110
def parse_yolo_output(output: np.ndarray, conf_threshold: float,
                      img_w: int, img_h: int, input_size: int,
                      scale: float, pad: tuple[int, int]) -> list[dict]:
    """Decode a YOLO output tensor [1, 84, 8400] into normalised detections.

    Candidates at or below *conf_threshold* are dropped, the letterbox
    padding/scale is undone, coordinates are normalised to 0-1 against the
    original image size, and greedy NMS (IoU > 0.45) keeps at most 100 of
    the highest-scoring boxes.
    """
    # Drop the batch axis and orient rows as candidates: [N, 4 + num_classes].
    if output.ndim == 3 and output.shape[0] == 1:
        output = output[0]
    if output.shape[0] == 84:
        output = output.T

    # Per-candidate best class and its confidence.
    cls_scores = output[:, 4:]
    cls_ids = np.argmax(cls_scores, axis=1)
    conf = cls_scores[np.arange(cls_ids.size), cls_ids]

    keep = conf > conf_threshold
    boxes = output[:, :4][keep]
    conf = conf[keep]
    cls_ids = cls_ids[keep]

    # Centre/size -> corners, still in letterboxed input-pixel space.
    half_w = boxes[:, 2] / 2
    half_h = boxes[:, 3] / 2
    x1 = boxes[:, 0] - half_w
    y1 = boxes[:, 1] - half_h
    x2 = boxes[:, 0] + half_w
    y2 = boxes[:, 1] + half_h

    # Undo letterbox padding/scale, then normalise to 0-1 with clipping.
    pad_x, pad_y = pad
    x1 = np.clip(((x1 - pad_x) / scale) / img_w, 0, 1)
    y1 = np.clip(((y1 - pad_y) / scale) / img_h, 0, 1)
    x2 = np.clip(((x2 - pad_x) / scale) / img_w, 0, 1)
    y2 = np.clip(((y2 - pad_y) / scale) / img_h, 0, 1)

    # Greedy NMS over score-sorted candidates, capped at 100 keepers.
    order = np.argsort(-conf)
    removed: set[int] = set()
    picked: list[dict] = []

    for cand in order[:100]:
        k = int(cand)
        if k in removed:
            continue
        cid = cls_ids[k]
        picked.append({
            "className": COCO_CLASSES[cid] if cid < len(COCO_CLASSES) else f"class_{cid}",
            "score": round(float(conf[k]), 4),
            "bbox": [round(float(x1[k]), 4), round(float(y1[k]), 4),
                     round(float(x2[k]), 4), round(float(y2[k]), 4)],
        })
        # Suppress every other live box overlapping the kept one too much.
        for other in order:
            o = int(other)
            if o == k or o in removed:
                continue
            if compute_iou(float(x1[k]), float(y1[k]), float(x2[k]), float(y2[k]),
                           float(x1[o]), float(y1[o]), float(x2[o]), float(y2[o])) > 0.45:
                removed.add(o)
        removed.add(k)

    return picked
170
+
171
+
172
+ # ---------------------------------------------------------------------------
173
+ # Main
174
+ # ---------------------------------------------------------------------------
175
+
176
def main() -> None:
    """Run the OpenVINO inference loop over binary stdin/stdout IPC.

    Reads length-prefixed JPEG frames from stdin, runs the compiled model,
    and writes length-prefixed JSON detection results to stdout until the
    input stream closes.
    """
    parser = argparse.ArgumentParser(description="OpenVINO inference via binary IPC")
    parser.add_argument("model_path", help="Path to .xml model (OpenVINO IR format)")
    parser.add_argument("--device", default="AUTO", choices=["CPU", "GPU", "AUTO"],
                        help="OpenVINO compute device")
    parser.add_argument("--input-size", type=int, default=640,
                        help="Model input size (square)")
    parser.add_argument("--confidence", type=float, default=0.25,
                        help="Confidence threshold")
    args = parser.parse_args()

    # Imported lazily so `--help` and argument errors work without openvino.
    # NOTE(review): `openvino.runtime` is the legacy import path; recent
    # releases expose `from openvino import Core` — confirm the minimum
    # supported OpenVINO version before changing this.
    from openvino.runtime import Core

    ie = Core()

    # All diagnostics go to stderr; stdout is reserved for the binary protocol.
    sys.stderr.write(f"[openvino] Loading model: {args.model_path}\n")
    sys.stderr.write(f"[openvino] Device: {args.device}\n")
    sys.stderr.write(f"[openvino] Available devices: {ie.available_devices}\n")
    sys.stderr.flush()

    model = ie.read_model(args.model_path)
    compiled = ie.compile_model(model, args.device)
    infer_request = compiled.create_infer_request()

    # Resolve input/output tensor info
    input_layer = compiled.input(0)
    output_layer = compiled.output(0)

    sys.stderr.write(f"[openvino] Input shape: {input_layer.shape}, dtype: {input_layer.element_type}\n")
    sys.stderr.write(f"[openvino] Output shape: {output_layer.shape}\n")
    sys.stderr.write("[openvino] Model loaded, ready for inference\n")
    sys.stderr.flush()

    stdin_binary = sys.stdin.buffer
    stdout_binary = sys.stdout.buffer

    while True:
        jpeg = read_frame(stdin_binary)
        if not jpeg:
            # Empty frame means the parent closed the pipe — shut down cleanly.
            break

        try:
            start = time.perf_counter()

            img = Image.open(io.BytesIO(jpeg)).convert("RGB")
            orig_w, orig_h = img.size

            arr, scale, pad = letterbox(img, args.input_size)

            # OpenVINO expects [1, 3, H, W] float32
            input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)

            # NOTE(review): `.copy()` presumably guards against the runtime
            # reusing the output buffer on the next infer() — confirm.
            infer_request.infer({0: input_arr})
            output = infer_request.get_output_tensor(0).data.copy()

            detections = parse_yolo_output(
                output, args.confidence, orig_w, orig_h,
                args.input_size, scale, pad,
            )

            elapsed = (time.perf_counter() - start) * 1000
            result = {"detections": detections, "inferenceMs": round(elapsed, 2)}
        except Exception as exc:
            # Never crash the IPC loop on one bad frame: report and continue.
            sys.stderr.write(f"[openvino] Inference error: {exc}\n")
            sys.stderr.flush()
            result = {"detections": [], "error": str(exc), "inferenceMs": 0}

        write_result(stdout_binary, result)


if __name__ == "__main__":
    main()
@@ -0,0 +1,255 @@
1
+ #!/usr/bin/env python3
2
+ """ONNX Runtime (Python) inference for YOLO object detection.
3
+
4
+ Uses onnxruntime-python which can leverage CoreMLExecutionProvider on macOS,
5
+ CUDAExecutionProvider on Linux/Windows with NVIDIA GPUs, or CPUExecutionProvider
6
+ everywhere. This is simpler and more portable than requiring PyTorch directly.
7
+
8
+ Binary IPC protocol over stdin/stdout:
9
+ Input: [4 bytes LE uint32 length][JPEG bytes]
10
+ Output: [4 bytes LE uint32 length][JSON bytes]
11
+
12
+ JSON output format:
13
+ {
14
+ "detections": [
15
+ {"className": "person", "score": 0.92, "bbox": [x1, y1, x2, y2]},
16
+ ...
17
+ ],
18
+ "inferenceMs": 12.5
19
+ }
20
+
21
+ Bounding boxes are NORMALIZED (0-1 range).
22
+
23
+ Usage:
24
+ python pytorch_inference.py <model_path> [--device cpu|cuda|mps] [--input-size 640] [--confidence 0.25]
25
+ """
26
+ import sys
27
+ import struct
28
+ import json
29
+ import argparse
30
+ import time
31
+ import io
32
+ import numpy as np
33
+ from PIL import Image
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Binary IPC helpers
38
+ # ---------------------------------------------------------------------------
39
+
40
+ def read_frame(stream) -> bytes:
41
+ """Read one JPEG frame from binary IPC stream."""
42
+ header = stream.read(4)
43
+ if len(header) < 4:
44
+ return b""
45
+ length = struct.unpack("<I", header)[0]
46
+ return stream.read(length)
47
+
48
+
49
def write_result(stream, result: dict) -> None:
    """Encode *result* as UTF-8 JSON and write it with a uint32-LE length prefix."""
    encoded = json.dumps(result).encode("utf-8")
    message = struct.pack("<I", len(encoded)) + encoded
    stream.write(message)
    stream.flush()
54
+
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Image preprocessing
58
+ # ---------------------------------------------------------------------------
59
+
60
def letterbox(img: Image.Image, size: int) -> tuple[np.ndarray, float, tuple[int, int]]:
    """Resize with letterbox padding (maintain aspect ratio).

    Returns (HWC float32 array normalised 0-1, scale, (pad_x, pad_y)).
    Note: the array is height x width x channel; the caller transposes
    to CHW before feeding the model.
    """
    w, h = img.size
    # Scale so the longer side fits exactly into the square canvas.
    scale = min(size / w, size / h)
    nw, nh = int(w * scale), int(h * scale)
    img_resized = img.resize((nw, nh), Image.BILINEAR)

    # (114, 114, 114) grey is the conventional YOLO letterbox fill colour.
    canvas = Image.new("RGB", (size, size), (114, 114, 114))
    # Centre the resized image; integer division may leave a 1px asymmetry.
    pad_x, pad_y = (size - nw) // 2, (size - nh) // 2
    canvas.paste(img_resized, (pad_x, pad_y))

    arr = np.array(canvas, dtype=np.float32) / 255.0
    return arr, scale, (pad_x, pad_y)
76
+
77
+
78
+ # ---------------------------------------------------------------------------
79
+ # COCO 80 class names
80
+ # ---------------------------------------------------------------------------
81
+
82
# Index position in this list IS the class id emitted by the model, so the
# order must match the COCO-80 training label order — do not reorder or edit.
COCO_CLASSES = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat",
    "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
    "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball",
    "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", "tennis racket",
    "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair",
    "couch", "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven", "toaster", "sink",
    "refrigerator", "book", "clock", "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
]
94
+
95
+
96
+ # ---------------------------------------------------------------------------
97
+ # YOLO output parsing
98
+ # ---------------------------------------------------------------------------
99
+
100
def compute_iou(x1a: float, y1a: float, x2a: float, y2a: float,
                x1b: float, y1b: float, x2b: float, y2b: float) -> float:
    """Return the intersection-over-union of two corner-format boxes.

    Yields 0.0 for disjoint boxes or a non-positive union.
    """
    iw = min(x2a, x2b) - max(x1a, x1b)
    ih = min(y2a, y2b) - max(y1a, y1b)
    inter = max(0, iw) * max(0, ih)
    area_sum = (x2a - x1a) * (y2a - y1a) + (x2b - x1b) * (y2b - y1b)
    union = area_sum - inter
    return inter / union if union > 0 else 0.0
112
+
113
+
114
def parse_yolo_output(output: np.ndarray, conf_threshold: float,
                      img_w: int, img_h: int, input_size: int,
                      scale: float, pad: tuple[int, int]) -> list[dict]:
    """Decode a YOLO output tensor [1, 84, 8400] into normalised detections.

    Candidates at or below *conf_threshold* are dropped, the letterbox
    padding/scale is undone, coordinates are normalised to 0-1 against the
    original image size, and greedy NMS (IoU > 0.45) keeps at most 100 of
    the highest-scoring boxes.
    """
    # Drop the batch axis and orient rows as candidates: [N, 4 + num_classes].
    if output.ndim == 3 and output.shape[0] == 1:
        output = output[0]
    if output.shape[0] == 84:
        output = output.T

    # Per-candidate best class and its confidence.
    per_class = output[:, 4:]
    labels = np.argmax(per_class, axis=1)
    confidences = per_class[np.arange(labels.size), labels]

    selected = confidences > conf_threshold
    raw_boxes = output[:, :4][selected]
    confidences = confidences[selected]
    labels = labels[selected]

    # Centre/size -> corners, still in letterboxed input-pixel space.
    hw = raw_boxes[:, 2] / 2
    hh = raw_boxes[:, 3] / 2
    x1 = raw_boxes[:, 0] - hw
    y1 = raw_boxes[:, 1] - hh
    x2 = raw_boxes[:, 0] + hw
    y2 = raw_boxes[:, 1] + hh

    # Undo letterbox padding/scale, then normalise to 0-1 with clipping.
    pad_x, pad_y = pad
    x1 = np.clip(((x1 - pad_x) / scale) / img_w, 0, 1)
    y1 = np.clip(((y1 - pad_y) / scale) / img_h, 0, 1)
    x2 = np.clip(((x2 - pad_x) / scale) / img_w, 0, 1)
    y2 = np.clip(((y2 - pad_y) / scale) / img_h, 0, 1)

    # Greedy NMS over score-sorted candidates, capped at 100 keepers.
    ranking = np.argsort(-confidences)
    dropped: set[int] = set()
    kept: list[dict] = []

    for candidate in ranking[:100]:
        k = int(candidate)
        if k in dropped:
            continue
        lab = labels[k]
        kept.append({
            "className": COCO_CLASSES[lab] if lab < len(COCO_CLASSES) else f"class_{lab}",
            "score": round(float(confidences[k]), 4),
            "bbox": [round(float(x1[k]), 4), round(float(y1[k]), 4),
                     round(float(x2[k]), 4), round(float(y2[k]), 4)],
        })
        # Suppress every other live box overlapping the kept one too much.
        for rival in ranking:
            r = int(rival)
            if r == k or r in dropped:
                continue
            if compute_iou(float(x1[k]), float(y1[k]), float(x2[k]), float(y2[k]),
                           float(x1[r]), float(y1[r]), float(x2[r]), float(y2[r])) > 0.45:
                dropped.add(r)
        dropped.add(k)

    return kept
174
+
175
+
176
+ # ---------------------------------------------------------------------------
177
+ # Main
178
+ # ---------------------------------------------------------------------------
179
+
180
def main() -> None:
    """Run the ONNX Runtime inference loop over binary stdin/stdout IPC.

    Reads length-prefixed JPEG frames from stdin, runs the session, and
    writes length-prefixed JSON detection results to stdout until the
    input stream closes.
    """
    parser = argparse.ArgumentParser(description="ONNX Runtime (Python) inference via binary IPC")
    parser.add_argument("model_path", help="Path to .onnx model")
    parser.add_argument("--device", default="cpu", choices=["cpu", "cuda", "mps"],
                        help="Compute device (selects execution provider)")
    parser.add_argument("--input-size", type=int, default=640,
                        help="Model input size (square)")
    parser.add_argument("--confidence", type=float, default=0.25,
                        help="Confidence threshold")
    args = parser.parse_args()

    # Imported lazily so `--help` and argument errors work without onnxruntime.
    import onnxruntime as ort

    # Select execution providers based on requested device
    # NOTE(review): requesting "mps" on a non-macOS platform silently falls
    # through to the CPU-only branch — confirm that is the intended fallback.
    if args.device == "cuda":
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    elif args.device == "mps" and sys.platform == "darwin":
        providers = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
    elif sys.platform == "darwin":
        # On macOS, prefer CoreML even for "cpu" — it may dispatch to ANE
        providers = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
    else:
        providers = ["CPUExecutionProvider"]

    # All diagnostics go to stderr; stdout is reserved for the binary protocol.
    sys.stderr.write(f"[pytorch/ort] Loading model: {args.model_path}\n")
    sys.stderr.write(f"[pytorch/ort] Requested device: {args.device}\n")
    sys.stderr.write(f"[pytorch/ort] Providers: {providers}\n")
    sys.stderr.flush()

    # ORT falls back through the provider list, so the active set may differ
    # from the requested one (e.g. CUDA unavailable).
    session = ort.InferenceSession(args.model_path, providers=providers)
    input_name = session.get_inputs()[0].name
    active_providers = session.get_providers()

    sys.stderr.write(f"[pytorch/ort] Active providers: {active_providers}\n")
    sys.stderr.write("[pytorch/ort] Model loaded, ready for inference\n")
    sys.stderr.flush()

    stdin_binary = sys.stdin.buffer
    stdout_binary = sys.stdout.buffer

    while True:
        jpeg = read_frame(stdin_binary)
        if not jpeg:
            # Empty frame means the parent closed the pipe — shut down cleanly.
            break

        try:
            start = time.perf_counter()

            img = Image.open(io.BytesIO(jpeg)).convert("RGB")
            orig_w, orig_h = img.size

            arr, scale, pad = letterbox(img, args.input_size)

            # ONNX expects [1, 3, H, W] float32
            input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)

            # run(None, ...) fetches all model outputs; only the first is used.
            outputs = session.run(None, {input_name: input_arr})
            output = outputs[0]

            detections = parse_yolo_output(
                output, args.confidence, orig_w, orig_h,
                args.input_size, scale, pad,
            )

            elapsed = (time.perf_counter() - start) * 1000
            result = {"detections": detections, "inferenceMs": round(elapsed, 2)}
        except Exception as exc:
            # Never crash the IPC loop on one bad frame: report and continue.
            sys.stderr.write(f"[pytorch/ort] Inference error: {exc}\n")
            sys.stderr.flush()
            result = {"detections": [], "error": str(exc), "inferenceMs": 0}

        write_result(stdout_binary, result)


if __name__ == "__main__":
    main()