@camstack/addon-detection-pipeline 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1088 @@
1
+ #!/usr/bin/env python3
2
+ """Async inference pool — request_id multiplexed, per-runtime concurrency.
3
+
4
+ Architecture mirrors Scrypted's ML plugins (coreml / openvino / onnx):
5
+ - asyncio main loop reads requests from stdin; inference is dispatched
6
+ onto a runtime-specific executor so the reader never blocks.
7
+ - Each request carries a 32-bit id; responses are tagged with the same
8
+ id so the Node side can keep N requests in flight concurrently.
9
+ - Runtime executors: each dispatcher owns a prepare pool and a
+ predict pool, both sized by the `concurrency` setting (see
+ RuntimeDispatcher).
+ CoreML → one predict worker is typically enough; the ANE is
+ single-context, and a single Python thread avoids GIL
+ thrashing.
+ OpenVINO → one predict worker driving the compiled model (the
+ OV runtime manages internal infer-request parallelism).
+ ONNX → benefits from concurrency > 1; InferenceSession.run is
+ thread-safe, so the predict workers share one session.
17
+ - Optional raw-frame path skips JPEG decode when the caller already
18
+ has a decoded RGB/BGR/GRAY buffer (e.g. the stream-broker decoder
19
+ cap output).
20
+
21
+ Startup protocol (Node → Python, v2):
22
+ 1. [4B total_len][4B req_id=0][1B msg_type=0x00][JSON config]
23
+ config = {"runtime": ..., "models": [...], "concurrency": N?}
24
+ 2. Python responds with [4B total_len][4B req_id=0][JSON ready status].
25
+
26
+ Runtime protocol:
27
+ Request: [4B total_len][4B req_id][1B msg_type][payload]
28
+ Response: [4B total_len][4B req_id][JSON payload]
29
+
30
+ msg_type:
31
+ 0x00 — command payload = JSON
32
+ 0x01 — infer_jpeg payload = [1B model_idx][JPEG bytes]
33
+ 0x02 — infer_raw payload = [1B model_idx][4B width][4B height]
34
+ [1B fmt 0=RGB,1=BGR,2=GRAY][pixels]
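+ 0x03 — infer_batch payload = [1B model_idx][1B count][N × item]
+ 0x04 — cache_frame payload = [4B frame_id][4B width][4B height]
+ [1B fmt][pixels]
+ 0x05 — infer_cached payload = [1B model_idx][4B frame_id]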
35
+
36
+ Commands: load, unload, replace, status, uncache_frame.
37
+ """
38
+ from __future__ import annotations
39
+
40
+ import asyncio
41
+ import concurrent.futures
42
+ import io
43
+ import json
44
+ import struct
+ import sys
48
+ import time
49
+ from dataclasses import dataclass, field
50
+ from typing import Any, Callable, Optional
51
+
52
+ import numpy as np
53
+ from PIL import Image
54
+
55
+ from postprocessors import POSTPROCESSORS
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Wire protocol constants
59
+ # ---------------------------------------------------------------------------
60
+
61
+ MSG_COMMAND = 0x00
62
+ MSG_INFER_JPEG = 0x01
63
+ MSG_INFER_RAW = 0x02
64
+ # MSG_INFER_BATCH — N raw items packed into a single IPC frame.
65
+ # Wire format:
66
+ # [1B model_idx][1B count][N × item]
67
+ # Each item:
68
+ # [4B width][4B height][1B fmt][4B size][size bytes raw]
69
+ # Response payload:
70
+ # {"results": [<single-detect dict>, ...]} (same length as count)
71
+ # Each item is dispatched concurrently via asyncio.gather so the
72
+ # predict pool's existing parallelism applies; the saving over N
73
+ # separate calls is that N IPC round-trips collapse into one,
74
+ # matching Scrypted's batch=4 semantics for fair benchmarking.
75
+ MSG_INFER_BATCH = 0x03
76
+ MSG_CACHE_FRAME = 0x04
77
+ MSG_INFER_CACHED = 0x05
78
+
79
+ RAW_FMT_RGB = 0x00
80
+ RAW_FMT_BGR = 0x01
81
+ RAW_FMT_GRAY = 0x02
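+ # Illustrative infer_raw framing (little-endian throughout, matching the
+ # "<II" packing used by ResponseWriter below): a 640×480 BGR frame for
+ # model 0 would be sent as
+ #
+ #     body = struct.pack("<BIIB", 0, 640, 480, RAW_FMT_BGR) + pixels
+ #     frame = struct.pack("<IIB", 4 + 1 + len(body), req_id, MSG_INFER_RAW) + body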
82
+
83
+
84
+ # ---------------------------------------------------------------------------
85
+ # Preprocessing (unchanged from v1 — same shapes/math, moved into helpers)
86
+ # ---------------------------------------------------------------------------
87
+
88
+
89
+ def letterbox_image(img: Image.Image, size: int) -> tuple[Image.Image, float, tuple[int, int]]:
90
+ """Resize with letterbox padding; returns (PIL canvas, scale, (padX, padY)).
91
+
92
+ Stays in uint8 PIL space — no float conversion. Used by the imageType
93
+ fast path where CoreML's predict() accepts a PIL Image directly and
94
+ handles the BGR/normalize/CVPixelBuffer dance internally.
95
+ """
96
+ w, h = img.size
97
+ scale = size / max(w, h)
98
+ new_w = int(w * scale)
99
+ new_h = int(h * scale)
100
+ pad_x = (size - new_w) // 2
101
+ pad_y = (size - new_h) // 2
102
+ if w == size and h == size:
103
+ # Already at target size — skip resize + paste + alloc entirely.
104
+ return img, 1.0, (0, 0)
105
+ resized = img.resize((new_w, new_h), Image.BILINEAR)
106
+ canvas = Image.new("RGB", (size, size), (114, 114, 114))
107
+ canvas.paste(resized, (pad_x, pad_y))
108
+ return canvas, scale, (pad_x, pad_y)
109
+
110
+
111
+ def letterbox(img: Image.Image, size: int) -> tuple[np.ndarray, float, tuple[int, int]]:
112
+ """Resize with letterbox padding; returns (array[0..1] HWC float32, scale, (padX, padY)).
113
+
114
+ Used by multiArrayType inputs where the model expects a normalized
115
+ float tensor. Does the uint8→float32/255 conversion exactly once.
116
+ """
117
+ canvas, scale, pad = letterbox_image(img, size)
118
+ arr = np.array(canvas, dtype=np.float32) / 255.0
119
+ return arr, scale, pad
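+ # Worked example of the letterbox math: a 1920×1080 frame at size=640
+ # gives scale = 640/1920 ≈ 0.333, a 640×360 resize, and pad = (0, 140);
+ # the (114, 114, 114) grey fills the 140px bands above and below.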
120
+
121
+
122
+ def resize_image(img: Image.Image, width: int, height: int) -> np.ndarray:
123
+ resized = img.resize((width, height), Image.BILINEAR)
124
+ return np.array(resized, dtype=np.float32) / 255.0
125
+
126
+
127
+ def decode_jpeg(jpeg: bytes) -> Image.Image:
128
+ return Image.open(io.BytesIO(jpeg)).convert("RGB")
129
+
130
+
131
+ def wrap_raw(raw: bytes, width: int, height: int, fmt: int) -> Image.Image:
132
+ """Zero-copy (via PIL frombuffer) wrap of a raw frame buffer."""
133
+ if fmt == RAW_FMT_GRAY:
134
+ img = Image.frombuffer("L", (width, height), raw, "raw", "L", 0, 1)
135
+ return img.convert("RGB")
136
+ mode = "BGR" if fmt == RAW_FMT_BGR else "RGB"
137
+ img = Image.frombuffer("RGB", (width, height), raw, "raw", mode, 0, 1)
138
+ return img
139
+
140
+
141
+ # ---------------------------------------------------------------------------
142
+ # Model slot + runtime-specific loading
143
+ # ---------------------------------------------------------------------------
144
+
145
+
146
+ @dataclass
147
+ class ModelSlot:
148
+ model: Any = None
149
+ config: dict = field(default_factory=dict)
150
+ loaded: bool = False
151
+ predict_fn: Optional[Callable[[dict], dict]] = None
152
+ # Atomic snapshot of model.spec input names; used by both single
153
+ # and batched predict paths to filter input_dicts against the
154
+ # CURRENTLY-loaded model. Defends against the race where an in-
155
+ # flight preprocess holds a reference to the previously-loaded
156
+ # model's `_input_names` and adds keys (iouThreshold,
157
+ # confidenceThreshold) that the new model rejects.
158
+ input_names: frozenset = field(default_factory=frozenset)
159
+
160
+
161
+ _runtime: str = ""
162
+ _runtime_lib: Any = None
163
+
164
+
165
+ def _init_runtime(runtime: str) -> None:
166
+ global _runtime, _runtime_lib
167
+ _runtime = runtime
168
+ if runtime == "coreml":
169
+ import coremltools as ct
170
+ _runtime_lib = ct
171
+ elif runtime == "openvino":
172
+ from openvino.runtime import Core
173
+ _runtime_lib = Core()
174
+ elif runtime == "onnxruntime":
175
+ import onnxruntime as ort
176
+ _runtime_lib = ort
177
+ else:
178
+ raise ValueError(f"Unknown runtime: {runtime}")
179
+
180
+
181
+ def _load_model(slot: ModelSlot, config: dict) -> None:
182
+ """Load a model into a slot using the active runtime. Thin adapter per backend."""
183
+ slot.config = dict(config)
184
+ path = config["path"]
185
+
186
+ if _runtime == "coreml":
187
+ ct = _runtime_lib
188
+ device = config.get("device", "all")
189
+ compute_map = {
190
+ "cpu": ct.ComputeUnit.CPU_ONLY,
191
+ "gpu": ct.ComputeUnit.CPU_AND_GPU,
192
+ "ane": ct.ComputeUnit.CPU_AND_NE,
193
+ "all": ct.ComputeUnit.ALL,
194
+ }
195
+ model = ct.models.MLModel(path, compute_units=compute_map.get(device, ct.ComputeUnit.ALL))
196
+ spec = model.get_spec()
197
+ input_spec = spec.description.input[0]
198
+ slot.config["_input_name"] = input_spec.name
199
+ slot.config["_input_type"] = input_spec.type.WhichOneof("Type")
200
+ slot.config["_input_names"] = [i.name for i in spec.description.input]
201
+ if slot.config["_input_type"] == "multiArrayType":
202
+ slot.config["_input_shape"] = list(input_spec.type.multiArrayType.shape)
203
+ # Detect batch flexibility — when the model's first axis is
204
+ # declared as a `shapeRange` (RangeDim at export time), we
205
+ # can stack N preprocessed frames into a single (N,3,H,W)
206
+ # tensor and issue ONE predict call instead of iterating
207
+ # the list-of-dicts path. One ANE dispatch per batch.
208
+ mat = input_spec.type.multiArrayType
209
+ if mat.WhichOneof("ShapeFlexibility") == "shapeRange" and len(mat.shapeRange.sizeRanges) > 0:
210
+ batch_axis = mat.shapeRange.sizeRanges[0]
211
+ if batch_axis.upperBound > 1:
212
+ slot.config["_supports_batch"] = True
213
+ slot.config["_max_batch"] = int(batch_axis.upperBound)
214
+ slot.model = model
215
+ slot.input_names = frozenset(i.name for i in spec.description.input)
216
+ # Filter via slot.input_names atomically read from the CURRENT
217
+ # slot. During a model `replace`, in-flight preprocess calls
218
+ # may have added stale keys (iouThreshold, confidenceThreshold)
219
+ # for the prior model that the new one rejects.
220
+ def _predict(inp: dict, _slot: ModelSlot = slot) -> dict:
221
+ names = _slot.input_names
222
+ filtered = {k: v for k, v in inp.items() if k in names} if names else inp
223
+ return _slot.model.predict(filtered)
224
+ slot.predict_fn = _predict
225
+
226
+ elif _runtime == "openvino":
227
+ core = _runtime_lib
228
+ ov_device = config.get("device", "AUTO").upper()
229
+ compiled = core.compile_model(path, device_name=ov_device)
230
+ output_layers = [compiled.output(i) for i in range(len(compiled.outputs))]
231
+ output_names = [o.get_any_name() for o in compiled.outputs]
232
+
233
+ def predict(inp_dict: dict) -> dict:
234
+ inp = list(inp_dict.values())[0]
235
+ result = compiled(inp)
236
+ return {name: result[layer] for name, layer in zip(output_names, output_layers)}
237
+
238
+ slot.model = compiled
239
+ slot.predict_fn = predict
240
+
241
+ elif _runtime == "onnxruntime":
242
+ ort = _runtime_lib
243
+ ort_device = config.get("device", "cpu")
244
+ if ort_device == "cuda":
245
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
246
+ elif ort_device == "coreml":
247
+ providers = ["CoreMLExecutionProvider", "CPUExecutionProvider"]
248
+ else:
249
+ providers = ["CPUExecutionProvider"]
250
+ session = ort.InferenceSession(path, providers=providers)
251
+ output_names = [o.name for o in session.get_outputs()]
252
+ slot.config["_input_name"] = session.get_inputs()[0].name
253
+
254
+ def predict(inp_dict: dict) -> dict:
255
+ outputs = session.run(output_names, inp_dict)
256
+ return {name: out for name, out in zip(output_names, outputs)}
257
+
258
+ slot.model = session
259
+ slot.predict_fn = predict
260
+
261
+ slot.loaded = True
262
+
263
+
264
+ def _unload_model(slot: ModelSlot) -> None:
265
+ slot.model = None
266
+ slot.predict_fn = None
267
+ slot.input_names = frozenset()
268
+ slot.config = {}
269
+ slot.loaded = False
270
+
271
+
272
+ # ---------------------------------------------------------------------------
273
+ # Preprocess + run_inference — pure sync helpers called from worker threads
274
+ # ---------------------------------------------------------------------------
275
+
276
+
277
+ # Preprocess cache for bench frames only. Keyed by (_bench_frame_id, inputSize)
278
+ # where _bench_frame_id is a stable integer tag set on PIL Images stored in
279
+ # frame_cache. Camera frames don't have this tag → no cache → correct behavior.
280
+ # Avoids the CPython id() reuse problem that caused stale detections.
281
+ _bench_preprocess_cache: dict[tuple[int, int], tuple[dict, float, tuple[int, int]]] = {}
282
+
283
+ def _preprocess(img: Image.Image, config: dict) -> tuple[dict, float, tuple[int, int]]:
284
+ input_size = config.get("inputSize", 640)
285
+ # Bench frames have _bench_frame_id tag → use preprocess cache
286
+ bench_fid = getattr(img, '_bench_frame_id', None)
287
+ if bench_fid is not None:
288
+ cache_key = (bench_fid, input_size)
289
+ cached = _bench_preprocess_cache.get(cache_key)
290
+ if cached is not None:
291
+ return cached
292
+ preprocess_mode = config.get("preprocessMode", "letterbox")
293
+ input_dict: dict = {}
294
+ if _runtime == "coreml":
295
+ input_name = config.get("_input_name", "image")
296
+ input_type = config.get("_input_type", "imageType")
297
+ input_shape = config.get("_input_shape")
298
+ if input_type == "imageType":
299
+ # FAST PATH — stay in uint8 PIL space. CoreML's predict()
300
+ # accepts a PIL Image directly and handles the
301
+ # BGR/normalize/CVPixelBuffer dance internally on its own
302
+ # (zero-copy when the size matches the model input). Going
303
+ # via numpy float32 → /255 → ×255 → astype(uint8) → Image.
304
+ # fromarray() costs ~16ms per 640×640 frame on M3 Pro and is
305
+ # PURE WASTE for imageType inputs.
306
+ if preprocess_mode == "letterbox":
307
+ canvas, scale_val, pad = letterbox_image(img, input_size)
308
+ elif img.size == (input_size, input_size):
309
+ canvas, scale_val, pad = img, 1.0, (0, 0)
310
+ else:
311
+ canvas, scale_val, pad = img.resize((input_size, input_size), Image.BILINEAR), 1.0, (0, 0)
312
+ input_dict[input_name] = canvas
313
+ arr = None # not used downstream for imageType — postprocess runs on predictions
314
+ elif input_shape is not None and len(input_shape) == 4:
315
+ if preprocess_mode == "letterbox":
316
+ arr, scale_val, pad = letterbox(img, input_size)
317
+ else:
318
+ arr = resize_image(img, input_size, input_size)
319
+ scale_val = 1.0
320
+ pad = (0, 0)
321
+ _, _, _, w_or_c = input_shape
322
+ if w_or_c in (1, 3):
323
+ input_arr = arr[np.newaxis].astype(np.float32)
324
+ else:
325
+ input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
326
+ if input_shape[1] == 1 and input_shape[2] != input_shape[3]:
327
+ gray = np.mean(arr, axis=2)
328
+ target_h, target_w = input_shape[2], input_shape[3]
329
+ gray_img = Image.fromarray((gray * 255).astype(np.uint8), mode="L")
330
+ gray_img = gray_img.resize((target_w, target_h), Image.BILINEAR)
331
+ input_arr = np.array(gray_img, dtype=np.float32)[np.newaxis, np.newaxis] / 255.0
332
+ input_dict[input_name] = input_arr
333
+ else:
+ # No usable shape metadata; fall back to a letterboxed float
+ # CHW tensor so arr/scale_val/pad are always bound on this path.
+ arr, scale_val, pad = letterbox(img, input_size)
+ input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
+ input_dict[input_name] = input_arr
336
+
337
+ input_names = config.get("_input_names", [])
338
+ if "iouThreshold" in input_names:
339
+ input_dict["iouThreshold"] = 0.45
340
+ if "confidenceThreshold" in input_names:
341
+ input_dict["confidenceThreshold"] = config.get("confidence", 0.25)
342
+ else:
343
+ # OpenVINO / ONNX — always need float CHW tensor.
344
+ if preprocess_mode == "letterbox":
345
+ arr, scale_val, pad = letterbox(img, input_size)
346
+ else:
347
+ arr = resize_image(img, input_size, input_size)
348
+ scale_val = 1.0
349
+ pad = (0, 0)
350
+ input_name = config.get("_input_name", "images")
351
+ input_arr = arr.transpose(2, 0, 1)[np.newaxis].astype(np.float32)
352
+ input_dict[input_name] = input_arr
353
+
354
+ result = (input_dict, scale_val, pad)
355
+ if bench_fid is not None:
356
+ _bench_preprocess_cache[(bench_fid, input_size)] = result
357
+ return result
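+ # Shape sketch (assuming a YOLO-style model): for ONNX with _input_name
+ # "images" and inputSize 640, _preprocess returns
+ # ({"images": float32 ndarray of shape (1, 3, 640, 640)}, scale, (padX, padY));
+ # for a CoreML imageType input the dict instead holds the 640×640 PIL canvas.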
358
+
359
+
360
+ def _postprocess(
361
+ predictions: dict,
362
+ slot_config: dict,
363
+ orig_w: int,
364
+ orig_h: int,
365
+ scale_val: float,
366
+ pad: tuple[int, int],
367
+ elapsed_ms: float,
368
+ preprocess_ms: float = 0.0,
369
+ predict_ms: float = 0.0,
370
+ batch_size: int = 1,
371
+ ) -> dict:
372
+ postprocessor_type = slot_config.get("postprocessor", "yolo")
373
+ postprocessor_fn = POSTPROCESSORS.get(postprocessor_type)
374
+ if postprocessor_fn is None:
375
+ raise ValueError(f"Unknown postprocessor: {postprocessor_type}")
376
+ result = postprocessor_fn(predictions, slot_config, orig_w, orig_h, scale_val, pad)
377
+ result["inferenceMs"] = round(elapsed_ms, 2)
378
+ result["preprocessMs"] = round(preprocess_ms, 2)
379
+ result["predictMs"] = round(predict_ms, 2)
380
+ result["batchSize"] = batch_size
381
+ result["frameSize"] = f"{orig_w}x{orig_h}"
382
+ return result
383
+
384
+
385
+ # ---------------------------------------------------------------------------
386
+ # Runtime dispatcher — pipelined prepare → predict → postprocess
387
+ # ---------------------------------------------------------------------------
388
+
389
+
390
+ class RuntimeDispatcher:
391
+ """Split-stage dispatcher that lets preprocess run ahead of predict.
392
+
393
+ Stages use separate thread pools so an in-flight `predict` doesn't
394
+ block the preprocess for the next frame:
395
+
396
+ camera A: preprocess → predict ─┐
+ camera B: preprocess → predict ─┼─ shared predict_pool
+ camera C: preprocess → predict ─┘
399
+
400
+ While predict for A runs on the predict_pool, preprocess for B can
401
+ run on the prepare_pool; each stage releases the Python event loop
402
+ between calls so the reader stays responsive.
403
+
404
+ `concurrency` sets the prepare pool size AND the predict pool size.
405
+ For CoreML/OpenVINO the runtime itself serialises access to the
406
+ device, so a single predict worker is usually enough. ONNX Runtime
407
+ benefits from >1 (its InferenceSession.run is thread-safe and the
408
+ native thread pool schedules concurrent runs).
409
+ """
410
+
411
+ def __init__(self, runtime: str, concurrency: int) -> None:
412
+ workers = max(1, concurrency)
413
+ self._workers = workers
414
+ self._prepare_pool = concurrent.futures.ThreadPoolExecutor(
415
+ max_workers=workers,
416
+ thread_name_prefix=f"{runtime}-prep",
417
+ )
418
+ self._predict_pool = concurrent.futures.ThreadPoolExecutor(
419
+ max_workers=workers,
420
+ thread_name_prefix=f"{runtime}-predict",
421
+ )
422
+
423
+ @property
424
+ def workers(self) -> int:
425
+ return self._workers
426
+
427
+ async def run(self, slot: ModelSlot, img: Image.Image) -> dict:
428
+ loop = asyncio.get_running_loop()
429
+ t_start = time.perf_counter()
430
+ orig_w, orig_h = img.size
431
+ # Stage 1 — preprocess. Runs on prepare_pool; overlaps with
432
+ # any predict already scheduled for a previous frame.
433
+ input_dict, scale_val, pad = await loop.run_in_executor(
434
+ self._prepare_pool, _preprocess, img, slot.config,
435
+ )
436
+ t_predict_start = time.perf_counter()
437
+ preprocess_ms = (t_predict_start - t_start) * 1000
438
+ # Stage 2 — predict. Dedicated pool so preprocessors for later
439
+ # frames can run while the runtime works on this one.
440
+ predictions = await loop.run_in_executor(
441
+ self._predict_pool, slot.predict_fn, input_dict,
442
+ )
443
+ # Stage 3 — postprocess. Cheap CPU-bound step; reuse the prepare
444
+ # pool to avoid tying up a predict worker on scalar math.
445
+ t_predict_end = time.perf_counter()
446
+ predict_ms = (t_predict_end - t_predict_start) * 1000
447
+ elapsed_ms = (t_predict_end - t_start) * 1000
448
+ return await loop.run_in_executor(
449
+ self._prepare_pool,
450
+ _postprocess,
451
+ predictions, slot.config, orig_w, orig_h, scale_val, pad, elapsed_ms,
452
+ preprocess_ms, predict_ms, 1,
453
+ )
454
+
455
+ async def run_cached(
456
+ self,
457
+ slot: ModelSlot,
458
+ img: Image.Image,
459
+ cache_key: tuple[int, int],
460
+ prep_cache: dict[tuple[int, int], tuple[dict, float, tuple[int, int], int, int]],
461
+ ) -> dict:
462
+ """Like run() but caches the preprocessed input dict. Second+
463
+ calls for the same (frame_id, model_idx) skip preprocess entirely
464
+ — goes straight to predict + postprocess. Saves ~10ms/call."""
465
+ loop = asyncio.get_running_loop()
466
+ t_start = time.perf_counter()
467
+ orig_w, orig_h = img.size
468
+
469
+ cached = prep_cache.get(cache_key)
470
+ if cached is not None:
471
+ input_dict, scale_val, pad, _, _ = cached
472
+ preprocess_ms = 0.0
473
+ else:
474
+ input_dict, scale_val, pad = await loop.run_in_executor(
475
+ self._prepare_pool, _preprocess, img, slot.config,
476
+ )
477
+ preprocess_ms = (time.perf_counter() - t_start) * 1000
478
+ prep_cache[cache_key] = (input_dict, scale_val, pad, orig_w, orig_h)
479
+
480
+ t_predict_start = time.perf_counter()
481
+ predictions = await loop.run_in_executor(
482
+ self._predict_pool, slot.predict_fn, input_dict,
483
+ )
484
+ t_predict_end = time.perf_counter()
485
+ predict_ms = (t_predict_end - t_predict_start) * 1000
486
+ elapsed_ms = (t_predict_end - t_start) * 1000
487
+ return await loop.run_in_executor(
488
+ self._prepare_pool,
489
+ _postprocess,
490
+ predictions, slot.config, orig_w, orig_h, scale_val, pad, elapsed_ms,
491
+ preprocess_ms, predict_ms, 1,
492
+ )
493
+
494
+ async def run_list(self, slot: ModelSlot, imgs: list[Image.Image]) -> list[dict]:
495
+ """Batch predict path — single CoreML/OV/ORT predict call processing
496
+ all N items, then split + per-item postprocess.
497
+
498
+ For CoreML: passes a list of input dicts to `model.predict(...)`,
499
+ which CoreML iterates internally with shared session state — saves
500
+ per-call ANE setup overhead vs a thread-pool fan-out.
501
+
502
+ For OpenVINO/ONNX: falls back to a loop on the same predict thread
503
+ (no model-level batching since their predict_fn signatures take one
504
+ array). Still saves the asyncio.gather + ThreadPoolExecutor dispatch
505
+ overhead.
506
+ """
507
+ if not imgs:
508
+ return []
509
+ loop = asyncio.get_running_loop()
510
+ t_start = time.perf_counter()
511
+ orig_sizes = [img.size for img in imgs]
512
+ batch_size = len(imgs)
513
+
514
+ # Stage 1 — preprocess all in parallel on the prepare pool.
515
+ prepared = await asyncio.gather(*[
516
+ loop.run_in_executor(self._prepare_pool, _preprocess, img, slot.config)
517
+ for img in imgs
518
+ ])
519
+ # prepared = list of (input_dict, scale_val, pad)
520
+ t_predict_start = time.perf_counter()
521
+ preprocess_ms = (t_predict_start - t_start) * 1000
522
+
523
+ # Stage 2 — single predict call on the predict thread.
524
+ runtime = _runtime
525
+ supports_batch = bool(slot.config.get("_supports_batch"))
526
+ max_batch = int(slot.config.get("_max_batch", 1))
527
+ if runtime == "coreml" and supports_batch and batch_size <= max_batch:
528
+ # FAST PATH — stack the N preprocessed (1,3,H,W) tensors
529
+ # into a single (N,3,H,W) tensor and issue ONE predict call.
530
+ # The model was exported with a flexible batch axis so this
531
+ # produces a single ANE dispatch processing all N frames in
532
+ # parallel — amortises CoreML session setup AND the per-call
533
+ # compute_units context-switch.
534
+ input_name = slot.config.get("_input_name", "image")
535
+ stacked = np.concatenate([p[0][input_name] for p in prepared], axis=0)
536
+
537
+ def stacked_predict() -> list[dict]:
538
+ names = slot.input_names
539
+ payload: dict = {input_name: stacked}
540
+ # Some models accept side-channel inputs (iouThreshold
541
+ # etc.). Forward only those that the current model
542
+ # accepts; the stacked input is set above.
543
+ first = prepared[0][0]
544
+ for k, v in first.items():
545
+ if k != input_name and (not names or k in names):
546
+ payload[k] = v
547
+ out = slot.model.predict(payload)
548
+ # Split each output along axis 0 back into N items.
549
+ # Output shape (N, ...); slice per index and rebuild
550
+ # per-item dicts so postprocess sees the same shape it
551
+ # would for a single-frame predict.
552
+ results: list[dict] = []
553
+ for i in range(batch_size):
554
+ item: dict = {}
555
+ for k, v in out.items():
556
+ if hasattr(v, "shape") and len(v.shape) > 0 and v.shape[0] == batch_size:
557
+ item[k] = v[i:i + 1]
558
+ else:
559
+ item[k] = v
560
+ results.append(item)
561
+ return results
562
+
563
+ predictions_list = await loop.run_in_executor(self._predict_pool, stacked_predict)
564
+ elif runtime == "coreml":
565
+ # Fallback — model.predict([list of input dicts]) iterates
566
+ # internally on shared session state. Not as fast as stacked
567
+ # but still saves the asyncio.gather + ThreadPoolExecutor
568
+ # dispatch overhead vs N separate calls.
569
+ def batched_predict() -> list[dict]:
570
+ names = slot.input_names
571
+ if names:
572
+ cleaned = [{k: v for k, v in p[0].items() if k in names} for p in prepared]
573
+ else:
574
+ cleaned = [p[0] for p in prepared]
575
+ return slot.model.predict(cleaned)
576
+
577
+ predictions_list = await loop.run_in_executor(self._predict_pool, batched_predict)
578
+ else:
579
+ # OpenVINO / ONNX — loop on the predict thread to keep IPC
580
+ # amortisation (one executor dispatch instead of N) without
581
+ # requiring model-level batched inputs.
582
+ def looped_predict() -> list[dict]:
583
+ return [slot.predict_fn(p[0]) for p in prepared]
584
+
585
+ predictions_list = await loop.run_in_executor(self._predict_pool, looped_predict)
586
+
587
+ # Stage 3 — postprocess each result. Per-item elapsed shares the
588
+ # batch wall (fair attribution) — callers see it as `inferenceMs`.
589
+ t_predict_end = time.perf_counter()
590
+ predict_ms = (t_predict_end - t_predict_start) * 1000
591
+ elapsed_ms = (t_predict_end - t_start) * 1000
592
+ return await asyncio.gather(*[
593
+ loop.run_in_executor(
594
+ self._prepare_pool,
595
+ _postprocess,
596
+ predictions, slot.config,
597
+ orig_sizes[i][0], orig_sizes[i][1],
598
+ prepared[i][1], prepared[i][2],
599
+ elapsed_ms,
600
+ preprocess_ms, predict_ms, batch_size,
601
+ )
602
+ for i, predictions in enumerate(predictions_list)
603
+ ])
604
+
605
+ def close(self) -> None:
606
+ self._prepare_pool.shutdown(wait=False)
607
+ self._predict_pool.shutdown(wait=False)
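+ # Minimal usage sketch (illustrative; the real driver is _run() below,
+ # with `slot` a loaded ModelSlot and `img` a PIL Image):
+ #
+ #     dispatcher = RuntimeDispatcher("onnxruntime", concurrency=2)
+ #     result = await dispatcher.run(slot, img)  # inside a coroutine
+ #     print(result["inferenceMs"], len(result.get("detections", [])))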
608
+
609
+
610
+ # ---------------------------------------------------------------------------
611
+ # IPC — binary framing with request_id multiplexing
612
+ # ---------------------------------------------------------------------------
613
+
614
+
615
+ HEADER_LEN = 4 # total_len prefix
616
+ PREFIX_LEN = HEADER_LEN + 4 + 1 # total_len + req_id + msg_type
617
+
618
+
619
+ async def _read_exact(reader: asyncio.StreamReader, n: int) -> bytes:
620
+ return await reader.readexactly(n)
621
+
622
+
623
+ class ResponseWriter:
624
+ """Serialises writes to the stdout pipe; safe from multiple coroutines."""
625
+
626
+ def __init__(self, writer: asyncio.StreamWriter) -> None:
627
+ self._writer = writer
628
+ self._lock = asyncio.Lock()
629
+
630
+ async def send(self, req_id: int, payload: dict) -> None:
631
+ # numpy scalars (float32 / int64 / bool_) are NOT JSON-serializable
632
+ # under the stdlib encoder. With numpy 2.x the previous lenient
633
+ # path is gone, so we must coerce before encoding. Cheap default
634
+ # callback only kicks in for unhandled types — Python natives skip
635
+ # it entirely.
636
+ data = json.dumps(payload, default=_json_default).encode("utf-8")
637
+ total_len = 4 + len(data) # req_id(4) + json
638
+ header = struct.pack("<II", total_len, req_id)
639
+ async with self._lock:
640
+ self._writer.write(header)
641
+ self._writer.write(data)
642
+ await self._writer.drain()
643
+
644
+
645
+ def _json_default(obj: Any) -> Any:
646
+ """Coerce numpy scalars/arrays to JSON-friendly Python natives."""
647
+ if isinstance(obj, np.generic):
648
+ return obj.item()
649
+ if isinstance(obj, np.ndarray):
650
+ return obj.tolist()
651
+ raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
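+ # e.g. json.dumps({"score": np.float32(0.91)}, default=_json_default)
+ # emits a plain float (the float32 widened via .item(), ≈0.91000002...)
+ # where the stdlib encoder alone would raise TypeError.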
652
+
653
+
654
+ # ---------------------------------------------------------------------------
655
+ # Command handlers
656
+ # ---------------------------------------------------------------------------
657
+
658
+
659
+ def _handle_command(models: list[ModelSlot], cmd: dict) -> dict:
660
+ action = cmd.get("cmd")
661
+ if action == "load":
662
+ index = cmd["index"]
663
+ config = cmd["config"]
664
+ while len(models) <= index:
665
+ models.append(ModelSlot())
666
+ slot = models[index]
667
+ if slot.loaded:
668
+ _unload_model(slot)
669
+ try:
670
+ t0 = time.perf_counter()
671
+ _load_model(slot, config)
672
+ load_ms = round((time.perf_counter() - t0) * 1000)
673
+ sys.stderr.write(f"Model {index} loaded: {config['path']} ({load_ms}ms)\n")
674
+ sys.stderr.flush()
675
+ return {"cmd": "load", "index": index, "status": "ok", "loadMs": load_ms}
676
+ except Exception as exc:
677
+ return {"cmd": "load", "index": index, "status": "error", "error": str(exc)}
678
+
679
+ if action == "unload":
680
+ index = cmd["index"]
681
+ if index < len(models) and models[index].loaded:
682
+ _unload_model(models[index])
683
+ sys.stderr.write(f"Model {index} unloaded\n")
684
+ sys.stderr.flush()
685
+ return {"cmd": "unload", "index": index, "status": "ok"}
686
+
687
+ if action == "replace":
688
+ index = cmd["index"]
689
+ config = cmd["config"]
690
+ while len(models) <= index:
691
+ models.append(ModelSlot())
692
+ slot = models[index]
693
+ if slot.loaded:
694
+ _unload_model(slot)
695
+ try:
696
+ t0 = time.perf_counter()
697
+ _load_model(slot, config)
698
+ load_ms = round((time.perf_counter() - t0) * 1000)
699
+ sys.stderr.write(f"Model {index} replaced: {config['path']} ({load_ms}ms)\n")
700
+ sys.stderr.flush()
701
+ return {"cmd": "replace", "index": index, "status": "ok", "loadMs": load_ms}
702
+ except Exception as exc:
703
+ return {"cmd": "replace", "index": index, "status": "error", "error": str(exc)}
704
+
705
+ if action == "status":
706
+ status = []
707
+ for i, slot in enumerate(models):
708
+ status.append({
709
+ "index": i,
710
+ "path": slot.config.get("path") if slot.loaded else None,
711
+ "loaded": slot.loaded,
712
+ "postprocessor": slot.config.get("postprocessor") if slot.loaded else None,
713
+ })
714
+ return {"cmd": "status", "models": status}
715
+
716
+ return {"cmd": action or "unknown", "status": "error", "error": f"Unknown command: {action}"}
717
+
718
+
719
+ # ---------------------------------------------------------------------------
720
+ # Main async loop
721
+ # ---------------------------------------------------------------------------
722
+
723
+
724
+ async def _run() -> None:
725
+ loop = asyncio.get_running_loop()
726
+ reader = asyncio.StreamReader(loop=loop)
727
+ protocol = asyncio.StreamReaderProtocol(reader, loop=loop)
728
+ await loop.connect_read_pipe(lambda: protocol, sys.stdin.buffer)
729
+
730
+ transport, stdout_protocol = await loop.connect_write_pipe(
731
+ asyncio.streams.FlowControlMixin, sys.stdout.buffer,
732
+ )
733
+ stream_writer = asyncio.StreamWriter(transport, stdout_protocol, None, loop)
734
+ writer = ResponseWriter(stream_writer)
735
+
736
+ t_start = time.perf_counter()
737
+
738
+ # ── Startup — first message is config (req_id=0, msg_type=COMMAND) ──
739
+ header = await _read_exact(reader, PREFIX_LEN)
740
+ total_len, req_id = struct.unpack("<II", header[0:8])
741
+ msg_type = header[8]
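+ # total_len counts req_id(4) + msg_type(1) + payload; req_id and
+ # msg_type were already consumed by the 9-byte PREFIX_LEN read,
+ # hence the -5 below.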
742
+ remaining = total_len - 5
743
+ payload = await _read_exact(reader, remaining)
744
+ if msg_type != MSG_COMMAND:
745
+ sys.stderr.write(f"Expected startup config (cmd), got msg_type={msg_type}\n")
746
+ sys.exit(1)
747
+ config = json.loads(payload)
748
+ runtime = config.get("runtime", "coreml")
749
+ concurrency = int(config.get("concurrency", 1) or 1)
750
+ batch_mode = str(config.get("batch_mode", "none"))
751
+ if batch_mode not in ("none", "list", "window"):
752
+ batch_mode = "none"
753
+ window_ms = max(0, int(config.get("window_ms", 0) or 0))
754
+ max_batch_size = max(1, int(config.get("max_batch_size", 1) or 1))
755
+
756
+ sys.stderr.write(
757
+ f"Initializing runtime: {runtime} (concurrency={concurrency}, "
758
+ f"batch_mode={batch_mode}, window_ms={window_ms}, max_batch={max_batch_size})\n"
759
+ )
760
+ sys.stderr.flush()
761
+ _init_runtime(runtime)
762
+
763
+ models: list[ModelSlot] = []
764
+ for i, mc in enumerate(config.get("models", [])):
765
+ slot = ModelSlot()
766
+ sys.stderr.write(f"Loading model {i}: {mc['path']}\n")
767
+ sys.stderr.flush()
768
+ try:
769
+ _load_model(slot, mc)
770
+ except Exception as exc:
771
+ sys.stderr.write(f"ERROR loading model {i}: {exc}\n")
772
+ sys.stderr.flush()
773
+ models.append(slot)
774
+
775
+ dispatcher = RuntimeDispatcher(runtime, concurrency)
776
+ startup_ms = round((time.perf_counter() - t_start) * 1000)
777
+ loaded_count = sum(1 for s in models if s.loaded)
778
+ sys.stderr.write(
779
+ f"Ready — {loaded_count}/{len(models)} model(s) in {startup_ms}ms "
780
+ f"(runtime={runtime}, workers={dispatcher.workers})\n"
781
+ )
782
+ sys.stderr.flush()
783
+
784
+ await writer.send(req_id, {
785
+ "status": "ready",
786
+ "models": loaded_count,
787
+ "total": len(models),
788
+ "startupMs": startup_ms,
789
+ "runtime": runtime,
790
+ "workers": dispatcher.workers,
791
+ })
792
+
793
+ # ── Window accumulator (per-model) ──────────────────────────────
794
+ #
795
+ # In `window` mode, every MSG_INFER_RAW arrival pushes (req_id, img)
796
+ # into a per-model pending queue and arms a `window_ms` timer. When
797
+ # the timer fires, OR the queue reaches `max_batch_size`, we flush
798
+ # via `dispatcher.run_list(slot, imgs)` — single batched predict.
799
+ # Concurrent inferRaw calls from N cameras coalesce into one CoreML
800
+ # predict call, amortising ANE setup.
801
+
802
+ class WindowAccumulator:
803
+ def __init__(self, model_idx: int) -> None:
804
+ self.model_idx = model_idx
805
+ self.pending: list[tuple[int, Image.Image]] = []
806
+ self.flush_task: Optional[asyncio.Task[None]] = None
807
+
808
+ async def submit(self, req_id: int, img: Image.Image) -> None:
809
+ self.pending.append((req_id, img))
810
+ if len(self.pending) >= max_batch_size:
811
+ # Reached cap — cancel any pending timer (we are flushing
812
+ # immediately from the caller's coroutine, not the timer)
813
+ # and flush now.
814
+ if self.flush_task is not None:
815
+ self.flush_task.cancel()
816
+ self.flush_task = None
817
+ await self._flush()
818
+ return
819
+ if self.flush_task is None:
820
+ self.flush_task = asyncio.create_task(self._delayed_flush())
821
+
822
+ async def _delayed_flush(self) -> None:
823
+ try:
824
+ await asyncio.sleep(window_ms / 1000.0)
825
+ except asyncio.CancelledError:
826
+ return
827
+ # Mark the timer task as complete BEFORE the dispatch await so
828
+ # a cap-triggered submit() does not cancel the running task (which would
829
+ # raise CancelledError at `await dispatcher.run_list` and drop
830
+ # responses on the floor — the symptom the TS side observed as
831
+ # `runPipeline.mutate` hanging forever).
832
+ self.flush_task = None
833
+ await self._flush()
834
+
835
+ async def _flush(self) -> None:
836
+ batch = self.pending
837
+ self.pending = []
838
+ if not batch:
839
+ return
840
+ slot = models[self.model_idx]
841
+ try:
842
+ imgs = [b[1] for b in batch]
843
+ results = await dispatcher.run_list(slot, imgs)
844
+ except Exception as exc:
845
+ sys.stderr.write(f"Window flush error (model {self.model_idx}): {exc}\n")
846
+ sys.stderr.flush()
847
+ err_payload = {
848
+ "error": str(exc),
849
+ "kind": "detections",
850
+ "detections": [],
851
+ "inferenceMs": 0,
852
+ }
853
+ for rid, _ in batch:
854
+ await writer.send(rid, err_payload)
855
+ return
856
+ for (rid, _), result in zip(batch, results):
857
+ await writer.send(rid, result)
858
+
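+ # Timing sketch: with window_ms=8 and max_batch_size=4 (numbers
+ # illustrative), four cameras submitting within the same 8ms window
+ # coalesce into a single run_list() predict; a fifth arrival after the
+ # flush starts a fresh window.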
859
+ accumulators: dict[int, WindowAccumulator] = {}
860
+
861
+ # ── Bench frame cache ──────────────────────────────────────────
862
+ # Stores raw PIL Images keyed by uint32 frame_id. The Node side
863
+ # sends MSG_CACHE_FRAME once with the full 1.2MB payload; all
864
+ # subsequent MSG_INFER_CACHED calls send only a 5-byte header
865
+ # (model_idx + frame_id). Eliminates the 35ms/call pipe transfer
866
+ # that dominates bench throughput.
867
+ frame_cache: dict[int, Image.Image] = {}
868
+
869
+ # Preprocessed-input cache: keyed by (frame_id, model_idx). First
870
+ # inferCached call preprocesses and caches; subsequent calls skip
871
+ # preprocess entirely and go straight to predict + postprocess.
872
+ # Saves ~10ms/call (the PIL resize + numpy + tensor pack cost).
873
+ _preprocess_cache: dict[tuple[int, int], tuple[dict, float, tuple[int, int], int, int]] = {}
874
+
875
+ def get_accumulator(model_idx: int) -> WindowAccumulator:
876
+ acc = accumulators.get(model_idx)
877
+ if acc is None:
878
+ acc = WindowAccumulator(model_idx)
879
+ accumulators[model_idx] = acc
880
+ return acc
881
+
882
+ # ── Main loop ───────────────────────────────────────────────────
883
+ async def handle_inference(req_id: int, img: Image.Image, model_idx: int) -> None:
884
+ try:
885
+ if model_idx >= len(models) or not models[model_idx].loaded:
886
+ await writer.send(req_id, {
887
+ "error": f"Model {model_idx} not loaded",
888
+ "kind": "detections",
889
+ "detections": [],
890
+ "inferenceMs": 0,
891
+ })
892
+ return
893
+ if batch_mode == "window":
894
+ await get_accumulator(model_idx).submit(req_id, img)
895
+ return
896
+ result = await dispatcher.run(models[model_idx], img)
897
+ await writer.send(req_id, result)
898
+ except Exception as exc:
899
+ sys.stderr.write(f"Inference error (model {model_idx}, req {req_id}): {exc}\n")
900
+ sys.stderr.flush()
901
+ await writer.send(req_id, {
902
+ "error": str(exc),
903
+ "kind": "detections",
904
+ "detections": [],
905
+ "inferenceMs": 0,
906
+ })
907
+
908
+ async def handle_batch(req_id: int, model_idx: int, items: list[Image.Image]) -> None:
909
+ if model_idx >= len(models) or not models[model_idx].loaded:
910
+ await writer.send(req_id, {
911
+ "error": f"Model {model_idx} not loaded",
912
+ "results": [],
913
+ })
914
+ return
915
+ try:
916
+ slot = models[model_idx]
917
+ if batch_mode in ("list", "window"):
918
+ # `list` mode: send the batch through dispatcher.run_list
919
+ # — single predict call, internal CoreML iteration.
920
+ # `window` mode: same dispatch path (no point queueing into
921
+ # the accumulator when caller already packed N items).
922
+ payload = await dispatcher.run_list(slot, items)
923
+ else:
924
+ tasks = [dispatcher.run(slot, img) for img in items]
925
+ results = await asyncio.gather(*tasks, return_exceptions=True)
926
+ payload = []
927
+ for r in results:
928
+ if isinstance(r, Exception):
929
+ payload.append({
930
+ "error": str(r),
931
+ "kind": "detections",
932
+ "detections": [],
933
+ "inferenceMs": 0,
934
+ })
935
+ else:
936
+ payload.append(r)
937
+ await writer.send(req_id, {"results": payload})
938
+ except Exception as exc:
939
+ sys.stderr.write(f"Batch error (model {model_idx}, req {req_id}): {exc}\n")
940
+ sys.stderr.flush()
941
+ await writer.send(req_id, {"error": str(exc), "results": []})
942
+
943
+ while True:
944
+ try:
945
+ header = await _read_exact(reader, PREFIX_LEN)
946
+ except asyncio.IncompleteReadError:
947
+ return # stdin closed
948
+ total_len, req_id = struct.unpack("<II", header[0:8])
949
+ msg_type = header[8]
950
+ remaining = total_len - 5
951
+ try:
952
+ payload = await _read_exact(reader, remaining) if remaining > 0 else b""
953
+ except asyncio.IncompleteReadError:
954
+ return
955
+
956
+ if msg_type == MSG_COMMAND:
957
+ try:
958
+ cmd = json.loads(payload)
959
+ # Handle uncache_frame inline (needs access to frame_cache)
960
+ if cmd.get("cmd") == "uncache_frame":
961
+ fid = cmd.get("frameId", -1)
962
+ frame_cache.pop(fid, None)
963
+ # Purge preprocessed tensor caches for this frame
964
+ to_del = [k for k in _preprocess_cache if k[0] == fid]
965
+ for k in to_del:
966
+ del _preprocess_cache[k]
967
+ # Also purge the bench preprocess cache
968
+ to_del2 = [k for k in _bench_preprocess_cache if k[0] == fid]
969
+ for k in to_del2:
970
+ del _bench_preprocess_cache[k]
971
+ response = {"cmd": "uncache_frame", "status": "ok", "frameId": fid}
972
+ else:
973
+ response = _handle_command(models, cmd)
974
+ except Exception as exc:
975
+ response = {"cmd": "unknown", "status": "error", "error": str(exc)}
976
+ await writer.send(req_id, response)
977
+
978
+ elif msg_type == MSG_INFER_JPEG:
979
+ if len(payload) < 1:
980
+ await writer.send(req_id, {"error": "empty infer_jpeg payload"})
981
+ continue
982
+ model_idx = payload[0]
983
+ jpeg = payload[1:]
984
+ try:
985
+ img = decode_jpeg(jpeg)
986
+ except Exception as exc:
987
+ await writer.send(req_id, {"error": f"jpeg decode failed: {exc}"})
988
+ continue
989
+ asyncio.create_task(handle_inference(req_id, img, model_idx))
990
+
991
+ elif msg_type == MSG_INFER_RAW:
992
+ if len(payload) < 10:
993
+ await writer.send(req_id, {"error": "truncated infer_raw header"})
994
+ continue
995
+ model_idx = payload[0]
996
+ width, height = struct.unpack("<II", payload[1:9])
997
+ fmt = payload[9]
998
+ raw = payload[10:]
999
+ try:
1000
+ img = wrap_raw(raw, width, height, fmt)
1001
+ except Exception as exc:
1002
+ await writer.send(req_id, {"error": f"raw wrap failed: {exc}"})
1003
+ continue
1004
+ asyncio.create_task(handle_inference(req_id, img, model_idx))
1005
+
1006
+ elif msg_type == MSG_INFER_BATCH:
1007
+ # Header: [1B model_idx][1B count]
1008
+ if len(payload) < 2:
1009
+ await writer.send(req_id, {"error": "truncated infer_batch header"})
1010
+ continue
1011
+ model_idx = payload[0]
1012
+ count = payload[1]
1013
+ offset = 2
1014
+ items: list[Image.Image] = []
1015
+ parse_err: Optional[str] = None
1016
+ for _ in range(count):
1017
+ # Per-item header: [4B w][4B h][1B fmt][4B size]
1018
+ if offset + 13 > len(payload):
1019
+ parse_err = "truncated batch item header"
1020
+ break
1021
+ width, height = struct.unpack("<II", payload[offset:offset + 8])
1022
+ fmt = payload[offset + 8]
1023
+ size = struct.unpack("<I", payload[offset + 9:offset + 13])[0]
1024
+ offset += 13
1025
+ if offset + size > len(payload):
1026
+ parse_err = "truncated batch item payload"
1027
+ break
1028
+ raw = payload[offset:offset + size]
1029
+ offset += size
1030
+ try:
1031
+ items.append(wrap_raw(raw, width, height, fmt))
1032
+ except Exception as exc:
1033
+ parse_err = f"raw wrap failed: {exc}"
1034
+ break
1035
+ if parse_err is not None:
1036
+ await writer.send(req_id, {"error": parse_err, "results": []})
1037
+ continue
1038
+ asyncio.create_task(handle_batch(req_id, model_idx, items))
1039
+
1040
+ elif msg_type == MSG_CACHE_FRAME:
1041
+ # Wire: [4B frame_id][4B width][4B height][1B fmt][raw bytes]
1042
+ if len(payload) < 13:
1043
+ await writer.send(req_id, {"error": "truncated cache_frame header"})
1044
+ continue
1045
+ fid = struct.unpack("<I", payload[0:4])[0]
1046
+ width, height = struct.unpack("<II", payload[4:12])
1047
+ fmt = payload[12]
1048
+ raw = payload[13:]
1049
+ try:
1050
+ img = wrap_raw(raw, width, height, fmt)
1051
+ img._bench_frame_id = fid # Tag for preprocess cache keying
1052
+ frame_cache[fid] = img
1053
+ await writer.send(req_id, {"status": "cached", "frameId": fid, "width": width, "height": height})
1054
+ except Exception as exc:
1055
+ await writer.send(req_id, {"error": f"cache_frame wrap failed: {exc}"})
1056
+
1057
+ elif msg_type == MSG_INFER_CACHED:
1058
+ # Wire: [1B model_idx][4B frame_id]
1059
+ # Routes through handle_inference → window accumulator for
+ # batching. After the first call, preprocess is served from
+ # _bench_preprocess_cache (cached frames carry _bench_frame_id),
+ # and batch=4 predict saves the bulk (8ms vs 4×9ms=36ms).
1062
+ if len(payload) < 5:
1063
+ await writer.send(req_id, {"error": "truncated infer_cached header"})
1064
+ continue
1065
+ model_idx = payload[0]
1066
+ fid = struct.unpack("<I", payload[1:5])[0]
1067
+ img = frame_cache.get(fid)
1068
+ if img is None:
1069
+ await writer.send(req_id, {"error": f"frame {fid} not in cache"})
1070
+ continue
1071
+ if model_idx >= len(models) or not models[model_idx].loaded:
1072
+ await writer.send(req_id, {"error": f"model {model_idx} not loaded"})
1073
+ continue
1074
+ asyncio.create_task(handle_inference(req_id, img, model_idx))
1075
+
1076
+ else:
1077
+ await writer.send(req_id, {"error": f"unknown msg_type: {msg_type}"})
1078
+
1079
+
1080
+ def main() -> None:
1081
+ try:
1082
+ asyncio.run(_run())
1083
+ except KeyboardInterrupt:
1084
+ pass
1085
+
1086
+
1087
+ if __name__ == "__main__":
1088
+ main()