camera-ui-ml 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020-2024 seydx <hi@seydx.dev>
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: camera-ui-ml
3
+ Version: 1.0.0
4
+ Summary: Shared ML inference framework for camera.ui detection plugins (onnx, openvino, coreml, ncnn)
5
+ Author-email: seydx <hi@seydx.dev>
6
+ Maintainer-email: seydx <hi@seydx.dev>
7
+ Project-URL: Homepage, https://github.com/cameraui/ml
8
+ Project-URL: Documentation, https://github.com/cameraui/ml
9
+ Project-URL: Repository, https://github.com/cameraui/ml
10
+ Project-URL: Bug Tracker, https://github.com/cameraui/ml/issues
11
+ Project-URL: Changelog, https://github.com/cameraui/ml/blob/main/CHANGELOG.md
12
+ Keywords: camera.ui,python,ml,inference,detection
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.11
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE.md
23
+ Requires-Dist: typing_extensions>=4.15.0
24
+ Requires-Dist: camera-ui-sdk>=1.0.95
25
+ Requires-Dist: numpy>=2.0
26
+ Requires-Dist: pillow>=12.0
27
+ Requires-Dist: aiohttp>=3.12
28
+ Provides-Extra: test
29
+ Requires-Dist: pytest>=8; extra == "test"
30
+ Provides-Extra: clip
31
+ Requires-Dist: transformers>=4.40; extra == "clip"
32
+ Dynamic: license-file
33
+
34
+ # camera-ui-ml
35
+
36
+ [![PyPI](https://img.shields.io/pypi/v/camera-ui-ml?label=pypi&logo=pypi&logoColor=white)](https://pypi.org/project/camera-ui-ml/)
37
+
38
+ Shared ML inference framework for camera.ui's detection plugins.
39
+
40
+ ---
41
+
42
+ _Part of the camera.ui ecosystem - A comprehensive camera management solution._
@@ -0,0 +1,9 @@
1
+ # camera-ui-ml
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/camera-ui-ml?label=pypi&logo=pypi&logoColor=white)](https://pypi.org/project/camera-ui-ml/)
4
+
5
+ Shared ML inference framework for camera.ui's detection plugins.
6
+
7
+ ---
8
+
9
+ _Part of the camera.ui ecosystem - A comprehensive camera management solution._
@@ -0,0 +1,88 @@
1
+ from __future__ import annotations
2
+
3
+ from .backend import (
4
+ DType,
5
+ InferenceBackend,
6
+ InputSpec,
7
+ Layout,
8
+ NDArray,
9
+ Normalize,
10
+ Outputs,
11
+ )
12
+ from .detectors import (
13
+ BaseDetector,
14
+ BoxDetector,
15
+ Embedder,
16
+ NormalizedDetection,
17
+ OcrResult,
18
+ ParseKind,
19
+ PlateOcr,
20
+ )
21
+ from .geometry import (
22
+ Box,
23
+ NormalizedBox,
24
+ clamp_box,
25
+ cxcywh_to_xyxy,
26
+ normalize_box,
27
+ pad_box,
28
+ scale_box,
29
+ )
30
+ from .model_manager import BaseModelManager
31
+ from .parsing import (
32
+ RawDetection,
33
+ channels_first,
34
+ decode_ocr,
35
+ l2_normalize,
36
+ nms,
37
+ parse_end2end,
38
+ parse_yolov9,
39
+ )
40
+ from .pipeline import prepare_executor, run_prepare
41
+ from .preprocess import crop_rgb, decode_image, frame_to_rgb, resize_rgb, to_pil, to_tensor
42
+
43
+ __all__ = [
44
+ # backend seam
45
+ "InferenceBackend",
46
+ "InputSpec",
47
+ "Outputs",
48
+ "NDArray",
49
+ "Layout",
50
+ "Normalize",
51
+ "DType",
52
+ # model manager
53
+ "BaseModelManager",
54
+ # detectors (ClipEncoder is opt-in via camera_ui_ml.detectors.clip)
55
+ "BaseDetector",
56
+ "BoxDetector",
57
+ "NormalizedDetection",
58
+ "ParseKind",
59
+ "Embedder",
60
+ "PlateOcr",
61
+ "OcrResult",
62
+ # geometry
63
+ "Box",
64
+ "NormalizedBox",
65
+ "cxcywh_to_xyxy",
66
+ "scale_box",
67
+ "clamp_box",
68
+ "normalize_box",
69
+ "pad_box",
70
+ # parsing
71
+ "RawDetection",
72
+ "channels_first",
73
+ "parse_yolov9",
74
+ "parse_end2end",
75
+ "decode_ocr",
76
+ "l2_normalize",
77
+ "nms",
78
+ # preprocess
79
+ "frame_to_rgb",
80
+ "resize_rgb",
81
+ "to_pil",
82
+ "to_tensor",
83
+ "crop_rgb",
84
+ "decode_image",
85
+ # pipeline
86
+ "run_prepare",
87
+ "prepare_executor",
88
+ ]
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from collections.abc import Mapping, Sequence
5
+ from dataclasses import dataclass
6
+ from typing import Any, Literal
7
+
8
+ import numpy as np
9
+
10
+ from .pipeline import run_prepare
11
+
12
+ NDArray = np.ndarray[Any, Any]
13
+
14
+ Layout = Literal["nchw", "nhwc"]
15
+ Normalize = Literal["unit", "facenet", "none"]
16
+ DType = Literal["float32", "uint8"]
17
+
18
+ # Inference outputs in model output order; detectors index `outputs[0]` for the
19
+ # primary output (OCR scans all outputs, CLIP uses the single embedding output).
20
+ Outputs = Sequence[NDArray]
21
+
22
+
23
@dataclass(frozen=True)
class InputSpec:
    """Immutable description of the input tensor a model expects.

    ``width`` and ``height`` are the target image size in pixels; the
    remaining fields select axis order, pixel scaling and element type.
    """

    # Target image size in pixels.
    width: int
    height: int
    # Axis order of the produced tensor.
    layout: Layout = "nchw"
    # Pixel scaling: "unit" = /255, "facenet" = (x-127.5)/128, "none" = raw.
    normalize: Normalize = "unit"
    # Requested element type of the tensor.
    dtype: DType = "float32"
30
+
31
+
32
class InferenceBackend(ABC):
    """Abstract wrapper around one loaded model in a concrete runtime.

    Subclasses implement ``input_size``, ``metadata``, ``infer`` and ``close``;
    ``adapt`` and ``run`` provide the shared preprocessing + inference path.
    """

    @property
    @abstractmethod
    def input_size(self) -> tuple[int, int]:
        """Model input size; detectors read it as ``(width, height)``."""

    @abstractmethod
    def metadata(self) -> Mapping[str, str]:
        """Return the model's string key/value metadata."""

    @abstractmethod
    async def infer(self, inputs: Sequence[Any]) -> Outputs:
        """Run the model on prepared ``inputs`` and return its outputs in order."""

    @abstractmethod
    def close(self) -> None:
        """Release the resources held by this backend."""

    def adapt(self, image: NDArray, spec: InputSpec) -> Sequence[Any]:
        """Turn the canonical HWC uint8 RGB image into runtime model inputs.

        The default produces a single normalized tensor described by ``spec``
        (onnx/openvino/ncnn). CoreML image-input models override this to hand
        over a PIL image instead. Executed on the shared prepare-executor (CPU).
        """
        # Imported lazily, as in the original module layout.
        from .preprocess import to_tensor

        tensor = to_tensor(image, spec)
        return [tensor]

    async def run(self, image: NDArray, spec: InputSpec) -> Outputs:
        """Preprocess ``image`` off the event loop, then run inference on it."""

        def _prepare() -> list[Any]:
            return list(self.adapt(image, spec))

        model_inputs = await run_prepare(_prepare)
        return await self.infer(model_inputs)
@@ -0,0 +1,17 @@
1
+ from __future__ import annotations
2
+
3
+ from .base import BaseDetector
4
+ from .box import BoxDetector, NormalizedDetection, ParseKind
5
+ from .embedder import Embedder
6
+ from .ocr import DEFAULT_ALPHABET, OcrResult, PlateOcr
7
+
8
+ __all__ = [
9
+ "BaseDetector",
10
+ "BoxDetector",
11
+ "NormalizedDetection",
12
+ "ParseKind",
13
+ "Embedder",
14
+ "PlateOcr",
15
+ "OcrResult",
16
+ "DEFAULT_ALPHABET",
17
+ ]
@@ -0,0 +1,57 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+
5
+ from camera_ui_sdk import LoggerService
6
+
7
+ from ..backend import InferenceBackend
8
+ from ..model_manager import BaseModelManager
9
+
10
+
11
class BaseDetector:
    """Shared lifecycle for detectors backed by a single ``InferenceBackend``.

    ``initialize`` obtains the backend from the model manager (concurrent
    callers share one task), ``close`` releases it. Initialization errors are
    logged instead of raised, so callers must check ``_ready()`` before use.
    Once closed, a detector stays closed.
    """

    name: str = "detector"

    def __init__(self, manager: BaseModelManager, logger: LoggerService) -> None:
        self.manager = manager
        self.logger = logger
        self.backend: InferenceBackend | None = None
        self.initialized = False
        self.closed = False
        self._init_task: asyncio.Task[None] | None = None

    async def initialize(self, model_name: str) -> None:
        """Load ``model_name`` unless already initialized or closed."""
        # A closed detector can never become ready (``closed`` is not reset),
        # so loading a model for it would only waste work and leave a backend
        # assigned that nobody releases.
        if self.initialized or self.closed:
            return
        if self._init_task is None:
            self._init_task = asyncio.create_task(self._do_initialize(model_name))
        await self._init_task

    async def close(self) -> None:
        """Cancel a pending initialization and release the backend."""
        self.closed = True
        if self._init_task is not None:
            self._init_task.cancel()
            self._init_task = None
        self.initialized = False
        if self.backend is not None:
            self.backend.close()
            self.backend = None

    async def _do_initialize(self, model_name: str) -> None:
        """Fetch the backend, configure the detector and mark it ready."""
        try:
            backend = await self.manager.ensure_backend(model_name)
            if self.closed:
                # Closed while loading: do not adopt the backend, otherwise it
                # would stay referenced by a detector that never closes it.
                return
            self.backend = backend
            await self._configure(model_name)
            self.initialized = True
            self.logger.success(f"Loaded {self.name}: {model_name}")
        except Exception as error:
            # Best effort by design: a failed load leaves the detector not ready.
            self.logger.error(f"Failed to initialize {self.name}: {error}")
        finally:
            self._init_task = None

    async def _configure(self, model_name: str) -> None:
        """Read model-derived config (input size, labels) from ``self.backend``.
        Overridden by subclasses that need it; the default is a no-op."""

    def _ready(self) -> bool:
        """Return True when initialization succeeded and a backend is attached."""
        return self.initialized and self.backend is not None
@@ -0,0 +1,121 @@
1
+ from __future__ import annotations
2
+
3
+ import ast
4
+ import re
5
+ from collections.abc import Mapping
6
+ from typing import Any, Literal
7
+
8
+ import numpy as np
9
+ from camera_ui_sdk import ImageMetadata, LoggerService, VideoFrameData
10
+
11
+ from ..backend import InputSpec, NDArray, Normalize
12
+ from ..geometry import NormalizedBox, normalize_box, scale_box
13
+ from ..model_manager import BaseModelManager
14
+ from ..parsing import RawDetection, channels_first, nms, parse_end2end, parse_yolov9
15
+ from ..preprocess import decode_image, frame_to_rgb
16
+ from .base import BaseDetector
17
+
18
+ ParseKind = Literal["yolov9", "end2end"]
19
+ NormalizedDetection = tuple[int, float, NormalizedBox]
20
+
21
+
22
class BoxDetector(BaseDetector):
    """Bounding-box detector on top of an ``InferenceBackend``.

    ``parse`` selects the output decoder ("yolov9" or "end2end"); ``size`` is
    only the initial input size and is replaced by the backend's own size
    once the model is loaded.
    """

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        name: str = "detector",
        parse: ParseKind = "yolov9",
        size: tuple[int, int] = (320, 320),
        normalize: Normalize = "unit",
        multiclass: bool = False,
        apply_nms: bool = False,
        threshold: float = 0.5,
    ) -> None:
        super().__init__(manager, logger)
        self.name = name
        self.threshold = threshold
        self.input_size = size
        # Class id -> label; only filled for multiclass models.
        self.labels: dict[int, str] = {}

        self._parse = parse
        self._normalize = normalize
        self._multiclass = multiclass
        self._apply_nms = apply_nms

    async def _configure(self, model_name: str) -> None:
        """Adopt the backend's input size and, for multiclass models, its labels."""
        assert self.backend is not None
        self.input_size = self.backend.input_size
        if not self._multiclass:
            return
        self.labels = _parse_labels(self.backend.metadata())

    @property
    def _spec(self) -> InputSpec:
        """Input spec for the current input size (always NCHW)."""
        width, height = self.input_size[0], self.input_size[1]
        return InputSpec(width, height, layout="nchw", normalize=self._normalize)

    async def detect(self, image: NDArray, threshold: float | None = None) -> list[RawDetection]:
        """Detect on an HWC uint8 RGB array; boxes are in model-input pixel coords."""
        if not self._ready():
            return []
        assert self.backend is not None

        outputs = await self.backend.run(image, self._spec)
        primary = np.asarray(outputs[0])
        min_score = threshold if threshold is not None else self.threshold

        if self._parse == "yolov9":
            found = parse_yolov9(channels_first(primary), min_score)
        else:
            found = parse_end2end(np.squeeze(primary), min_score)

        if self._apply_nms:
            return nms(found)
        return found

    async def detect_frame(self, frame: VideoFrameData, threshold: float | None = None) -> list[RawDetection]:
        """Detect on a raw video frame; boxes are rescaled to frame pixel coords."""
        if not self._ready():
            return []

        rgb = frame_to_rgb(frame["data"], frame["width"], frame["height"], frame["format"])
        model_space = await self.detect(rgb, threshold)

        # Factors mapping model-input pixels back onto the original frame.
        factor_x = frame["width"] / self.input_size[0]
        factor_y = frame["height"] / self.input_size[1]

        rescaled: list[RawDetection] = []
        for class_id, confidence, box in model_space:
            rescaled.append((class_id, confidence, scale_box(box, factor_x, factor_y)))
        return rescaled

    async def detect_single(
        self, image_data: bytes, metadata: ImageMetadata, threshold: float | None = None
    ) -> list[NormalizedDetection]:
        """Detect on an encoded image; boxes are normalized to 0..1."""
        if not self._ready():
            return []

        model_space = await self.detect(decode_image(image_data), threshold)

        normalized: list[NormalizedDetection] = []
        for class_id, confidence, box in model_space:
            relative = normalize_box(box, self.input_size[0], self.input_size[1])
            normalized.append((class_id, confidence, relative))
        return normalized
102
+
103
+
104
+ def _parse_labels(metadata: Mapping[str, str]) -> dict[int, str]:
105
+ names = metadata.get("names") or metadata.get("yolo.names")
106
+ if names:
107
+ parsed: dict[Any, Any] = ast.literal_eval(names)
108
+ return {int(key): str(value) for key, value in parsed.items()}
109
+
110
+ classes = metadata.get("classes")
111
+ if not classes:
112
+ return {}
113
+
114
+ labels: dict[int, str] = {}
115
+ for row, line in enumerate(classes.split(",")):
116
+ pair = re.split(r"[:\s]+", line.strip(), maxsplit=1)
117
+ if len(pair) == 2 and pair[0].strip().isdigit():
118
+ labels[int(pair[0])] = pair[1].strip()
119
+ else:
120
+ labels[row] = line.strip()
121
+ return labels
@@ -0,0 +1,119 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ from camera_ui_sdk import LoggerService, VideoFrameData
8
+ from transformers import CLIPProcessor
9
+
10
+ from ..backend import InferenceBackend, NDArray
11
+ from ..model_manager import BaseModelManager
12
+ from ..parsing import l2_normalize
13
+ from ..pipeline import run_prepare
14
+ from ..preprocess import frame_to_rgb, to_pil
15
+
16
+
17
class ClipEncoder:
    """CLIP image/text embedder built from two backends and a ``CLIPProcessor``.

    The vision and text models are loaded through the model manager; the
    processor named by ``pretrained`` prepares their inputs. All ``embed_*``
    methods return an empty result while the encoder is not ready.
    """

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        pretrained: str = "openai/clip-vit-base-patch32",
        embedding_model: str = "clip-vit-base-patch32",
    ) -> None:
        self.manager = manager
        self.logger = logger
        self.embedding_model = embedding_model

        self.vision: InferenceBackend | None = None
        self.text: InferenceBackend | None = None
        self.processor: Any = None

        self.initialized = False
        self.closed = False
        self._pretrained = pretrained
        self._init_task: asyncio.Task[None] | None = None

    async def initialize(self, vision_model: str, text_model: str) -> None:
        """Load both models and the processor unless already initialized or closed."""
        # ``closed`` is never reset, so a closed encoder cannot become ready;
        # loading for it would only leave backends assigned that nobody closes.
        if self.initialized or self.closed:
            return
        if self._init_task is None:
            self._init_task = asyncio.create_task(self._do_initialize(vision_model, text_model))
        await self._init_task

    async def close(self) -> None:
        """Cancel a pending initialization and release both backends."""
        self.closed = True
        if self._init_task is not None:
            self._init_task.cancel()
            self._init_task = None
        self.initialized = False
        if self.vision is not None:
            self.vision.close()
            self.vision = None
        if self.text is not None:
            self.text.close()
            self.text = None
        self.processor = None

    async def embed_image(self, image: NDArray) -> list[float]:
        """Embed an HWC uint8 RGB image."""
        if not self._ready():
            return []
        assert self.vision is not None

        pil = to_pil(image)
        pixel_values = await run_prepare(lambda: self._vision_input(pil))
        outputs = await self.vision.infer([pixel_values])
        return [float(value) for value in l2_normalize(outputs[0])]

    async def embed_text(self, text: str) -> list[float]:
        """Embed a text prompt."""
        if not self._ready():
            return []
        assert self.text is not None

        input_ids, attention_mask = await run_prepare(lambda: self._text_input(text))
        outputs = await self.text.infer([input_ids, attention_mask])
        return [float(value) for value in l2_normalize(outputs[0])]

    async def embed_frame(self, width: int, height: int, data: bytes, fmt: str = "rgb") -> list[float]:
        """Embed a raw frame buffer of pixel format ``fmt`` ("rgb" by default)."""
        return await self.embed_image(frame_to_rgb(data, width, height, fmt))

    async def embed_frames(self, frames: list[VideoFrameData]) -> list[list[float]]:
        """Embed each frame in order; one empty list per frame when not ready."""
        if not self._ready():
            return [[] for _ in frames]
        # Pass the frame's own pixel format, as BoxDetector.detect_frame does;
        # assuming RGB would break on rgba/gray frames.
        return [
            await self.embed_frame(frame["width"], frame["height"], frame["data"], frame["format"])
            for frame in frames
        ]

    def _ready(self) -> bool:
        """Return True when both backends and the processor are available."""
        return (
            self.initialized
            and self.vision is not None
            and self.text is not None
            and self.processor is not None
        )

    async def _do_initialize(self, vision_model: str, text_model: str) -> None:
        """Load backends and processor; failures are logged, not raised."""
        try:
            self.vision = await self.manager.ensure_backend(vision_model)
            self.text = await self.manager.ensure_backend(text_model)
            # from_pretrained is blocking, so it runs in a worker thread.
            self.processor = await asyncio.to_thread(CLIPProcessor.from_pretrained, self._pretrained)
            if self.closed:
                return
            self.initialized = True
            self.logger.success(f"Loaded CLIP: {vision_model} + {text_model}")
        except Exception as error:
            self.logger.error(f"Failed to initialize CLIP encoder: {error}")
        finally:
            self._init_task = None

    def _vision_input(self, pil: Any) -> NDArray:
        """Run the processor on an image and return float32 ``pixel_values``."""
        inputs = self.processor(images=pil, return_tensors="np", padding="max_length", truncation=True)
        return np.asarray(inputs["pixel_values"], dtype=np.float32)

    def _text_input(self, text: str) -> tuple[NDArray, NDArray]:
        """Run the processor on text and return int64 ``(input_ids, attention_mask)``."""
        inputs = self.processor(text=text, return_tensors="np", padding="max_length", truncation=True)
        return (
            np.asarray(inputs["input_ids"], dtype=np.int64),
            np.asarray(inputs["attention_mask"], dtype=np.int64),
        )
@@ -0,0 +1,50 @@
1
+ from __future__ import annotations
2
+
3
+ from camera_ui_sdk import LoggerService
4
+
5
+ from ..backend import InputSpec, NDArray
6
+ from ..geometry import Box
7
+ from ..model_manager import BaseModelManager
8
+ from ..parsing import l2_normalize
9
+ from ..preprocess import frame_to_rgb
10
+ from .base import BaseDetector
11
+
12
+
13
class Embedder(BaseDetector):
    """Embedding model wrapper producing L2-normalized vectors.

    Inputs are square (``size`` x ``size``), NCHW, with "facenet" scaling.
    """

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        size: int = 160,
        name: str = "face embedder",
    ) -> None:
        super().__init__(manager, logger)
        self.name = name
        self.input_size = (size, size)

    @property
    def _spec(self) -> InputSpec:
        """Input spec derived from the configured square input size."""
        width, height = self.input_size[0], self.input_size[1]
        return InputSpec(width, height, layout="nchw", normalize="facenet")

    async def embed(self, image: NDArray) -> list[float]:
        """Embed an HWC uint8 RGB face crop."""
        if not self._ready():
            return []
        assert self.backend is not None
        outputs = await self.backend.run(image, self._spec)
        unit_vector = l2_normalize(outputs[0])
        return [float(component) for component in unit_vector]

    async def embed_from_crop(self, frame_data: bytes, width: int, height: int, box: Box) -> list[float]:
        """Crop ``box`` out of a raw RGB frame and embed it; ``[]`` for an empty crop."""
        if not self._ready():
            return []

        # Truncate to whole pixels and keep the crop inside the frame.
        left = max(0, int(box[0]))
        top = max(0, int(box[1]))
        right = min(width, int(box[2]))
        bottom = min(height, int(box[3]))
        if right <= left or bottom <= top:
            return []

        frame = frame_to_rgb(frame_data, width, height)
        crop = frame[top:bottom, left:right]
        return await self.embed(crop)
@@ -0,0 +1,90 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ import numpy as np
6
+ from camera_ui_sdk import LoggerService
7
+
8
+ from ..backend import InputSpec, NDArray, Outputs
9
+ from ..geometry import Box
10
+ from ..model_manager import BaseModelManager
11
+ from ..parsing import decode_ocr
12
+ from ..preprocess import frame_to_rgb
13
+ from .base import BaseDetector
14
+
15
+ #: Default plate alphabet: digits + uppercase letters + pad. Plugins pass the
16
+ #: exact constant their model was trained with.
17
+ DEFAULT_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_"
18
+
19
+
20
@dataclass(frozen=True)
class OcrResult:
    """Immutable outcome of one plate recognition."""

    # Recognized characters.
    text: str
    # Confidence score reported alongside the text.
    confidence: float
24
+
25
+
26
class PlateOcr(BaseDetector):
    """License-plate OCR wrapper.

    The model is fed raw uint8 NHWC crops of ``width`` x ``height``; its
    output is decoded slot by slot against ``alphabet``.
    """

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        width: int = 128,
        height: int = 64,
        slots: int = 10,
        alphabet: str = DEFAULT_ALPHABET,
        pad_char: str = "_",
        name: str = "OCR",
    ) -> None:
        super().__init__(manager, logger)
        self.name = name
        self.input_size = (width, height)
        self._slots = slots
        self._alphabet = alphabet
        self._pad = pad_char

    @property
    def _spec(self) -> InputSpec:
        """Input spec: raw uint8 pixels in NHWC order."""
        width, height = self.input_size[0], self.input_size[1]
        return InputSpec(width, height, layout="nhwc", normalize="none", dtype="uint8")

    async def recognize(self, image: NDArray) -> OcrResult | None:
        """Recognize an HWC uint8 RGB plate crop."""
        if not self._ready():
            return None
        assert self.backend is not None
        outputs = await self.backend.run(image, self._spec)
        return self._decode(outputs)

    async def recognize_from_crop(
        self, frame_data: bytes, width: int, height: int, box: Box
    ) -> OcrResult | None:
        """Crop ``box`` out of a raw RGB frame and recognize it; None for an empty crop."""
        if not self._ready():
            return None

        # Truncate to whole pixels and keep the crop inside the frame.
        left = max(0, int(box[0]))
        top = max(0, int(box[1]))
        right = min(width, int(box[2]))
        bottom = min(height, int(box[3]))
        if right <= left or bottom <= top:
            return None

        frame = frame_to_rgb(frame_data, width, height)
        return await self.recognize(frame[top:bottom, left:right])

    def _decode(self, outputs: Outputs) -> OcrResult | None:
        """Decode the last 3-D output whose second axis equals the slot count."""
        candidates = [
            arr[0]
            for arr in (np.asarray(value) for value in outputs)
            if arr.ndim == 3 and arr.shape[1] == self._slots
        ]
        if not candidates:
            return None

        text, confidence = decode_ocr(candidates[-1], self._alphabet, self._pad)
        if not text:
            return None
        return OcrResult(text, confidence)
@@ -0,0 +1,58 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TypedDict
4
+
5
+ Box = tuple[float, float, float, float]
6
+
7
+
8
class NormalizedBox(TypedDict):
    """Box given as origin plus size, each divided by the reference dimension."""

    # Left and top edge as fractions of the reference width / height.
    x: float
    y: float
    # Box extent as fractions of the reference width / height.
    width: float
    height: float
13
+
14
+
15
def cxcywh_to_xyxy(cx: float, cy: float, w: float, h: float) -> Box:
    """Convert a center/size box into ``(x1, y1, x2, y2)`` corners."""
    dx = w / 2.0
    dy = h / 2.0
    left, right = cx - dx, cx + dx
    top, bottom = cy - dy, cy + dy
    return (left, top, right, bottom)
19
+
20
+
21
def scale_box(box: Box, scale_x: float, scale_y: float) -> Box:
    """Multiply x coordinates by ``scale_x`` and y coordinates by ``scale_y``."""
    left, top, right, bottom = box
    scaled_left, scaled_right = left * scale_x, right * scale_x
    scaled_top, scaled_bottom = top * scale_y, bottom * scale_y
    return (scaled_left, scaled_top, scaled_right, scaled_bottom)
24
+
25
+
26
def clamp_box(box: Box, width: float, height: float) -> Box:
    """Limit the box corners to the ``[0, width] x [0, height]`` area."""

    def _limit(value: float, upper: float) -> float:
        return min(max(value, 0.0), upper)

    left, top, right, bottom = box
    return (_limit(left, width), _limit(top, height), _limit(right, width), _limit(bottom, height))
34
+
35
+
36
def normalize_box(box: Box, width: float, height: float) -> NormalizedBox:
    """Express a corner box as origin + size relative to ``width`` x ``height``."""
    left, top, right, bottom = box
    span_x = right - left
    span_y = bottom - top
    return {
        "x": left / width,
        "y": top / height,
        "width": span_x / width,
        "height": span_y / height,
    }
44
+
45
+
46
def pad_box(box: Box, width: float, height: float, padding: float, min_size: int = 0) -> Box:
    """Grow a box by ``padding`` (a fraction of its own size) on every side,
    widen it further so it reaches ``min_size`` pixels when requested, and
    clamp the result to the frame. Used to crop faces/plates with context."""
    left, top, right, bottom = box
    box_w = right - left
    box_h = bottom - top

    # Half of the shortfall against min_size is added on each side.
    if min_size > 0:
        shortfall_x = max(0.0, (min_size - box_w) / 2.0)
        shortfall_y = max(0.0, (min_size - box_h) / 2.0)
    else:
        shortfall_x = 0.0
        shortfall_y = 0.0

    grow_x = box_w * padding + shortfall_x
    grow_y = box_h * padding + shortfall_y

    grown = (left - grow_x, top - grow_y, right + grow_x, bottom + grow_y)
    return clamp_box(grown, width, height)
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ from abc import ABC, abstractmethod
6
+ from collections.abc import Mapping
7
+
8
+ import aiohttp
9
+ from camera_ui_sdk import LoggerService
10
+
11
+ from .backend import InferenceBackend
12
+
13
+ _CHUNK = 1024 * 1024
14
+
15
+
16
class BaseModelManager(ABC):
    """Downloads model files on demand and caches one backend load per model.

    Subclasses describe which files a model needs (``model_files``) and how to
    turn the downloaded files into a backend (``build_backend``). Files are
    stored under ``<storage_path>/models/<version>``.
    """

    def __init__(self, storage_path: str, logger: LoggerService, version: str) -> None:
        self.logger = logger
        self.model_path = os.path.join(storage_path, "models", version)
        # model name -> in-flight or finished load task
        self._load_tasks: dict[str, asyncio.Task[InferenceBackend]] = {}

    async def ensure_backend(self, model_name: str) -> InferenceBackend:
        """Return the backend for ``model_name``, loading it at most once.

        Concurrent callers share the same load task. A load that failed or was
        cancelled is not kept: the next call starts a fresh attempt instead of
        re-raising the stale error forever.
        """
        task = self._load_tasks.get(model_name)
        if task is None or self._load_failed(task):
            task = asyncio.create_task(self._load(model_name))
            self._load_tasks[model_name] = task
        return await task

    @staticmethod
    def _load_failed(task: asyncio.Task[InferenceBackend]) -> bool:
        """Return True for a finished task that was cancelled or raised."""
        if not task.done():
            return False
        return task.cancelled() or task.exception() is not None

    def reset(self) -> None:
        """Forget all cached loads; running tasks are not cancelled."""
        self._load_tasks.clear()

    @abstractmethod
    def model_files(self, model_name: str) -> Mapping[str, tuple[str, str]]:
        """Map of file key → (download_url, path relative to ``model_path``)."""

    @abstractmethod
    async def build_backend(self, model_name: str, paths: Mapping[str, str]) -> InferenceBackend:
        """Build the runtime backend from the (already downloaded) local paths."""

    async def _load(self, model_name: str) -> InferenceBackend:
        """Download every file of the model, then build its backend."""
        files = self.model_files(model_name)
        for url, rel in files.values():
            await self._download(url, rel)

        paths = {key: os.path.join(self.model_path, rel) for key, (_url, rel) in files.items()}
        return await self.build_backend(model_name, paths)

    async def _download(self, url: str, rel: str) -> None:
        """Download ``url`` to ``model_path/rel`` unless the file already exists.

        Data is streamed into ``<file>.tmp`` and moved into place only after
        the download completed, so an existing file is always complete.

        Raises:
            RuntimeError: when the server answers with a non-2xx status.
        """
        full_path = os.path.join(self.model_path, rel)
        if os.path.isfile(full_path):
            return

        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        tmp_path = full_path + ".tmp"
        name = os.path.basename(rel)
        self.logger.log(f"Downloading {name}...")

        try:
            async with aiohttp.ClientSession() as session, session.get(url) as response:
                if response.status < 200 or response.status >= 300:
                    raise RuntimeError(f"Error downloading {url}: {response.status}")

                total = int(response.headers.get("content-length", 0))
                downloaded = 0
                last_step = 0

                with open(tmp_path, "wb") as handle:
                    async for chunk in response.content.iter_chunked(_CHUNK):
                        if not chunk:
                            continue
                        downloaded += len(chunk)
                        handle.write(chunk)
                        # Progress is only reported for files larger than one chunk,
                        # in 25% steps.
                        if total > _CHUNK:
                            percent = min(100, downloaded * 100 // total)
                            if percent >= last_step + 25:
                                last_step = (percent // 25) * 25
                                self.logger.log(f"Downloading {name}... {last_step}%")

                # os.replace also overwrites an existing target on Windows.
                os.replace(tmp_path, full_path)
        except BaseException:
            # Includes cancellation: never leave a partial .tmp file behind.
            try:
                os.remove(tmp_path)
            except OSError:
                pass  # nothing was written, or the file is already gone
            raise

        self.logger.log(f"Downloaded {name} ({downloaded / _CHUNK:.1f} MB)")
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Any
4
+
5
+ import numpy as np
6
+
7
+ from .geometry import Box, cxcywh_to_xyxy
8
+
9
+ NDArray = np.ndarray[Any, Any]
10
+ RawDetection = tuple[int, float, Box]
11
+
12
+
13
def channels_first(output: NDArray) -> NDArray:
    """Squeeze ``output`` and, if it is 2-D, orient it so the shorter axis comes first.

    Arrays that are not 2-D after squeezing are returned squeezed but
    otherwise untouched.
    """
    squeezed = np.squeeze(output)
    needs_transpose = squeezed.ndim == 2 and squeezed.shape[0] > squeezed.shape[1]
    if needs_transpose:
        return squeezed.T
    return squeezed
18
+
19
+
20
def parse_yolov9(results: NDArray, threshold: float) -> list[RawDetection]:
    """Decode a channels-first YOLO output into detections.

    Rows 0-3 of ``results`` are read as ``cx, cy, w, h``; every further row is
    a per-class score. One detection is emitted for each (class, column) pair
    whose score is strictly above ``threshold``.
    """
    class_ids, columns = np.nonzero(results[4:] > threshold)
    return [
        (
            int(class_id),
            float(results[class_id + 4, column]),
            cxcywh_to_xyxy(
                float(results[0, column]),
                float(results[1, column]),
                float(results[2, column]),
                float(results[3, column]),
            ),
        )
        for class_id, column in zip(class_ids, columns)
    ]
31
+
32
+
33
def parse_end2end(rows: NDArray, threshold: float) -> list[RawDetection]:
    """Decode end-to-end (NMS-in-model) output rows into detections.

    Each row is read as ``[_, x1, y1, x2, y2, class_id, score]``; rows whose
    score is not above ``threshold`` are dropped.

    ``rows`` may be 2-D ``(N, 7)``, batched ``(1, N, 7)`` or a single 1-D row.
    The 1-D case matters because callers pass ``np.squeeze(output)``, which
    collapses a one-detection output to shape ``(7,)``.
    """
    table = np.asarray(rows)
    if table.size == 0:
        return []
    if table.ndim != 2:
        # Normalize to one detection per row regardless of the input rank.
        table = table.reshape(-1, table.shape[-1])

    detections: list[RawDetection] = []
    for row in table:
        score = float(row[6])
        if score <= threshold:
            continue
        detections.append(
            (
                int(row[5]),
                score,
                (float(row[1]), float(row[2]), float(row[3]), float(row[4])),
            )
        )
    return detections
47
+
48
+
49
def decode_ocr(logits: NDArray, alphabet: str, pad_char: str = "_") -> tuple[str, float]:
    """Decode per-slot logits into ``(text, mean confidence)``.

    For every slot the arg-max index selects a character from ``alphabet``.
    Decoding stops at the first slot that yields ``pad_char`` (an index beyond
    the alphabet counts as padding). A character's confidence is the softmax
    probability of its index; the result is 0.0 when no character was read.
    """
    symbols: list[str] = []
    probabilities: list[float] = []

    for slot in logits:
        best = int(np.argmax(slot))
        if best >= len(alphabet):
            break
        symbol = alphabet[best]
        if symbol == pad_char:
            break
        # Subtracting the maximum keeps the exponentials from overflowing.
        exponentials = np.exp(slot - np.max(slot))
        probabilities.append(float(exponentials[best] / np.sum(exponentials)))
        symbols.append(symbol)

    if not probabilities:
        return "".join(symbols), 0.0
    return "".join(symbols), float(np.mean(probabilities))
61
+
62
+
63
def l2_normalize(vector: NDArray) -> NDArray:
    """Flatten ``vector`` to float32 and scale it to unit L2 length.

    A zero vector is returned unchanged (flattened) instead of dividing by zero.
    """
    flattened = np.asarray(vector, dtype=np.float32).flatten()
    length = float(np.linalg.norm(flattened))
    if length > 0.0:
        return flattened / length
    return flattened
67
+
68
+
69
def nms(detections: list[RawDetection], iou_threshold: float = 0.45) -> list[RawDetection]:
    """Greedy non-maximum suppression over all detections, regardless of class.

    Detections are visited from highest to lowest confidence; each kept
    detection removes every remaining one whose IoU with it exceeds
    ``iou_threshold``. The result is ordered by descending confidence.
    """
    if not detections:
        return []

    coords = np.array([detection[2] for detection in detections], dtype=np.float32)
    confidences = np.array([detection[1] for detection in detections], dtype=np.float32)
    remaining = confidences.argsort()[::-1]

    selected: list[int] = []
    while remaining.size > 0:
        top = int(remaining[0])
        selected.append(top)
        if remaining.size == 1:
            break
        candidates = remaining[1:]
        overlaps = _iou(coords[top], coords[candidates])
        remaining = candidates[overlaps <= iou_threshold]

    return [detections[index] for index in selected]
87
+
88
+
89
+ def _iou(box: NDArray, others: NDArray) -> NDArray:
90
+ x1 = np.maximum(box[0], others[:, 0])
91
+ y1 = np.maximum(box[1], others[:, 1])
92
+ x2 = np.minimum(box[2], others[:, 2])
93
+ y2 = np.minimum(box[3], others[:, 3])
94
+
95
+ inter = np.clip(x2 - x1, 0.0, None) * np.clip(y2 - y1, 0.0, None)
96
+ area = (box[2] - box[0]) * (box[3] - box[1])
97
+ areas = (others[:, 2] - others[:, 0]) * (others[:, 3] - others[:, 1])
98
+ union = area + areas - inter
99
+ return np.where(union > 0.0, inter / union, 0.0)
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import os
5
+ from collections.abc import Callable
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from typing import TypeVar
8
+
9
T = TypeVar("T")

# Shared pool for CPU-side preprocessing: one worker per CPU, at most 8
# (4 when the CPU count is unknown).
_PREPARE_WORKERS = min(8, os.cpu_count() or 4)
_prepare_executor = ThreadPoolExecutor(max_workers=_PREPARE_WORKERS, thread_name_prefix="ml-prepare")


async def run_prepare(fn: Callable[[], T]) -> T:
    """Run ``fn`` on the shared prepare-executor and return its result.

    Must be awaited from a running event loop; exceptions raised by ``fn``
    propagate to the caller.
    """
    # get_running_loop() is the supported way to get the loop inside a
    # coroutine; get_event_loop() is deprecated for this use.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(_prepare_executor, fn)


def prepare_executor() -> ThreadPoolExecutor:
    """Return the shared prepare-executor (the same instance on every call)."""
    return _prepare_executor
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ from typing import Any
5
+
6
+ import numpy as np
7
+ from PIL import Image
8
+
9
+ from .backend import InputSpec
10
+ from .geometry import Box
11
+
12
+ NDArray = np.ndarray[Any, Any]
13
+
14
+
15
+ def frame_to_rgb(data: bytes, width: int, height: int, fmt: str = "rgb") -> NDArray:
16
+ pixels = np.frombuffer(data, dtype=np.uint8)
17
+ if fmt == "rgb":
18
+ return pixels.reshape((height, width, 3))
19
+ if fmt == "rgba":
20
+ return pixels.reshape((height, width, 4))[:, :, :3]
21
+ if fmt == "gray":
22
+ return np.repeat(pixels.reshape((height, width, 1)), 3, axis=2)
23
+ raise ValueError(f"Unsupported frame format for detection: {fmt}")
24
+
25
+
26
def decode_image(data: bytes) -> NDArray:
    """Decode an encoded image (any format Pillow can open) into an RGB array."""
    buffer = io.BytesIO(data)
    picture = Image.open(buffer).convert("RGB")
    return np.asarray(picture)
28
+
29
+
30
def resize_rgb(rgb: NDArray, width: int, height: int) -> NDArray:
    """Resize an HWC RGB array to ``width`` x ``height`` using Pillow.

    When the array already has the requested size it is returned as is (the
    same object, no copy).
    """
    same_size = rgb.shape[1] == width and rgb.shape[0] == height
    if same_size:
        return rgb
    picture = Image.fromarray(rgb, mode="RGB")
    return np.asarray(picture.resize((width, height)))
34
+
35
+
36
def to_pil(rgb: NDArray, width: int | None = None, height: int | None = None) -> Image.Image:
    """Wrap an HWC RGB array in a PIL image, resizing it only when both
    ``width`` and ``height`` are given and differ from the current size."""
    picture = Image.fromarray(rgb, mode="RGB")
    if width is None or height is None:
        return picture
    if picture.width == width and picture.height == height:
        return picture
    return picture.resize((width, height))
41
+
42
+
43
+ def to_tensor(rgb: NDArray, spec: InputSpec) -> NDArray:
44
+ resized = resize_rgb(rgb, spec.width, spec.height)
45
+
46
+ arr: NDArray
47
+ if spec.normalize == "unit":
48
+ arr = resized.astype(np.float32) / 255.0
49
+ elif spec.normalize == "facenet":
50
+ arr = (resized.astype(np.float32) - 127.5) / 128.0
51
+ else: # "none"
52
+ arr = resized.astype(np.uint8 if spec.dtype == "uint8" else np.float32)
53
+
54
+ if spec.layout == "nchw":
55
+ arr = arr.transpose(2, 0, 1)
56
+
57
+ return np.ascontiguousarray(np.expand_dims(arr, axis=0))
58
+
59
+
60
def crop_rgb(rgb: NDArray, box: Box) -> NDArray:
    """Crop the ``(x1, y1, x2, y2)`` box out of an HWC image.

    Coordinates are rounded to whole pixels and clamped to the image bounds.
    Without the clamp a negative coordinate would be treated by numpy as an
    offset from the far edge and silently return the wrong region. A box
    entirely outside the image yields an empty array.
    """
    rows, cols = rgb.shape[0], rgb.shape[1]
    x1, y1, x2, y2 = (int(round(v)) for v in box)

    x1 = min(max(x1, 0), cols)
    x2 = min(max(x2, 0), cols)
    y1 = min(max(y1, 0), rows)
    y2 = min(max(y2, 0), rows)
    return rgb[y1:y2, x1:x2]
File without changes
@@ -0,0 +1,42 @@
1
+ Metadata-Version: 2.4
2
+ Name: camera-ui-ml
3
+ Version: 1.0.0
4
+ Summary: Shared ML inference framework for camera.ui detection plugins (onnx, openvino, coreml, ncnn)
5
+ Author-email: seydx <hi@seydx.dev>
6
+ Maintainer-email: seydx <hi@seydx.dev>
7
+ Project-URL: Homepage, https://github.com/cameraui/ml
8
+ Project-URL: Documentation, https://github.com/cameraui/ml
9
+ Project-URL: Repository, https://github.com/cameraui/ml
10
+ Project-URL: Bug Tracker, https://github.com/cameraui/ml/issues
11
+ Project-URL: Changelog, https://github.com/cameraui/ml/blob/main/CHANGELOG.md
12
+ Keywords: camera.ui,python,ml,inference,detection
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Requires-Python: >=3.11
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE.md
23
+ Requires-Dist: typing_extensions>=4.15.0
24
+ Requires-Dist: camera-ui-sdk>=1.0.95
25
+ Requires-Dist: numpy>=2.0
26
+ Requires-Dist: pillow>=12.0
27
+ Requires-Dist: aiohttp>=3.12
28
+ Provides-Extra: test
29
+ Requires-Dist: pytest>=8; extra == "test"
30
+ Provides-Extra: clip
31
+ Requires-Dist: transformers>=4.40; extra == "clip"
32
+ Dynamic: license-file
33
+
34
+ # camera-ui-ml
35
+
36
+ [![PyPI](https://img.shields.io/pypi/v/camera-ui-ml?label=pypi&logo=pypi&logoColor=white)](https://pypi.org/project/camera-ui-ml/)
37
+
38
+ Shared ML inference framework for camera.ui's detection plugins.
39
+
40
+ ---
41
+
42
+ _Part of the camera.ui ecosystem - A comprehensive camera management solution._
@@ -0,0 +1,22 @@
1
+ LICENSE.md
2
+ README.md
3
+ pyproject.toml
4
+ camera_ui_ml/__init__.py
5
+ camera_ui_ml/backend.py
6
+ camera_ui_ml/geometry.py
7
+ camera_ui_ml/model_manager.py
8
+ camera_ui_ml/parsing.py
9
+ camera_ui_ml/pipeline.py
10
+ camera_ui_ml/preprocess.py
11
+ camera_ui_ml/py.typed
12
+ camera_ui_ml.egg-info/PKG-INFO
13
+ camera_ui_ml.egg-info/SOURCES.txt
14
+ camera_ui_ml.egg-info/dependency_links.txt
15
+ camera_ui_ml.egg-info/requires.txt
16
+ camera_ui_ml.egg-info/top_level.txt
17
+ camera_ui_ml/detectors/__init__.py
18
+ camera_ui_ml/detectors/base.py
19
+ camera_ui_ml/detectors/box.py
20
+ camera_ui_ml/detectors/clip.py
21
+ camera_ui_ml/detectors/embedder.py
22
+ camera_ui_ml/detectors/ocr.py
@@ -0,0 +1,11 @@
1
+ typing_extensions>=4.15.0
2
+ camera-ui-sdk>=1.0.95
3
+ numpy>=2.0
4
+ pillow>=12.0
5
+ aiohttp>=3.12
6
+
7
+ [clip]
8
+ transformers>=4.40
9
+
10
+ [test]
11
+ pytest>=8
@@ -0,0 +1 @@
1
+ camera_ui_ml
@@ -0,0 +1,45 @@
1
+ [build-system]
2
+ requires = ["setuptools>=80.9.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "camera-ui-ml"
7
+ version = "1.0.0"
8
+ description = "Shared ML inference framework for camera.ui detection plugins (onnx, openvino, coreml, ncnn)"
9
+ keywords = ["camera.ui", "python", "ml", "inference", "detection"]
10
+ readme = "README.md"
11
+ license-files = ["LICENSE.md"]
12
+ authors = [{ name = "seydx", email = "hi@seydx.dev" }]
13
+ maintainers = [{ name = "seydx", email = "hi@seydx.dev" }]
14
+ requires-python = ">=3.11"
15
+ classifiers = [
16
+ "Development Status :: 5 - Production/Stable",
17
+ "Intended Audience :: Developers",
18
+ "Operating System :: OS Independent",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.11",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Programming Language :: Python :: 3.13",
23
+ ]
24
+ dependencies = ["typing_extensions>=4.15.0", "camera-ui-sdk>=1.0.95", "numpy>=2.0", "pillow>=12.0", "aiohttp>=3.12"]
25
+
26
+ [project.optional-dependencies]
27
+ test = ["pytest>=8"]
28
+ clip = ["transformers>=4.40"]
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/cameraui/ml"
32
+ Documentation = "https://github.com/cameraui/ml"
33
+ Repository = "https://github.com/cameraui/ml"
34
+ "Bug Tracker" = "https://github.com/cameraui/ml/issues"
35
+ Changelog = "https://github.com/cameraui/ml/blob/main/CHANGELOG.md"
36
+
37
+ [tool.pytest.ini_options]
38
+ testpaths = ["tests"]
39
+
40
+ [tool.setuptools.packages.find]
41
+ where = ["."]
42
+ include = ["camera_ui_ml*"]
43
+
44
+ [tool.setuptools.package-data]
45
+ camera_ui_ml = ["py.typed"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+