camera-ui-ml 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- camera_ui_ml-1.0.0/LICENSE.md +22 -0
- camera_ui_ml-1.0.0/PKG-INFO +42 -0
- camera_ui_ml-1.0.0/README.md +9 -0
- camera_ui_ml-1.0.0/camera_ui_ml/__init__.py +88 -0
- camera_ui_ml-1.0.0/camera_ui_ml/backend.py +58 -0
- camera_ui_ml-1.0.0/camera_ui_ml/detectors/__init__.py +17 -0
- camera_ui_ml-1.0.0/camera_ui_ml/detectors/base.py +57 -0
- camera_ui_ml-1.0.0/camera_ui_ml/detectors/box.py +121 -0
- camera_ui_ml-1.0.0/camera_ui_ml/detectors/clip.py +119 -0
- camera_ui_ml-1.0.0/camera_ui_ml/detectors/embedder.py +50 -0
- camera_ui_ml-1.0.0/camera_ui_ml/detectors/ocr.py +90 -0
- camera_ui_ml-1.0.0/camera_ui_ml/geometry.py +58 -0
- camera_ui_ml-1.0.0/camera_ui_ml/model_manager.py +79 -0
- camera_ui_ml-1.0.0/camera_ui_ml/parsing.py +99 -0
- camera_ui_ml-1.0.0/camera_ui_ml/pipeline.py +20 -0
- camera_ui_ml-1.0.0/camera_ui_ml/preprocess.py +62 -0
- camera_ui_ml-1.0.0/camera_ui_ml/py.typed +0 -0
- camera_ui_ml-1.0.0/camera_ui_ml.egg-info/PKG-INFO +42 -0
- camera_ui_ml-1.0.0/camera_ui_ml.egg-info/SOURCES.txt +22 -0
- camera_ui_ml-1.0.0/camera_ui_ml.egg-info/dependency_links.txt +1 -0
- camera_ui_ml-1.0.0/camera_ui_ml.egg-info/requires.txt +11 -0
- camera_ui_ml-1.0.0/camera_ui_ml.egg-info/top_level.txt +1 -0
- camera_ui_ml-1.0.0/pyproject.toml +45 -0
- camera_ui_ml-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2020-2024 seydx <hi@seydx.dev>
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
|
6
|
+
a copy of this software and associated documentation files (the
|
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
|
11
|
+
the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be
|
|
14
|
+
included in all copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: camera-ui-ml
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Shared ML inference framework for camera.ui detection plugins (onnx, openvino, coreml, ncnn)
|
|
5
|
+
Author-email: seydx <hi@seydx.dev>
|
|
6
|
+
Maintainer-email: seydx <hi@seydx.dev>
|
|
7
|
+
Project-URL: Homepage, https://github.com/cameraui/ml
|
|
8
|
+
Project-URL: Documentation, https://github.com/cameraui/ml
|
|
9
|
+
Project-URL: Repository, https://github.com/cameraui/ml
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/cameraui/ml/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/cameraui/ml/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: camera.ui,python,ml,inference,detection
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE.md
|
|
23
|
+
Requires-Dist: typing_extensions>=4.15.0
|
|
24
|
+
Requires-Dist: camera-ui-sdk>=1.0.95
|
|
25
|
+
Requires-Dist: numpy>=2.0
|
|
26
|
+
Requires-Dist: pillow>=12.0
|
|
27
|
+
Requires-Dist: aiohttp>=3.12
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest>=8; extra == "test"
|
|
30
|
+
Provides-Extra: clip
|
|
31
|
+
Requires-Dist: transformers>=4.40; extra == "clip"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# camera-ui-ml
|
|
35
|
+
|
|
36
|
+
[camera-ui-ml on PyPI](https://pypi.org/project/camera-ui-ml/)
|
|
37
|
+
|
|
38
|
+
Shared ML inference framework for camera.ui's detection plugins.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
_Part of the camera.ui ecosystem - A comprehensive camera management solution._
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# camera-ui-ml
|
|
2
|
+
|
|
3
|
+
[camera-ui-ml on PyPI](https://pypi.org/project/camera-ui-ml/)
|
|
4
|
+
|
|
5
|
+
Shared ML inference framework for camera.ui's detection plugins.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
_Part of the camera.ui ecosystem - A comprehensive camera management solution._
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .backend import (
|
|
4
|
+
DType,
|
|
5
|
+
InferenceBackend,
|
|
6
|
+
InputSpec,
|
|
7
|
+
Layout,
|
|
8
|
+
NDArray,
|
|
9
|
+
Normalize,
|
|
10
|
+
Outputs,
|
|
11
|
+
)
|
|
12
|
+
from .detectors import (
|
|
13
|
+
BaseDetector,
|
|
14
|
+
BoxDetector,
|
|
15
|
+
Embedder,
|
|
16
|
+
NormalizedDetection,
|
|
17
|
+
OcrResult,
|
|
18
|
+
ParseKind,
|
|
19
|
+
PlateOcr,
|
|
20
|
+
)
|
|
21
|
+
from .geometry import (
|
|
22
|
+
Box,
|
|
23
|
+
NormalizedBox,
|
|
24
|
+
clamp_box,
|
|
25
|
+
cxcywh_to_xyxy,
|
|
26
|
+
normalize_box,
|
|
27
|
+
pad_box,
|
|
28
|
+
scale_box,
|
|
29
|
+
)
|
|
30
|
+
from .model_manager import BaseModelManager
|
|
31
|
+
from .parsing import (
|
|
32
|
+
RawDetection,
|
|
33
|
+
channels_first,
|
|
34
|
+
decode_ocr,
|
|
35
|
+
l2_normalize,
|
|
36
|
+
nms,
|
|
37
|
+
parse_end2end,
|
|
38
|
+
parse_yolov9,
|
|
39
|
+
)
|
|
40
|
+
from .pipeline import prepare_executor, run_prepare
|
|
41
|
+
from .preprocess import crop_rgb, decode_image, frame_to_rgb, resize_rgb, to_pil, to_tensor
|
|
42
|
+
|
|
43
|
+
__all__ = [
|
|
44
|
+
# backend seam
|
|
45
|
+
"InferenceBackend",
|
|
46
|
+
"InputSpec",
|
|
47
|
+
"Outputs",
|
|
48
|
+
"NDArray",
|
|
49
|
+
"Layout",
|
|
50
|
+
"Normalize",
|
|
51
|
+
"DType",
|
|
52
|
+
# model manager
|
|
53
|
+
"BaseModelManager",
|
|
54
|
+
# detectors (ClipEncoder is opt-in via camera_ui_ml.detectors.clip)
|
|
55
|
+
"BaseDetector",
|
|
56
|
+
"BoxDetector",
|
|
57
|
+
"NormalizedDetection",
|
|
58
|
+
"ParseKind",
|
|
59
|
+
"Embedder",
|
|
60
|
+
"PlateOcr",
|
|
61
|
+
"OcrResult",
|
|
62
|
+
# geometry
|
|
63
|
+
"Box",
|
|
64
|
+
"NormalizedBox",
|
|
65
|
+
"cxcywh_to_xyxy",
|
|
66
|
+
"scale_box",
|
|
67
|
+
"clamp_box",
|
|
68
|
+
"normalize_box",
|
|
69
|
+
"pad_box",
|
|
70
|
+
# parsing
|
|
71
|
+
"RawDetection",
|
|
72
|
+
"channels_first",
|
|
73
|
+
"parse_yolov9",
|
|
74
|
+
"parse_end2end",
|
|
75
|
+
"decode_ocr",
|
|
76
|
+
"l2_normalize",
|
|
77
|
+
"nms",
|
|
78
|
+
# preprocess
|
|
79
|
+
"frame_to_rgb",
|
|
80
|
+
"resize_rgb",
|
|
81
|
+
"to_pil",
|
|
82
|
+
"to_tensor",
|
|
83
|
+
"crop_rgb",
|
|
84
|
+
"decode_image",
|
|
85
|
+
# pipeline
|
|
86
|
+
"run_prepare",
|
|
87
|
+
"prepare_executor",
|
|
88
|
+
]
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from collections.abc import Mapping, Sequence
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from .pipeline import run_prepare
|
|
11
|
+
|
|
12
|
+
NDArray = np.ndarray[Any, Any]
|
|
13
|
+
|
|
14
|
+
Layout = Literal["nchw", "nhwc"]
|
|
15
|
+
Normalize = Literal["unit", "facenet", "none"]
|
|
16
|
+
DType = Literal["float32", "uint8"]
|
|
17
|
+
|
|
18
|
+
# Inference outputs in model output order; detectors index `outputs[0]` for the
|
|
19
|
+
# primary output (OCR scans all outputs, CLIP uses the single embedding output).
|
|
20
|
+
Outputs = Sequence[NDArray]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class InputSpec:
    """Immutable description of the input tensor a model expects.

    Instances are frozen, so a spec can be shared freely between callers.
    Field order is part of the interface: callers construct specs
    positionally as ``InputSpec(width, height, ...)``.
    """

    # Target input dimensions (presumably pixels -- confirm against to_tensor).
    width: int
    height: int

    # Axis ordering of the produced tensor.
    layout: Layout = "nchw"

    # "unit" = /255, "facenet" = (x-127.5)/128, "none" = raw
    normalize: Normalize = "unit"

    # Element type of the produced tensor.
    dtype: DType = "float32"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class InferenceBackend(ABC):
    """Abstract interface every inference runtime must implement.

    Concrete backends provide ``input_size``, ``metadata``, ``infer`` and
    ``close``. ``adapt`` and ``run`` have default implementations built on
    top of those.
    """

    @property
    @abstractmethod
    def input_size(self) -> tuple[int, int]:
        """Model input size as a pair of ints."""

    @abstractmethod
    def metadata(self) -> Mapping[str, str]:
        """Return the model's string-to-string metadata."""

    @abstractmethod
    async def infer(self, inputs: Sequence[Any]) -> Outputs:
        """Run the model on already-prepared ``inputs`` and return its outputs."""

    @abstractmethod
    def close(self) -> None:
        """Release the backend."""

    def adapt(self, image: NDArray, spec: InputSpec) -> Sequence[Any]:
        """Realize the canonical HWC uint8 RGB image into runtime model inputs.

        Default: one normalized tensor per ``spec`` (onnx/openvino/ncnn). CoreML
        image-input models override this to pass a PIL image instead. Runs on the
        shared prepare-executor (CPU)."""
        # Imported lazily, exactly as the original does.
        from .preprocess import to_tensor

        tensor = to_tensor(image, spec)
        return [tensor]

    async def run(self, image: NDArray, spec: InputSpec) -> Outputs:
        """Prepare ``image`` via :meth:`adapt`, then feed the result to :meth:`infer`."""

        def _prepare() -> list[Any]:
            # Materialize a list so ``infer`` always receives a concrete sequence.
            return list(self.adapt(image, spec))

        model_inputs = await run_prepare(_prepare)
        return await self.infer(model_inputs)
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .base import BaseDetector
|
|
4
|
+
from .box import BoxDetector, NormalizedDetection, ParseKind
|
|
5
|
+
from .embedder import Embedder
|
|
6
|
+
from .ocr import DEFAULT_ALPHABET, OcrResult, PlateOcr
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"BaseDetector",
|
|
10
|
+
"BoxDetector",
|
|
11
|
+
"NormalizedDetection",
|
|
12
|
+
"ParseKind",
|
|
13
|
+
"Embedder",
|
|
14
|
+
"PlateOcr",
|
|
15
|
+
"OcrResult",
|
|
16
|
+
"DEFAULT_ALPHABET",
|
|
17
|
+
]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
5
|
+
from camera_ui_sdk import LoggerService
|
|
6
|
+
|
|
7
|
+
from ..backend import InferenceBackend
|
|
8
|
+
from ..model_manager import BaseModelManager
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BaseDetector:
    """Common lifecycle (initialize / close / readiness) for model-backed detectors.

    Subclasses override :meth:`_configure` to read model-derived settings
    once the backend is available.
    """

    name: str = "detector"

    def __init__(self, manager: BaseModelManager, logger: LoggerService) -> None:
        self.manager = manager
        self.logger = logger
        self.backend: InferenceBackend | None = None
        self.initialized = False
        self.closed = False
        self._init_task: asyncio.Task[None] | None = None

    async def initialize(self, model_name: str) -> None:
        """Load ``model_name`` once; concurrent callers await the same task.

        Failures are logged by :meth:`_do_initialize` rather than raised, so
        callers should check :meth:`_ready` afterwards.
        """
        if self.initialized:
            return
        task = self._init_task
        if task is None:
            task = asyncio.create_task(self._do_initialize(model_name))
            self._init_task = task
        await task

    async def close(self) -> None:
        """Mark the detector closed, cancel a pending init and close the backend."""
        self.closed = True
        if self._init_task is not None:
            self._init_task.cancel()
            self._init_task = None
        self.initialized = False
        if self.backend is not None:
            self.backend.close()
            self.backend = None

    async def _do_initialize(self, model_name: str) -> None:
        """Obtain the backend from the manager and configure the detector."""
        try:
            backend = await self.manager.ensure_backend(model_name)
            if self.closed:
                # close() already ran: do not publish a backend on a closed
                # detector, where it would linger unused.
                return
            self.backend = backend
            await self._configure(model_name)
            self.initialized = True
            self.logger.success(f"Loaded {self.name}: {model_name}")
        except Exception as error:
            self.logger.error(f"Failed to initialize {self.name}: {error}")
        finally:
            # Only clear the slot if it still refers to this task; a newer
            # initialize() may have installed a fresh task after close().
            if self._init_task is asyncio.current_task():
                self._init_task = None

    async def _configure(self, model_name: str) -> None:
        """Read model-derived config (input size, labels) from ``self.backend``.

        Overridden by subclasses that need it; the default is a no-op."""

    def _ready(self) -> bool:
        """Return True once initialization succeeded and a backend is attached."""
        return self.initialized and self.backend is not None
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import ast
|
|
4
|
+
import re
|
|
5
|
+
from collections.abc import Mapping
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from camera_ui_sdk import ImageMetadata, LoggerService, VideoFrameData
|
|
10
|
+
|
|
11
|
+
from ..backend import InputSpec, NDArray, Normalize
|
|
12
|
+
from ..geometry import NormalizedBox, normalize_box, scale_box
|
|
13
|
+
from ..model_manager import BaseModelManager
|
|
14
|
+
from ..parsing import RawDetection, channels_first, nms, parse_end2end, parse_yolov9
|
|
15
|
+
from ..preprocess import decode_image, frame_to_rgb
|
|
16
|
+
from .base import BaseDetector
|
|
17
|
+
|
|
18
|
+
ParseKind = Literal["yolov9", "end2end"]
|
|
19
|
+
NormalizedDetection = tuple[int, float, NormalizedBox]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BoxDetector(BaseDetector):
    """Bounding-box detector that parses either "yolov9" or "end2end" outputs."""

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        name: str = "detector",
        parse: ParseKind = "yolov9",
        size: tuple[int, int] = (320, 320),
        normalize: Normalize = "unit",
        multiclass: bool = False,
        apply_nms: bool = False,
        threshold: float = 0.5,
    ) -> None:
        super().__init__(manager, logger)

        # Public, caller-visible settings.
        self.name = name
        self.threshold = threshold
        self.input_size = size  # replaced by the backend's size in _configure
        self.labels: dict[int, str] = {}

        # Parsing / preprocessing options.
        self._parse = parse
        self._normalize = normalize
        self._multiclass = multiclass
        self._apply_nms = apply_nms

    async def _configure(self, model_name: str) -> None:
        """Adopt the backend's input size and, for multiclass models, its labels."""
        assert self.backend is not None
        backend = self.backend
        self.input_size = backend.input_size
        if not self._multiclass:
            return
        self.labels = _parse_labels(backend.metadata())

    @property
    def _spec(self) -> InputSpec:
        """Input spec built from the current input size and normalization."""
        width, height = self.input_size[0], self.input_size[1]
        return InputSpec(width, height, layout="nchw", normalize=self._normalize)

    async def detect(self, image: NDArray, threshold: float | None = None) -> list[RawDetection]:
        """Detect on an HWC uint8 RGB array; boxes are in model-input pixel coords."""
        if not self._ready():
            return []
        assert self.backend is not None

        outputs = await self.backend.run(image, self._spec)
        primary = np.asarray(outputs[0])
        # A per-call threshold overrides the instance default.
        cutoff = threshold if threshold is not None else self.threshold

        if self._parse == "yolov9":
            found = parse_yolov9(channels_first(primary), cutoff)
        else:
            found = parse_end2end(np.squeeze(primary), cutoff)

        if self._apply_nms:
            return nms(found)
        return found

    async def detect_frame(self, frame: VideoFrameData, threshold: float | None = None) -> list[RawDetection]:
        """Detect on a raw video frame; boxes are rescaled to frame pixel coords."""
        if not self._ready():
            return []

        rgb = frame_to_rgb(frame["data"], frame["width"], frame["height"], frame["format"])
        raw = await self.detect(rgb, threshold)

        # Ratio between frame size and model-input size, per axis.
        scale_x = frame["width"] / self.input_size[0]
        scale_y = frame["height"] / self.input_size[1]

        rescaled: list[RawDetection] = []
        for cid, conf, box in raw:
            rescaled.append((cid, conf, scale_box(box, scale_x, scale_y)))
        return rescaled

    async def detect_single(
        self, image_data: bytes, metadata: ImageMetadata, threshold: float | None = None
    ) -> list[NormalizedDetection]:
        """Detect on an encoded image; boxes are normalized to 0..1."""
        if not self._ready():
            return []

        decoded = decode_image(image_data)
        raw = await self.detect(decoded, threshold)

        normalized: list[NormalizedDetection] = []
        for cid, conf, box in raw:
            # Boxes are in model-input coordinates, so divide by the input size.
            normalized.append((cid, conf, normalize_box(box, self.input_size[0], self.input_size[1])))
        return normalized
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _parse_labels(metadata: Mapping[str, str]) -> dict[int, str]:
|
|
105
|
+
names = metadata.get("names") or metadata.get("yolo.names")
|
|
106
|
+
if names:
|
|
107
|
+
parsed: dict[Any, Any] = ast.literal_eval(names)
|
|
108
|
+
return {int(key): str(value) for key, value in parsed.items()}
|
|
109
|
+
|
|
110
|
+
classes = metadata.get("classes")
|
|
111
|
+
if not classes:
|
|
112
|
+
return {}
|
|
113
|
+
|
|
114
|
+
labels: dict[int, str] = {}
|
|
115
|
+
for row, line in enumerate(classes.split(",")):
|
|
116
|
+
pair = re.split(r"[:\s]+", line.strip(), maxsplit=1)
|
|
117
|
+
if len(pair) == 2 and pair[0].strip().isdigit():
|
|
118
|
+
labels[int(pair[0])] = pair[1].strip()
|
|
119
|
+
else:
|
|
120
|
+
labels[row] = line.strip()
|
|
121
|
+
return labels
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from camera_ui_sdk import LoggerService, VideoFrameData
|
|
8
|
+
from transformers import CLIPProcessor
|
|
9
|
+
|
|
10
|
+
from ..backend import InferenceBackend, NDArray
|
|
11
|
+
from ..model_manager import BaseModelManager
|
|
12
|
+
from ..parsing import l2_normalize
|
|
13
|
+
from ..pipeline import run_prepare
|
|
14
|
+
from ..preprocess import frame_to_rgb, to_pil
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ClipEncoder:
    """CLIP image/text encoder built on two inference backends and a processor.

    The vision and text models are obtained from the model manager; input
    preparation is delegated to a ``CLIPProcessor`` loaded from
    ``pretrained``. All ``embed_*`` methods return L2-normalized embeddings
    as plain float lists, or empty results while the encoder is not ready.
    """

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        pretrained: str = "openai/clip-vit-base-patch32",
        embedding_model: str = "clip-vit-base-patch32",
    ) -> None:
        self.manager = manager
        self.logger = logger
        self.embedding_model = embedding_model

        self.vision: InferenceBackend | None = None
        self.text: InferenceBackend | None = None
        self.processor: Any = None

        self.initialized = False
        self.closed = False
        self._pretrained = pretrained
        self._init_task: asyncio.Task[None] | None = None

    async def initialize(self, vision_model: str, text_model: str) -> None:
        """Load both models and the processor once; concurrent callers share the task.

        Failures are logged by :meth:`_do_initialize` rather than raised.
        """
        if self.initialized:
            return
        task = self._init_task
        if task is None:
            task = asyncio.create_task(self._do_initialize(vision_model, text_model))
            self._init_task = task
        await task

    async def close(self) -> None:
        """Mark the encoder closed, cancel a pending init and close both backends."""
        self.closed = True
        if self._init_task is not None:
            self._init_task.cancel()
            self._init_task = None
        self.initialized = False
        if self.vision is not None:
            self.vision.close()
            self.vision = None
        if self.text is not None:
            self.text.close()
            self.text = None
        self.processor = None

    async def embed_image(self, image: NDArray) -> list[float]:
        """Embed an HWC uint8 RGB image."""
        if not self._ready():
            return []
        assert self.vision is not None

        pil = to_pil(image)
        pixel_values = await run_prepare(lambda: self._vision_input(pil))
        outputs = await self.vision.infer([pixel_values])
        return [float(value) for value in l2_normalize(outputs[0])]

    async def embed_text(self, text: str) -> list[float]:
        """Embed a text string."""
        if not self._ready():
            return []
        assert self.text is not None

        input_ids, attention_mask = await run_prepare(lambda: self._text_input(text))
        outputs = await self.text.infer([input_ids, attention_mask])
        return [float(value) for value in l2_normalize(outputs[0])]

    async def embed_frame(self, width: int, height: int, data: bytes) -> list[float]:
        """Convert a raw frame buffer to RGB and embed it."""
        return await self.embed_image(frame_to_rgb(data, width, height))

    async def embed_frames(self, frames: list[VideoFrameData]) -> list[list[float]]:
        """Embed each frame in order; yields one empty list per frame when not ready."""
        if not self._ready():
            return [[] for _ in frames]
        return [await self.embed_frame(frame["width"], frame["height"], frame["data"]) for frame in frames]

    def _ready(self) -> bool:
        """Return True when init succeeded and both backends and the processor exist."""
        return (
            self.initialized
            and self.vision is not None
            and self.text is not None
            and self.processor is not None
        )

    async def _do_initialize(self, vision_model: str, text_model: str) -> None:
        """Load the backends and the processor, publishing them only on success."""
        try:
            # Keep everything local until all three pieces are available, so
            # a failure or a concurrent close() never leaves the encoder
            # holding a partial set of resources.
            vision = await self.manager.ensure_backend(vision_model)
            text = await self.manager.ensure_backend(text_model)
            processor = await asyncio.to_thread(CLIPProcessor.from_pretrained, self._pretrained)
            if self.closed:
                return
            self.vision = vision
            self.text = text
            self.processor = processor
            self.initialized = True
            self.logger.success(f"Loaded CLIP: {vision_model} + {text_model}")
        except Exception as error:
            self.logger.error(f"Failed to initialize CLIP encoder: {error}")
        finally:
            # Only clear the slot if it still refers to this task; a newer
            # initialize() may have installed a fresh task after close().
            if self._init_task is asyncio.current_task():
                self._init_task = None

    def _vision_input(self, pil: Any) -> NDArray:
        """Run the processor on an image and return ``pixel_values`` as float32."""
        inputs = self.processor(images=pil, return_tensors="np", padding="max_length", truncation=True)
        return np.asarray(inputs["pixel_values"], dtype=np.float32)

    def _text_input(self, text: str) -> tuple[NDArray, NDArray]:
        """Run the processor on text and return ``(input_ids, attention_mask)`` as int64."""
        inputs = self.processor(text=text, return_tensors="np", padding="max_length", truncation=True)
        return (
            np.asarray(inputs["input_ids"], dtype=np.int64),
            np.asarray(inputs["attention_mask"], dtype=np.int64),
        )
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from camera_ui_sdk import LoggerService
|
|
4
|
+
|
|
5
|
+
from ..backend import InputSpec, NDArray
|
|
6
|
+
from ..geometry import Box
|
|
7
|
+
from ..model_manager import BaseModelManager
|
|
8
|
+
from ..parsing import l2_normalize
|
|
9
|
+
from ..preprocess import frame_to_rgb
|
|
10
|
+
from .base import BaseDetector
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Embedder(BaseDetector):
    """Detector that turns an image crop into an L2-normalized embedding vector."""

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        size: int = 160,
        name: str = "face embedder",
    ) -> None:
        super().__init__(manager, logger)
        self.name = name
        # Square model input: the same edge length on both axes.
        self.input_size = (size, size)

    @property
    def _spec(self) -> InputSpec:
        """Input spec: NCHW layout with "facenet" normalization."""
        width, height = self.input_size[0], self.input_size[1]
        return InputSpec(width, height, layout="nchw", normalize="facenet")

    async def embed(self, image: NDArray) -> list[float]:
        """Embed an HWC uint8 RGB face crop."""
        if not self._ready():
            return []
        assert self.backend is not None

        outputs = await self.backend.run(image, self._spec)
        unit_vector = l2_normalize(outputs[0])
        return [float(component) for component in unit_vector]

    async def embed_from_crop(self, frame_data: bytes, width: int, height: int, box: Box) -> list[float]:
        """Crop ``box`` out of a raw frame and embed it.

        The box is truncated to ints and clipped to the frame; an empty
        result is returned when the clipped box has no area.
        """
        if not self._ready():
            return []

        left = max(0, int(box[0]))
        top = max(0, int(box[1]))
        right = min(width, int(box[2]))
        bottom = min(height, int(box[3]))
        if right <= left or bottom <= top:
            return []

        frame_rgb = frame_to_rgb(frame_data, width, height)
        crop = frame_rgb[top:bottom, left:right]
        return await self.embed(crop)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
from camera_ui_sdk import LoggerService
|
|
7
|
+
|
|
8
|
+
from ..backend import InputSpec, NDArray, Outputs
|
|
9
|
+
from ..geometry import Box
|
|
10
|
+
from ..model_manager import BaseModelManager
|
|
11
|
+
from ..parsing import decode_ocr
|
|
12
|
+
from ..preprocess import frame_to_rgb
|
|
13
|
+
from .base import BaseDetector
|
|
14
|
+
|
|
15
|
+
#: Default plate alphabet: digits + uppercase letters + pad. Plugins pass the
|
|
16
|
+
#: exact constant their model was trained with.
|
|
17
|
+
DEFAULT_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class OcrResult:
    """Immutable OCR outcome: the recognized text and its confidence."""

    # Recognized string.
    text: str

    # Confidence reported for ``text``.
    confidence: float
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class PlateOcr(BaseDetector):
    """Detector that reads the text of a cropped license plate."""

    def __init__(
        self,
        manager: BaseModelManager,
        logger: LoggerService,
        *,
        width: int = 128,
        height: int = 64,
        slots: int = 10,
        alphabet: str = DEFAULT_ALPHABET,
        pad_char: str = "_",
        name: str = "OCR",
    ) -> None:
        super().__init__(manager, logger)
        self.name = name
        self.input_size = (width, height)

        # Decoding parameters handed to decode_ocr.
        self._slots = slots
        self._alphabet = alphabet
        self._pad = pad_char

    @property
    def _spec(self) -> InputSpec:
        """Input spec: raw (un-normalized) uint8 pixels in NHWC layout."""
        width, height = self.input_size[0], self.input_size[1]
        return InputSpec(width, height, layout="nhwc", normalize="none", dtype="uint8")

    async def recognize(self, image: NDArray) -> OcrResult | None:
        """Recognize an HWC uint8 RGB plate crop."""
        if not self._ready():
            return None
        assert self.backend is not None

        outputs = await self.backend.run(image, self._spec)
        return self._decode(outputs)

    async def recognize_from_crop(
        self, frame_data: bytes, width: int, height: int, box: Box
    ) -> OcrResult | None:
        """Crop ``box`` out of a raw frame and recognize it.

        The box is truncated to ints and clipped to the frame; ``None`` is
        returned when the clipped box has no area.
        """
        if not self._ready():
            return None

        left = max(0, int(box[0]))
        top = max(0, int(box[1]))
        right = min(width, int(box[2]))
        bottom = min(height, int(box[3]))
        if right <= left or bottom <= top:
            return None

        frame_rgb = frame_to_rgb(frame_data, width, height)
        crop = frame_rgb[top:bottom, left:right]
        return await self.recognize(crop)

    def _decode(self, outputs: Outputs) -> OcrResult | None:
        """Pick the logits output and decode it into an :class:`OcrResult`.

        Every output is scanned; a 3-D array whose second axis equals the
        slot count is treated as logits, and the last such output wins.
        Returns ``None`` when no output matches or the decoded text is empty.
        """
        candidates = [
            arr[0]
            for arr in (np.asarray(value) for value in outputs)
            if arr.ndim == 3 and arr.shape[1] == self._slots
        ]
        if not candidates:
            return None

        text, confidence = decode_ocr(candidates[-1], self._alphabet, self._pad)
        if not text:
            return None
        return OcrResult(text, confidence)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TypedDict
|
|
4
|
+
|
|
5
|
+
Box = tuple[float, float, float, float]
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NormalizedBox(TypedDict):
    """Box expressed as a top-left corner plus a size.

    ``normalize_box`` produces these with every value divided by the
    reference width/height.
    """

    # Corner position.
    x: float
    y: float

    # Extent along each axis.
    width: float
    height: float
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def cxcywh_to_xyxy(cx: float, cy: float, w: float, h: float) -> Box:
    """Convert a center/size box into ``(x1, y1, x2, y2)`` corner form."""
    dx, dy = w / 2.0, h / 2.0
    left, top = cx - dx, cy - dy
    right, bottom = cx + dx, cy + dy
    return (left, top, right, bottom)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def scale_box(box: Box, scale_x: float, scale_y: float) -> Box:
    """Multiply the x coordinates by ``scale_x`` and the y coordinates by ``scale_y``."""
    left, top, right, bottom = box
    scaled_x = (left * scale_x, right * scale_x)
    scaled_y = (top * scale_y, bottom * scale_y)
    return (scaled_x[0], scaled_y[0], scaled_x[1], scaled_y[1])
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def clamp_box(box: Box, width: float, height: float) -> Box:
    """Clip each coordinate into ``[0, width]`` (x) or ``[0, height]`` (y)."""

    def _clip(value: float, upper: float) -> float:
        # Lower bound first, then upper bound -- same nesting as min(max(...)).
        return min(max(value, 0.0), upper)

    left, top, right, bottom = box
    return (
        _clip(left, width),
        _clip(top, height),
        _clip(right, width),
        _clip(bottom, height),
    )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def normalize_box(box: Box, width: float, height: float) -> NormalizedBox:
    """Convert a corner-form box into a corner+size box scaled by ``width``/``height``."""
    left, top, right, bottom = box
    span_x = right - left
    span_y = bottom - top
    return dict(
        x=left / width,
        y=top / height,
        width=span_x / width,
        height=span_y / height,
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def pad_box(box: Box, width: float, height: float, padding: float, min_size: int = 0) -> Box:
    """Grow a box by a relative margin and clamp it to the frame.

    Each side moves outward by ``padding`` times the box's size on that
    axis. When ``min_size`` is positive and the box is smaller than it on an
    axis, half of the shortfall is added per side as well. The result is
    clamped to ``[0, width] x [0, height]``. Used to crop faces/plates with
    context.
    """
    x1, y1, x2, y2 = box
    box_w = x2 - x1
    box_h = y2 - y1

    if min_size > 0:
        # Half of the shortfall on each side; never negative.
        grow_x = max(0.0, (min_size - box_w) / 2.0)
        grow_y = max(0.0, (min_size - box_h) / 2.0)
    else:
        grow_x = 0.0
        grow_y = 0.0

    margin_x = box_w * padding + grow_x
    margin_y = box_h * padding + grow_y

    expanded = (x1 - margin_x, y1 - margin_y, x2 + margin_x, y2 + margin_y)
    return clamp_box(expanded, width, height)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from collections.abc import Mapping
|
|
7
|
+
|
|
8
|
+
import aiohttp
|
|
9
|
+
from camera_ui_sdk import LoggerService
|
|
10
|
+
|
|
11
|
+
from .backend import InferenceBackend
|
|
12
|
+
|
|
13
|
+
_CHUNK = 1024 * 1024
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseModelManager(ABC):
    """Downloads model files on demand and shares one loading task per model.

    Subclasses describe which files a model needs (:meth:`model_files`) and
    how to turn the downloaded files into a backend (:meth:`build_backend`).
    Files are stored under ``<storage_path>/models/<version>``.
    """

    def __init__(self, storage_path: str, logger: LoggerService, version: str) -> None:
        self.logger = logger
        self.model_path = os.path.join(storage_path, "models", version)
        # One in-flight or completed load task per model name.
        self._load_tasks: dict[str, asyncio.Task[InferenceBackend]] = {}

    async def ensure_backend(self, model_name: str) -> InferenceBackend:
        """Return the backend for ``model_name``, loading it at most once.

        Concurrent callers share the same load task. A load that fails or is
        cancelled is dropped from the cache so a later call can retry.
        Cancelling one caller does not cancel the shared load.

        Raises whatever the load raised (e.g. ``RuntimeError`` on a failed
        download).
        """
        task = self._load_tasks.get(model_name)
        if task is None:
            task = asyncio.create_task(self._load(model_name))
            self._load_tasks[model_name] = task
            task.add_done_callback(lambda done, key=model_name: self._evict_failed(key, done))
        # shield(): awaiting a task directly would propagate this caller's
        # cancellation into the load that other callers are waiting on.
        return await asyncio.shield(task)

    def _evict_failed(self, model_name: str, task: asyncio.Task[InferenceBackend]) -> None:
        """Forget ``task`` if it did not produce a backend, so the load can be retried."""
        failed = task.cancelled() or task.exception() is not None
        # Identity check: reset() or a newer load may have replaced the entry.
        if failed and self._load_tasks.get(model_name) is task:
            del self._load_tasks[model_name]

    def reset(self) -> None:
        """Drop all cached load tasks."""
        self._load_tasks.clear()

    @abstractmethod
    def model_files(self, model_name: str) -> Mapping[str, tuple[str, str]]:
        """Map of file key → (download_url, path relative to ``model_path``)."""

    @abstractmethod
    async def build_backend(self, model_name: str, paths: Mapping[str, str]) -> InferenceBackend:
        """Build the runtime backend from the (already downloaded) local paths."""

    async def _load(self, model_name: str) -> InferenceBackend:
        """Download every file of ``model_name`` in turn, then build its backend."""
        files = self.model_files(model_name)
        for url, rel in files.values():
            await self._download(url, rel)

        paths = {key: os.path.join(self.model_path, rel) for key, (_url, rel) in files.items()}
        return await self.build_backend(model_name, paths)

    async def _download(self, url: str, rel: str) -> None:
        """Fetch ``url`` into ``model_path/rel`` unless that file already exists.

        Data is streamed into a ``.tmp`` sibling and moved into place only
        after the transfer completed, so a partial download is never
        mistaken for a finished file. The temp file is removed on failure.

        Raises:
            RuntimeError: the server answered with a non-2xx status.
        """
        full_path = os.path.join(self.model_path, rel)
        if os.path.isfile(full_path):
            return

        os.makedirs(os.path.dirname(full_path), exist_ok=True)
        tmp_path = full_path + ".tmp"
        name = os.path.basename(rel)
        self.logger.log(f"Downloading {name}...")

        downloaded = 0
        try:
            async with aiohttp.ClientSession() as session, session.get(url) as response:
                if response.status < 200 or response.status >= 300:
                    raise RuntimeError(f"Error downloading {url}: {response.status}")

                total = int(response.headers.get("content-length", 0))
                last_step = 0

                with open(tmp_path, "wb") as handle:
                    async for chunk in response.content.iter_chunked(_CHUNK):
                        if not chunk:
                            continue
                        downloaded += len(chunk)
                        handle.write(chunk)
                        # Progress is only reported for bodies larger than one
                        # chunk, in 25% steps.
                        if total > _CHUNK:
                            percent = min(100, downloaded * 100 // total)
                            if percent >= last_step + 25:
                                last_step = (percent // 25) * 25
                                self.logger.log(f"Downloading {name}... {last_step}%")

            # os.replace overwrites an existing target on every platform,
            # whereas os.rename fails on Windows if the target exists.
            os.replace(tmp_path, full_path)
        except BaseException:
            # Includes cancellation: never leave a partial temp file behind.
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            raise

        self.logger.log(f"Downloaded {name} ({downloaded / _CHUNK:.1f} MB)")
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from .geometry import Box, cxcywh_to_xyxy
|
|
8
|
+
|
|
9
|
+
# Loose alias for any NumPy array; neither shape nor element type is constrained.
NDArray = np.ndarray[Any, Any]
|
|
10
|
+
# One raw detection as emitted by the parsers in this module: (class_id, confidence, box).
RawDetection = tuple[int, float, Box]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def channels_first(output: NDArray) -> NDArray:
    """Squeeze ``output`` and, for 2-D results, put the shorter axis first.

    All size-1 axes are removed. If exactly two axes remain and the first is
    longer than the second, the transpose is returned; otherwise the squeezed
    array is returned unchanged (including every non-2-D result).
    """
    compact = np.squeeze(output)
    if compact.ndim == 2:
        rows, cols = compact.shape
        if rows > cols:
            return compact.T
    return compact
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def parse_yolov9(results: NDArray, threshold: float) -> list[RawDetection]:
    """Turn a 2-D YOLOv9-style output into raw detections.

    Rows 0-3 of ``results`` are read as ``cx, cy, w, h`` and every row from
    index 4 on as one class-score row; columns are candidates. Every
    (class, candidate) pair scoring strictly above ``threshold`` yields one
    detection, so a candidate can appear once per class that passes.

    Returns:
        ``(class_id, confidence, box)`` tuples with the box converted by
        ``cxcywh_to_xyxy``.
    """
    class_scores = results[4:]
    hits = np.argwhere(class_scores > threshold)

    found: list[RawDetection] = []
    for cls, col in hits:
        cx, cy, w, h = (float(results[row, col]) for row in range(4))
        found.append((int(cls), float(class_scores[cls, col]), cxcywh_to_xyxy(cx, cy, w, h)))
    return found
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def parse_end2end(rows: NDArray, threshold: float) -> list[RawDetection]:
    """Convert rows of an end-to-end detector output into raw detections.

    Each row is indexed as: 1-4 box coordinates, 5 class id, 6 score; index 0
    is not read. Rows whose score is less than or equal to ``threshold`` are
    dropped. Box values are passed through unchanged (presumably already
    corner coordinates - confirm against the model export).
    """
    kept: list[RawDetection] = []
    for row in rows:
        confidence = float(row[6])
        if not confidence <= threshold:
            class_id = int(row[5])
            x1, y1, x2, y2 = (float(value) for value in row[1:5])
            kept.append((class_id, confidence, (x1, y1, x2, y2)))
    return kept
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def decode_ocr(logits: NDArray, alphabet: str, pad_char: str = "_") -> tuple[str, float]:
    """Greedy-decode per-slot scores into text plus a mean confidence.

    ``logits`` is iterated slot by slot; for each slot the highest-scoring
    index selects a character from ``alphabet`` (an index past the end of
    ``alphabet`` counts as ``pad_char``). Decoding stops at the first slot
    that resolves to ``pad_char``. A character's confidence is the softmax
    probability of its index within the slot.

    Returns:
        ``(text, confidence)`` where confidence is the mean over the decoded
        characters, or ``0.0`` when nothing was decoded.
    """
    alphabet_size = len(alphabet)
    text: list[str] = []
    probabilities: list[float] = []

    for slot in logits:
        best = int(np.argmax(slot))
        symbol = pad_char if best >= alphabet_size else alphabet[best]
        if symbol == pad_char:
            break
        # Subtract the maximum before exponentiating to keep exp() stable.
        exps = np.exp(slot - np.max(slot))
        probabilities.append(float(exps[best] / np.sum(exps)))
        text.append(symbol)

    decoded = "".join(text)
    if not probabilities:
        return decoded, 0.0
    return decoded, float(np.mean(probabilities))
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def l2_normalize(vector: NDArray) -> NDArray:
    """Flatten ``vector`` to float32 and scale it to unit Euclidean length.

    A vector whose norm is not positive (all zeros) is returned flattened
    but otherwise unscaled, avoiding a division by zero.
    """
    flattened = np.asarray(vector, dtype=np.float32).flatten()
    length = float(np.linalg.norm(flattened))
    if length > 0.0:
        return flattened / length
    return flattened
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def nms(detections: list[RawDetection], iou_threshold: float = 0.45) -> list[RawDetection]:
    """Greedy non-maximum suppression over ``detections``.

    Detections are visited from highest to lowest confidence. Each visited
    detection is kept and removes every remaining detection whose IoU with
    it exceeds ``iou_threshold``. Class ids are not consulted, so
    overlapping boxes of different classes suppress each other.

    Returns:
        The surviving detections, ordered by descending confidence.
    """
    if not detections:
        return []

    boxes = np.array([d[2] for d in detections], dtype=np.float32)
    scores = np.array([d[1] for d in detections], dtype=np.float32)
    order = scores.argsort()[::-1]

    keep: list[int] = []
    while order.size > 0:
        best = int(order[0])
        keep.append(best)
        if order.size == 1:
            break
        rest = order[1:]
        # Only candidates that overlap the kept box little enough stay in play.
        order = rest[_iou(boxes[best], boxes[rest]) <= iou_threshold]

    return [detections[i] for i in keep]


def _iou(box: NDArray, others: NDArray) -> NDArray:
    """Intersection-over-union of one ``box`` against each row of ``others``.

    Boxes are indexed as ``x1, y1, x2, y2``. Pairs whose union area is not
    positive (degenerate, zero-area boxes) yield ``0.0``.
    """
    x1 = np.maximum(box[0], others[:, 0])
    y1 = np.maximum(box[1], others[:, 1])
    x2 = np.minimum(box[2], others[:, 2])
    y2 = np.minimum(box[3], others[:, 3])

    inter = np.clip(x2 - x1, 0.0, None) * np.clip(y2 - y1, 0.0, None)
    area = (box[2] - box[0]) * (box[3] - box[1])
    areas = (others[:, 2] - others[:, 0]) * (others[:, 3] - others[:, 1])
    union = area + areas - inter

    # The quotient is computed for every pair before masking, so zero-union
    # pairs would emit divide/invalid RuntimeWarnings; silence them because
    # those entries are replaced by 0.0 below.
    with np.errstate(divide="ignore", invalid="ignore"):
        ratio = inter / union
    return np.where(union > 0.0, ratio, 0.0)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
7
|
+
from typing import TypeVar
|
|
8
|
+
|
|
9
|
+
T = TypeVar("T")
|
|
10
|
+
|
|
11
|
+
# Size of the shared preparation pool: one thread per CPU, capped at 8 (4 when the CPU count is unknown).
_PREPARE_WORKERS = min(8, os.cpu_count() or 4)
|
|
12
|
+
# Module-wide thread pool used by run_prepare(); its threads carry the "ml-prepare" name prefix.
_prepare_executor = ThreadPoolExecutor(max_workers=_PREPARE_WORKERS, thread_name_prefix="ml-prepare")
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
async def run_prepare(fn: Callable[[], T]) -> T:
    """Run the blocking callable ``fn`` on the shared preparation pool.

    The calling coroutine is suspended until ``fn`` finishes; its return
    value is returned and any exception it raises propagates to the caller.
    """
    # Inside a coroutine a loop is always running, so get_running_loop() is
    # the explicit, non-deprecated way to obtain it.
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(_prepare_executor, fn)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def prepare_executor() -> ThreadPoolExecutor:
    """Return the module-wide thread pool that ``run_prepare`` submits work to."""
    return _prepare_executor
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
from PIL import Image
|
|
8
|
+
|
|
9
|
+
from .backend import InputSpec
|
|
10
|
+
from .geometry import Box
|
|
11
|
+
|
|
12
|
+
# Loose alias for any NumPy array; neither shape nor element type is constrained.
NDArray = np.ndarray[Any, Any]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def frame_to_rgb(data: bytes, width: int, height: int, fmt: str = "rgb") -> NDArray:
|
|
16
|
+
pixels = np.frombuffer(data, dtype=np.uint8)
|
|
17
|
+
if fmt == "rgb":
|
|
18
|
+
return pixels.reshape((height, width, 3))
|
|
19
|
+
if fmt == "rgba":
|
|
20
|
+
return pixels.reshape((height, width, 4))[:, :, :3]
|
|
21
|
+
if fmt == "gray":
|
|
22
|
+
return np.repeat(pixels.reshape((height, width, 1)), 3, axis=2)
|
|
23
|
+
raise ValueError(f"Unsupported frame format for detection: {fmt}")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def decode_image(data: bytes) -> NDArray:
    """Decode an encoded image held in ``data`` into an RGB pixel array.

    The bytes are opened with Pillow, converted to mode ``"RGB"`` and handed
    to NumPy.
    """
    stream = io.BytesIO(data)
    picture = Image.open(stream)
    return np.asarray(picture.convert("RGB"))
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def resize_rgb(rgb: NDArray, width: int, height: int) -> NDArray:
    """Resize an RGB array to ``width`` x ``height`` using Pillow.

    When ``rgb`` already has the requested size the very same array object
    is returned without copying.
    """
    current_height, current_width = rgb.shape[0], rgb.shape[1]
    if (current_width, current_height) == (width, height):
        return rgb
    scaled = Image.fromarray(rgb, mode="RGB").resize((width, height))
    return np.asarray(scaled)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def to_pil(rgb: NDArray, width: int | None = None, height: int | None = None) -> Image.Image:
    """Wrap an RGB array in a Pillow image, optionally resizing it.

    The image is resized only when both ``width`` and ``height`` are given
    and at least one of them differs from the image's current size.
    """
    picture = Image.fromarray(rgb, mode="RGB")
    if width is None or height is None:
        return picture
    if picture.width == width and picture.height == height:
        return picture
    return picture.resize((width, height))
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def to_tensor(rgb: NDArray, spec: InputSpec) -> NDArray:
|
|
44
|
+
resized = resize_rgb(rgb, spec.width, spec.height)
|
|
45
|
+
|
|
46
|
+
arr: NDArray
|
|
47
|
+
if spec.normalize == "unit":
|
|
48
|
+
arr = resized.astype(np.float32) / 255.0
|
|
49
|
+
elif spec.normalize == "facenet":
|
|
50
|
+
arr = (resized.astype(np.float32) - 127.5) / 128.0
|
|
51
|
+
else: # "none"
|
|
52
|
+
arr = resized.astype(np.uint8 if spec.dtype == "uint8" else np.float32)
|
|
53
|
+
|
|
54
|
+
if spec.layout == "nchw":
|
|
55
|
+
arr = arr.transpose(2, 0, 1)
|
|
56
|
+
|
|
57
|
+
return np.ascontiguousarray(np.expand_dims(arr, axis=0))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def crop_rgb(rgb: NDArray, box: Box) -> NDArray:
    """Cut the region described by ``box`` out of ``rgb``.

    ``box`` is read as ``(x1, y1, x2, y2)``; each value is rounded to the
    nearest integer and clamped to the image bounds before slicing. Without
    the clamp a negative coordinate would be taken as an offset from the far
    edge and produce an empty or unrelated crop for boxes that overhang the
    top or left border.

    Returns:
        A view on ``rgb`` covering the part of the box inside the image;
        it is empty when the box lies completely outside.
    """
    height, width = rgb.shape[:2]
    x1, y1, x2, y2 = (int(round(v)) for v in box)

    x1, x2 = max(0, min(x1, width)), max(0, min(x2, width))
    y1, y2 = max(0, min(y1, height)), max(0, min(y2, height))
    return rgb[y1:y2, x1:x2]
|
|
File without changes
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: camera-ui-ml
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Shared ML inference framework for camera.ui detection plugins (onnx, openvino, coreml, ncnn)
|
|
5
|
+
Author-email: seydx <hi@seydx.dev>
|
|
6
|
+
Maintainer-email: seydx <hi@seydx.dev>
|
|
7
|
+
Project-URL: Homepage, https://github.com/cameraui/ml
|
|
8
|
+
Project-URL: Documentation, https://github.com/cameraui/ml
|
|
9
|
+
Project-URL: Repository, https://github.com/cameraui/ml
|
|
10
|
+
Project-URL: Bug Tracker, https://github.com/cameraui/ml/issues
|
|
11
|
+
Project-URL: Changelog, https://github.com/cameraui/ml/blob/main/CHANGELOG.md
|
|
12
|
+
Keywords: camera.ui,python,ml,inference,detection
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE.md
|
|
23
|
+
Requires-Dist: typing_extensions>=4.15.0
|
|
24
|
+
Requires-Dist: camera-ui-sdk>=1.0.95
|
|
25
|
+
Requires-Dist: numpy>=2.0
|
|
26
|
+
Requires-Dist: pillow>=12.0
|
|
27
|
+
Requires-Dist: aiohttp>=3.12
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: pytest>=8; extra == "test"
|
|
30
|
+
Provides-Extra: clip
|
|
31
|
+
Requires-Dist: transformers>=4.40; extra == "clip"
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# camera-ui-ml
|
|
35
|
+
|
|
36
|
+
[camera-ui-ml on PyPI](https://pypi.org/project/camera-ui-ml/)
|
|
37
|
+
|
|
38
|
+
Shared ML inference framework for camera.ui's detection plugins.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
_Part of the camera.ui ecosystem - A comprehensive camera management solution._
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
LICENSE.md
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
camera_ui_ml/__init__.py
|
|
5
|
+
camera_ui_ml/backend.py
|
|
6
|
+
camera_ui_ml/geometry.py
|
|
7
|
+
camera_ui_ml/model_manager.py
|
|
8
|
+
camera_ui_ml/parsing.py
|
|
9
|
+
camera_ui_ml/pipeline.py
|
|
10
|
+
camera_ui_ml/preprocess.py
|
|
11
|
+
camera_ui_ml/py.typed
|
|
12
|
+
camera_ui_ml.egg-info/PKG-INFO
|
|
13
|
+
camera_ui_ml.egg-info/SOURCES.txt
|
|
14
|
+
camera_ui_ml.egg-info/dependency_links.txt
|
|
15
|
+
camera_ui_ml.egg-info/requires.txt
|
|
16
|
+
camera_ui_ml.egg-info/top_level.txt
|
|
17
|
+
camera_ui_ml/detectors/__init__.py
|
|
18
|
+
camera_ui_ml/detectors/base.py
|
|
19
|
+
camera_ui_ml/detectors/box.py
|
|
20
|
+
camera_ui_ml/detectors/clip.py
|
|
21
|
+
camera_ui_ml/detectors/embedder.py
|
|
22
|
+
camera_ui_ml/detectors/ocr.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
camera_ui_ml
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=80.9.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "camera-ui-ml"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Shared ML inference framework for camera.ui detection plugins (onnx, openvino, coreml, ncnn)"
|
|
9
|
+
keywords = ["camera.ui", "python", "ml", "inference", "detection"]
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
license-files = ["LICENSE.md"]
|
|
12
|
+
authors = [{ name = "seydx", email = "hi@seydx.dev" }]
|
|
13
|
+
maintainers = [{ name = "seydx", email = "hi@seydx.dev" }]
|
|
14
|
+
requires-python = ">=3.11"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 5 - Production/Stable",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Programming Language :: Python :: 3.13",
|
|
23
|
+
]
|
|
24
|
+
dependencies = ["typing_extensions>=4.15.0", "camera-ui-sdk>=1.0.95", "numpy>=2.0", "pillow>=12.0", "aiohttp>=3.12"]
|
|
25
|
+
|
|
26
|
+
[project.optional-dependencies]
|
|
27
|
+
test = ["pytest>=8"]
|
|
28
|
+
clip = ["transformers>=4.40"]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/cameraui/ml"
|
|
32
|
+
Documentation = "https://github.com/cameraui/ml"
|
|
33
|
+
Repository = "https://github.com/cameraui/ml"
|
|
34
|
+
"Bug Tracker" = "https://github.com/cameraui/ml/issues"
|
|
35
|
+
Changelog = "https://github.com/cameraui/ml/blob/main/CHANGELOG.md"
|
|
36
|
+
|
|
37
|
+
[tool.pytest.ini_options]
|
|
38
|
+
testpaths = ["tests"]
|
|
39
|
+
|
|
40
|
+
[tool.setuptools.packages.find]
|
|
41
|
+
where = ["."]
|
|
42
|
+
include = ["camera_ui_ml*"]
|
|
43
|
+
|
|
44
|
+
[tool.setuptools.package-data]
|
|
45
|
+
camera_ui_ml = ["py.typed"]
|