dgenerate-ultralytics-headless 8.3.178__py3-none-any.whl → 8.3.180__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/METADATA +3 -3
- {dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/RECORD +14 -14
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +4 -1
- ultralytics/cfg/datasets/VOC.yaml +1 -1
- ultralytics/cfg/datasets/VisDrone.yaml +1 -1
- ultralytics/models/sam/predict.py +141 -85
- ultralytics/models/yolo/model.py +5 -1
- ultralytics/utils/checks.py +2 -2
- ultralytics/utils/downloads.py +5 -3
- {dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/top_level.txt +0 -0
{dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/METADATA CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dgenerate-ultralytics-headless
-Version: 8.3.178
+Version: 8.3.180
 Summary: Automatically built Ultralytics package with python-opencv-headless dependency instead of python-opencv
 Author-email: Glenn Jocher <glenn.jocher@ultralytics.com>, Jing Qiu <jing.qiu@ultralytics.com>
 Maintainer-email: Ultralytics <hello@ultralytics.com>
@@ -57,13 +57,13 @@ Requires-Dist: mkdocs-material>=9.5.9; extra == "dev"
 Requires-Dist: mkdocstrings[python]; extra == "dev"
 Requires-Dist: mkdocs-ultralytics-plugin>=0.1.26; extra == "dev"
 Requires-Dist: mkdocs-macros-plugin>=1.0.5; extra == "dev"
-Requires-Dist: click==8.2.1; extra == "dev"
 Provides-Extra: export
+Requires-Dist: numpy<2.0.0; extra == "export"
 Requires-Dist: onnx<1.18.0,>=1.12.0; extra == "export"
 Requires-Dist: coremltools>=8.0; (platform_system != "Windows" and python_version <= "3.13") and extra == "export"
 Requires-Dist: scikit-learn>=1.3.2; (platform_system != "Windows" and python_version <= "3.13") and extra == "export"
 Requires-Dist: openvino>=2024.0.0; extra == "export"
-Requires-Dist: tensorflow
+Requires-Dist: tensorflow<=2.19.0,>=2.0.0; extra == "export"
 Requires-Dist: tensorflowjs>=2.0.0; extra == "export"
 Requires-Dist: tensorstore>=0.1.63; (platform_machine == "aarch64" and python_version >= "3.9") and extra == "export"
 Requires-Dist: h5py!=3.11.0; platform_machine == "aarch64" and extra == "export"
```
{dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/RECORD CHANGED

```diff
@@ -1,4 +1,4 @@
-dgenerate_ultralytics_headless-8.3.178.dist-info/licenses/LICENSE,sha256=
+dgenerate_ultralytics_headless-8.3.180.dist-info/licenses/LICENSE,sha256=DZak_2itbUtvHzD3E7GNUYSRK6jdOJ-GqncQ2weavLA,34523
 tests/__init__.py,sha256=b4KP5_q-2IO8Br8YHOSLYnn7IwZS81l_vfEF2YPa2lM,894
 tests/conftest.py,sha256=LXtQJcFNWPGuzauTGkiXgsvVC3llJKfg22WcmhRzuQc,2593
 tests/test_cli.py,sha256=EMf5gTAopOnIz8VvzaM-Qb044o7D0flnUHYQ-2ffOM4,5670
@@ -8,10 +8,10 @@ tests/test_exports.py,sha256=CY-4xVZlVM16vdyIC0mSR3Ix59aiZm1qjFGIhSNmB20,11007
 tests/test_integrations.py,sha256=kl_AKmE_Qs1GB0_91iVwbzNxofm_hFTt0zzU6JF-pg4,6323
 tests/test_python.py,sha256=-qvdeg-hEcKU5mWSDEU24iFZ-i8FAwQRznSXpkp6WQ4,27928
 tests/test_solutions.py,sha256=tuf6n_fsI8KvSdJrnc-cqP2qYdiYqCWuVrx0z9dOz3Q,13213
-ultralytics/__init__.py,sha256=
+ultralytics/__init__.py,sha256=fHQo0GhHdl2c_XkrtYaXY22EnCd1IZeIe5C59ZLWimc,730
 ultralytics/assets/bus.jpg,sha256=wCAZxJecGR63Od3ZRERe9Aja1Weayrb9Ug751DS_vGM,137419
 ultralytics/assets/zidane.jpg,sha256=Ftc4aeMmen1O0A3o6GCDO9FlfBslLpTAw0gnetx7bts,50427
-ultralytics/cfg/__init__.py,sha256=
+ultralytics/cfg/__init__.py,sha256=Uj1br3-NVFvP6VY5CL4PK63mAQAom93XFC5cqSbM6t4,39887
 ultralytics/cfg/default.yaml,sha256=1SspGAK_K_DT7DBfEScJh4jsJUTOxahehZYj92xmj7o,8347
 ultralytics/cfg/datasets/Argoverse.yaml,sha256=4SGaJio9JFUkrscHJTPnH_QSbYm48Wbk8EFwl39zntc,3262
 ultralytics/cfg/datasets/DOTAv1.5.yaml,sha256=VZ_KKFX0H2YvlFVJ8JHcLWYBZ2xiQ6Z-ROSTiKWpS7c,1211
@@ -21,8 +21,8 @@ ultralytics/cfg/datasets/HomeObjects-3K.yaml,sha256=xEtSqEad-rtfGuIrERjjhdISggmP
 ultralytics/cfg/datasets/ImageNet.yaml,sha256=GvDWypLVG_H3H67Ai8IC1pvK6fwcTtF5FRhzO1OXXDU,42530
 ultralytics/cfg/datasets/Objects365.yaml,sha256=vLzbT3xgpLR-bHhrHOiYyzYvDIniRdevgSyPetm8QHk,9354
 ultralytics/cfg/datasets/SKU-110K.yaml,sha256=a52le1-JQ2YH6b1WLMUxVz7RkZ36YsmXgWyw0z3q9nQ,2542
-ultralytics/cfg/datasets/VOC.yaml,sha256=
-ultralytics/cfg/datasets/VisDrone.yaml,sha256=
+ultralytics/cfg/datasets/VOC.yaml,sha256=o09FWAAsr1MH3ftBJ_n-4Tmc3zxnVJL1HqlqKRUYVTQ,3774
+ultralytics/cfg/datasets/VisDrone.yaml,sha256=dYAewe84CrGmxAA_z6UnZUAd7peaw5l3ARDcssojADk,3604
 ultralytics/cfg/datasets/african-wildlife.yaml,sha256=SuloMp9WAZBigGC8az-VLACsFhTM76_O29yhTvUqdnU,915
 ultralytics/cfg/datasets/brain-tumor.yaml,sha256=qrxPO_t9wxbn2kHFwP3vGTzSWj2ELTLelUwYL3_b6nc,800
 ultralytics/cfg/datasets/carparts-seg.yaml,sha256=A4e9hM1unTY2jjZIXGiKSarF6R-Ad9R99t57OgRJ37w,1253
@@ -151,7 +151,7 @@ ultralytics/models/sam/__init__.py,sha256=iR7B06rAEni21eptg8n4rLOP0Z_qV9y9PL-L93
 ultralytics/models/sam/amg.py,sha256=IpcuIfC5KBRiF4sdrsPl1ecWEJy75axo1yG23r5BFsw,11783
 ultralytics/models/sam/build.py,sha256=J6n-_QOYLa63jldEZmhRe9D3Is_AJE8xyZLUjzfRyTY,12629
 ultralytics/models/sam/model.py,sha256=j1TwsLmtxhiXyceU31VPzGVkjRXGylphKrdPSzUJRJc,7231
-ultralytics/models/sam/predict.py,sha256=
+ultralytics/models/sam/predict.py,sha256=VBClks5F74BX0uZWpMfUjaXM_H222BYJ09lRYF6A2u8,86184
 ultralytics/models/sam/modules/__init__.py,sha256=lm6MckFYCPTbqIoX7w0s_daxdjNeBeKW6DXppv1-QUM,70
 ultralytics/models/sam/modules/blocks.py,sha256=n8oe9sx91_RktsF2_2UYNKH7qk8bFXuJtEaIEpQQ3ws,46059
 ultralytics/models/sam/modules/decoders.py,sha256=-1fhBO47hA-3CzkU-PzkCK4Nsi_VJ_CH6Q9SMjydN4I,25609
@@ -165,7 +165,7 @@ ultralytics/models/utils/__init__.py,sha256=lm6MckFYCPTbqIoX7w0s_daxdjNeBeKW6DXp
 ultralytics/models/utils/loss.py,sha256=E-61TfLPc04IdeL6IlFDityDoPju-ov0ouWV_cNY4Kg,21254
 ultralytics/models/utils/ops.py,sha256=Pr77n8XW25SUEx4X3bBvXcVIbRdJPoaXJuG0KWWawRQ,15253
 ultralytics/models/yolo/__init__.py,sha256=or0j5xvcM0usMlsFTYhNAOcQUri7reD0cD9JR5b7zDk,307
-ultralytics/models/yolo/model.py,sha256=
+ultralytics/models/yolo/model.py,sha256=96PDREUJwDiPb3w4lp2HCesc3c3y1WGyLttOUhUYPxk,18715
 ultralytics/models/yolo/classify/__init__.py,sha256=9--HVaNOfI1K7rn_rRqclL8FUAnpfeBrRqEQIaQw2xM,383
 ultralytics/models/yolo/classify/predict.py,sha256=FqAC2YXe25bRwedMZhF3Lw0waoY-a60xMKELhxApP9I,4149
 ultralytics/models/yolo/classify/train.py,sha256=V-hevc6X7xemnpyru84OfTRA77eNnkVSMEz16_OUvo4,10244
@@ -239,9 +239,9 @@ ultralytics/utils/__init__.py,sha256=KyczXn2SLR0g8xkCVpfNaPkhDCQ4A8vLt99teYu7mss
 ultralytics/utils/autobatch.py,sha256=33m8YgggLIhltDqMXZ5OE-FGs2QiHrl2-LfgY1mI4cw,5119
 ultralytics/utils/autodevice.py,sha256=AvgXFt8c1Cg4icKh0Hbhhz8UmVQ2Wjyfdfkeb2C8zck,8855
 ultralytics/utils/benchmarks.py,sha256=btsi_B0mfLPfhE8GrsBpi79vl7SRam0YYngNFAsY8Ak,31035
-ultralytics/utils/checks.py,sha256=
+ultralytics/utils/checks.py,sha256=q64U5wKyejD-2W2fCPqJ0Oiaa4_4vq2pVxV9wp6lMz4,34707
 ultralytics/utils/dist.py,sha256=A9lDGtGefTjSVvVS38w86GOdbtLzNBDZuDGK0MT4PRI,4170
-ultralytics/utils/downloads.py,sha256=
+ultralytics/utils/downloads.py,sha256=A7r4LpWUojGkam9-VQ3Ylu-Cn1lAUGKyJE6VzwQbp7M,22016
 ultralytics/utils/errors.py,sha256=XT9Ru7ivoBgofK6PlnyigGoa7Fmf5nEhyHtnD-8TRXI,1584
 ultralytics/utils/export.py,sha256=LK-wlTlyb_zIKtSvOmfmvR70RcUU9Ct9UBDt5wn9_rY,9880
 ultralytics/utils/files.py,sha256=ZCbLGleiF0f-PqYfaxMFAWop88w7U1hpreHXl8b2ko0,8238
@@ -266,8 +266,8 @@ ultralytics/utils/callbacks/neptune.py,sha256=j8pecmlcsM8FGzLKWoBw5xUsi5t8E5HuxY
 ultralytics/utils/callbacks/raytune.py,sha256=S6Bq16oQDQ8BQgnZzA0zJHGN_BBr8iAM_WtGoLiEcwg,1283
 ultralytics/utils/callbacks/tensorboard.py,sha256=MDPBW7aDes-66OE6YqKXXvqA_EocjzEMHWGM-8z9vUQ,5281
 ultralytics/utils/callbacks/wb.py,sha256=Tm_-aRr2CN32MJkY9tylpMBJkb007-MSRNSQ7rDJ5QU,7521
-dgenerate_ultralytics_headless-8.3.178.dist-info/METADATA,sha256=
-dgenerate_ultralytics_headless-8.3.178.dist-info/WHEEL,sha256=
-dgenerate_ultralytics_headless-8.3.178.dist-info/entry_points.txt,sha256=
-dgenerate_ultralytics_headless-8.3.178.dist-info/top_level.txt,sha256=
-dgenerate_ultralytics_headless-8.3.178.dist-info/RECORD,,
+dgenerate_ultralytics_headless-8.3.180.dist-info/METADATA,sha256=-yv2HSf0JD7vFjkRFBIrwXoyHrcdbdQdhB5ocRW3_hk,38727
+dgenerate_ultralytics_headless-8.3.180.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+dgenerate_ultralytics_headless-8.3.180.dist-info/entry_points.txt,sha256=YM_wiKyTe9yRrsEfqvYolNO5ngwfoL4-NwgKzc8_7sI,93
+dgenerate_ultralytics_headless-8.3.180.dist-info/top_level.txt,sha256=XP49TwiMw4QGsvTLSYiJhz1xF_k7ev5mQ8jJXaXi45Q,12
+dgenerate_ultralytics_headless-8.3.180.dist-info/RECORD,,
```
ultralytics/__init__.py CHANGED

ultralytics/cfg/__init__.py CHANGED
```diff
@@ -954,7 +954,10 @@ def entrypoint(debug: str = "") -> None:
         from ultralytics import YOLO
 
         model = YOLO(model, task=task)
-
+        if "yoloe" in stem or "world" in stem:
+            cls_list = overrides.pop("classes", DEFAULT_CFG.classes)
+            if cls_list is not None and isinstance(cls_list, str):
+                model.set_classes(cls_list.split(","))  # convert "person, bus" -> ['person', ' bus'].
     # Task Update
     if task != model.task:
         if task:
```
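For YOLOE and YOLO-World checkpoints, `entrypoint()` now forwards a CLI `classes` override straight to `set_classes()` before the task check. A minimal sketch of the equivalent Python call, assuming an illustrative YOLOE checkpoint name:

```python
from ultralytics import YOLO

# Roughly what `yolo predict model=yoloe-11s-seg.pt classes="person,bus"` now does:
model = YOLO("yoloe-11s-seg.pt")  # checkpoint name is illustrative
model.set_classes("person,bus".split(","))  # -> ["person", "bus"]
```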
ultralytics/cfg/datasets/VOC.yaml CHANGED

```diff
@@ -87,7 +87,7 @@ download: |
     f"{url}VOCtest_06-Nov-2007.zip",  # 438MB, 4953 images
     f"{url}VOCtrainval_11-May-2012.zip",  # 1.95GB, 17126 images
   ]
-  download(urls, dir=dir / "images",
+  download(urls, dir=dir / "images", threads=3, exist_ok=True)  # download and unzip over existing (required)
 
   # Convert
   path = dir / "images/VOCdevkit"
```
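The VOC script unpacks three archives into the same `images/` directory, so per the new inline comment the call must be allowed to unzip over an existing folder. A sketch of the pattern with placeholder URLs:

```python
from pathlib import Path

from ultralytics.utils.downloads import download

# Several archives extract into one shared directory; exist_ok=True lets the
# second and third unzip over the folder created by the first (placeholder URLs).
dir = Path("datasets/VOC")
urls = ["https://example.com/part1.zip", "https://example.com/part2.zip"]
download(urls, dir=dir / "images", threads=3, exist_ok=True)
```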
ultralytics/cfg/datasets/VisDrone.yaml CHANGED

```diff
@@ -78,7 +78,7 @@ download: |
     "https://github.com/ultralytics/assets/releases/download/v0.0.0/VisDrone2019-DET-test-dev.zip",
     # "https://github.com/ultralytics/assets/releases/download/v0.0.0/VisDrone2019-DET-test-challenge.zip",
   ]
-  download(urls, dir=dir,
+  download(urls, dir=dir, threads=4)
 
   # Convert
   splits = {"VisDrone2019-DET-train": "train", "VisDrone2019-DET-val": "val", "VisDrone2019-DET-test-dev": "test"}
```
ultralytics/models/sam/predict.py CHANGED

```diff
@@ -182,9 +182,8 @@ class Predictor(BasePredictor):
             **kwargs (Any): Additional keyword arguments.
 
         Returns:
-            pred_masks (
-            pred_scores (
-            pred_logits (np.ndarray): Low-resolution logits of shape (C, H, W) for subsequent inference, where H=W=256.
+            pred_masks (torch.Tensor): The output masks in shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): An array of length C containing quality scores predicted by the model for each mask.
 
         Examples:
             >>> predictor = Predictor()
@@ -219,8 +218,8 @@ class Predictor(BasePredictor):
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
 
         Returns:
-            pred_masks (
-            pred_scores (
+            pred_masks (torch.Tensor): Output masks with shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): Quality scores predicted by the model for each mask, with length C.
 
         Examples:
             >>> predictor = Predictor()
@@ -230,7 +229,34 @@ class Predictor(BasePredictor):
         """
        features = self.get_im_features(im) if self.features is None else self.features
 
-        bboxes, points, labels, masks = self._prepare_prompts(im.shape[2:], bboxes, points, labels, masks)
+        prompts = self._prepare_prompts(im.shape[2:], self.batch[1][0].shape[:2], bboxes, points, labels, masks)
+        return self._inference_features(features, *prompts, multimask_output)
+
+    def _inference_features(
+        self,
+        features,
+        bboxes=None,
+        points=None,
+        labels=None,
+        masks=None,
+        multimask_output=False,
+    ):
+        """
+        Perform inference on image features using the SAM model.
+
+        Args:
+            features (torch.Tensor): Extracted image features with shape (B, C, H, W) from the SAM model image encoder.
+            bboxes (np.ndarray | List[List[float]] | None): Bounding boxes in XYXY format with shape (N, 4).
+            points (np.ndarray | List[List[float]] | None): Object location points with shape (N, 2), in pixels.
+            labels (np.ndarray | List[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
+            masks (List[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
+            multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
+            img_idx (int): Index of the image in the batch to process.
+
+        Returns:
+            pred_masks (torch.Tensor): Output masks with shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): Quality scores for each mask, with length C.
+        """
         points = (points, labels) if points is not None else None
         # Embed prompts
         sparse_embeddings, dense_embeddings = self.model.prompt_encoder(points=points, boxes=bboxes, masks=masks)
@@ -248,12 +274,13 @@ class Predictor(BasePredictor):
         # `d` could be 1 or 3 depends on `multimask_output`.
         return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
 
-    def _prepare_prompts(self, dst_shape, bboxes=None, points=None, labels=None, masks=None):
+    def _prepare_prompts(self, dst_shape, src_shape, bboxes=None, points=None, labels=None, masks=None):
         """
         Prepare and transform the input prompts for processing based on the destination shape.
 
         Args:
-            dst_shape (
+            dst_shape (Tuple[int, int]): The target shape (height, width) for the prompts.
+            src_shape (Tuple[int, int]): The source shape (height, width) of the input image.
             bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
             points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
             labels (np.ndarray | List | None): Point prompt labels with shape (N) or (N, num_points). 1 for foreground, 0 for background.
@@ -268,7 +295,6 @@ class Predictor(BasePredictor):
         Raises:
             AssertionError: If the number of points don't match the number of labels, in case labels were passed.
         """
-        src_shape = self.batch[1][0].shape[:2]
         r = 1.0 if self.segment_all else min(dst_shape[0] / src_shape[0], dst_shape[1] / src_shape[1])
         # Transform input prompts
         if points is not None:
@@ -543,7 +569,7 @@ class Predictor(BasePredictor):
         - The extracted features are stored in the `self.features` attribute for later use.
         """
         if self.model is None:
-            self.setup_model(
+            self.setup_model()
         self.setup_source(image)
         assert len(self.dataset) == 1, "`set_image` only supports setting one image!"
         for batch in self.dataset:
@@ -620,6 +646,53 @@ class Predictor(BasePredictor):
 
         return new_masks[keep].to(device=masks.device, dtype=masks.dtype), keep
 
+    @smart_inference_mode()
+    def inference_features(
+        self,
+        features,
+        src_shape,
+        dst_shape=None,
+        bboxes=None,
+        points=None,
+        labels=None,
+        masks=None,
+        multimask_output=False,
+    ):
+        """
+        Perform prompts preprocessing and inference on provided image features using the SAM model.
+
+        Args:
+            features (torch.Tensor | Dict[str, Any]): Extracted image features from the SAM/SAM2 model image encoder.
+            src_shape (Tuple[int, int]): The source shape (height, width) of the input image.
+            dst_shape (Tuple[int, int] | None): The target shape (height, width) for the prompts. If None, defaults to (imgsz, imgsz).
+            bboxes (np.ndarray | List[List[float]] | None): Bounding boxes in xyxy format with shape (N, 4).
+            points (np.ndarray | List[List[float]] | None): Points indicating object locations with shape (N, 2), in pixels.
+            labels (np.ndarray | List[int] | None): Point prompt labels with shape (N, ).
+            masks (List[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
+            multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
+
+        Returns:
+            pred_masks (torch.Tensor): The output masks in shape (C, H, W), where C is the number of generated masks.
+            pred_bboxes (torch.Tensor): Bounding boxes for each mask with shape (N, 6), where N is the number of boxes.
+                Each box is in xyxy format with additional columns for score and class.
+
+        Notes:
+            - The input features is a torch.Tensor of shape (B, C, H, W) if performing on SAM, or a Dict[str, Any] if performing on SAM2.
+        """
+        dst_shape = dst_shape or (self.args.imgsz, self.args.imgsz)
+        prompts = self._prepare_prompts(dst_shape, src_shape, bboxes, points, labels, masks)
+        pred_masks, pred_scores = self._inference_features(features, *prompts, multimask_output)
+        if len(pred_masks) == 0:
+            pred_masks, pred_bboxes = None, torch.zeros((0, 6), device=pred_masks.device)
+        else:
+            pred_masks = ops.scale_masks(pred_masks[None].float(), src_shape, padding=False)[0]
+            pred_masks = pred_masks > self.model.mask_threshold  # to bool
+            pred_bboxes = batched_mask_to_box(pred_masks)
+            # NOTE: SAM models do not return cls info. This `cls` here is just a placeholder for consistency.
+            cls = torch.arange(len(pred_masks), dtype=torch.int32, device=pred_masks.device)
+            pred_bboxes = torch.cat([pred_bboxes, pred_scores[:, None], cls[:, None]], dim=-1)
+        return pred_masks, pred_bboxes
+
 
 class SAM2Predictor(Predictor):
     """
```
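The new public `inference_features()` makes the cached encoder output reusable across many prompt sets without re-running the image encoder. A minimal sketch of that workflow, assuming a standard SAM checkpoint name and the existing `set_image()` API:

```python
from ultralytics.models.sam import Predictor

predictor = Predictor(overrides={"model": "sam_b.pt"})  # checkpoint name assumed
predictor.set_image("ultralytics/assets/bus.jpg")       # image encoder runs once here

src_shape = predictor.batch[1][0].shape[:2]  # original (h, w) of the loaded image
# Prompt repeatedly against the same cached features, with no encoder re-run:
masks, boxes = predictor.inference_features(
    predictor.features,
    src_shape,
    bboxes=[[100, 100, 500, 600]],
)
```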
```diff
@@ -663,80 +736,13 @@ class SAM2Predictor(Predictor):
 
         return build_sam(self.args.model)
 
-    def _inference(
-        self,
-        im,
-        bboxes=None,
-        points=None,
-        labels=None,
-        masks=None,
-        multimask_output=False,
-        img_idx=-1,
-    ):
-        """
-        Perform image segmentation inference based on various prompts using SAM2 architecture.
-
-        This method leverages the Segment Anything Model 2 (SAM2) to generate segmentation masks for input images
-        based on provided prompts such as bounding boxes, points, or existing masks. It supports both single and
-        multi-object prediction scenarios.
-
-        Args:
-            im (torch.Tensor): Preprocessed input image tensor with shape (N, C, H, W).
-            bboxes (np.ndarray | List[List[float]] | None): Bounding boxes in XYXY format with shape (N, 4).
-            points (np.ndarray | List[List[float]] | None): Object location points with shape (N, 2), in pixels.
-            labels (np.ndarray | List[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
-            masks (np.ndarray | None): Low-resolution masks from previous predictions with shape (N, H, W).
-            multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
-            img_idx (int): Index of the image in the batch to process.
-
-        Returns:
-            pred_masks (np.ndarray): Output masks with shape (C, H, W), where C is the number of generated masks.
-            pred_scores (np.ndarray): Quality scores for each mask, with length C.
-
-        Examples:
-            >>> predictor = SAM2Predictor(cfg)
-            >>> image = torch.rand(1, 3, 640, 640)
-            >>> bboxes = [[100, 100, 200, 200]]
-            >>> result = predictor(image, bboxes=bboxes)[0]
-            >>> print(f"Generated {result.masks.shape[0]} masks with average score {result.boxes.conf.mean():.2f}")
-
-        Notes:
-            - The method supports batched inference for multiple objects when points or bboxes are provided.
-            - Input prompts (bboxes, points) are automatically scaled to match the input image dimensions.
-            - When both bboxes and points are provided, they are merged into a single 'points' input for the model.
-        """
-        features = self.get_im_features(im) if self.features is None else self.features
-
-        points, labels, masks = self._prepare_prompts(im.shape[2:], bboxes, points, labels, masks)
-        points = (points, labels) if points is not None else None
-
-        sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder(
-            points=points,
-            boxes=None,
-            masks=masks,
-        )
-        # Predict masks
-        batched_mode = points is not None and points[0].shape[0] > 1  # multi object prediction
-        high_res_features = [feat_level[img_idx].unsqueeze(0) for feat_level in features["high_res_feats"]]
-        pred_masks, pred_scores, _, _ = self.model.sam_mask_decoder(
-            image_embeddings=features["image_embed"][img_idx].unsqueeze(0),
-            image_pe=self.model.sam_prompt_encoder.get_dense_pe(),
-            sparse_prompt_embeddings=sparse_embeddings,
-            dense_prompt_embeddings=dense_embeddings,
-            multimask_output=multimask_output,
-            repeat_image=batched_mode,
-            high_res_features=high_res_features,
-        )
-        # (N, d, H, W) --> (N*d, H, W), (N, d) --> (N*d, )
-        # `d` could be 1 or 3 depends on `multimask_output`.
-        return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
-
-    def _prepare_prompts(self, dst_shape, bboxes=None, points=None, labels=None, masks=None):
+    def _prepare_prompts(self, dst_shape, src_shape, bboxes=None, points=None, labels=None, masks=None):
         """
         Prepare and transform the input prompts for processing based on the destination shape.
 
         Args:
-            dst_shape (
+            dst_shape (Tuple[int, int]): The target shape (height, width) for the prompts.
+            src_shape (Tuple[int, int]): The source shape (height, width) of the input image.
             bboxes (np.ndarray | List | None): Bounding boxes in XYXY format with shape (N, 4).
             points (np.ndarray | List | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
             labels (np.ndarray | List | None): Point prompt labels with shape (N,) or (N, num_points). 1 for foreground, 0 for background.
@@ -750,7 +756,7 @@ class SAM2Predictor(Predictor):
         Raises:
             AssertionError: If the number of points don't match the number of labels, in case labels were passed.
         """
-        bboxes, points, labels, masks = super()._prepare_prompts(dst_shape, bboxes, points, labels, masks)
+        bboxes, points, labels, masks = super()._prepare_prompts(dst_shape, src_shape, bboxes, points, labels, masks)
         if bboxes is not None:
             bboxes = bboxes.view(-1, 2, 2)
             bbox_labels = torch.tensor([[2, 3]], dtype=torch.int32, device=bboxes.device).expand(len(bboxes), -1)
```
```diff
@@ -813,6 +819,54 @@ class SAM2Predictor(Predictor):
         ][::-1]
         return {"image_embed": feats[-1], "high_res_feats": feats[:-1]}
 
+    def _inference_features(
+        self,
+        features,
+        points=None,
+        labels=None,
+        masks=None,
+        multimask_output=False,
+        img_idx=-1,
+    ):
+        """
+        Perform inference on image features using the SAM2 model.
+
+        Args:
+            features (Dict[str, Any]): Extracted image information from the SAM2 model image encoder, it's a dictionary including:
+                image_embed (torch.Tensor): Image embedding with shape (B, C, H, W).
+                high_res_feats (List[torch.Tensor]): List of high-resolution feature maps from the backbone, each with shape (B, C, H, W).
+            points (np.ndarray | List[List[float]] | None): Object location points with shape (N, 2), in pixels.
+            labels (np.ndarray | List[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
+            masks (List[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
+            multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
+            img_idx (int): Index of the image in the batch to process.
+
+        Returns:
+            pred_masks (torch.Tensor): Output masks with shape (C, H, W), where C is the number of generated masks.
+            pred_scores (torch.Tensor): Quality scores for each mask, with length C.
+        """
+        points = (points, labels) if points is not None else None
+        sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder(
+            points=points,
+            boxes=None,
+            masks=masks,
+        )
+        # Predict masks
+        batched_mode = points is not None and points[0].shape[0] > 1  # multi object prediction
+        high_res_features = [feat_level[img_idx].unsqueeze(0) for feat_level in features["high_res_feats"]]
+        pred_masks, pred_scores, _, _ = self.model.sam_mask_decoder(
+            image_embeddings=features["image_embed"][img_idx].unsqueeze(0),
+            image_pe=self.model.sam_prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embeddings,
+            dense_prompt_embeddings=dense_embeddings,
+            multimask_output=multimask_output,
+            repeat_image=batched_mode,
+            high_res_features=high_res_features,
+        )
+        # (N, d, H, W) --> (N*d, H, W), (N, d) --> (N*d, )
+        # `d` could be 1 or 3 depends on `multimask_output`.
+        return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
+
 
 class SAM2VideoPredictor(SAM2Predictor):
     """
```
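Because `SAM2Predictor` overrides only `_inference_features()`, the shared `inference_features()` entry point above serves SAM2 as well; the cached features are a dict rather than a single tensor. An illustrative structure matching the keys in the docstring (tensor shapes here are assumptions, not values from the diff):

```python
import torch

# Illustrative SAM2 feature cache, matching the documented keys:
features = {
    "image_embed": torch.rand(1, 256, 64, 64),  # (B, C, H, W), shape assumed
    "high_res_feats": [
        torch.rand(1, 32, 256, 256),  # finer backbone level, shape assumed
        torch.rand(1, 64, 128, 128),  # coarser backbone level, shape assumed
    ],
}
```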
```diff
@@ -900,8 +954,8 @@ class SAM2VideoPredictor(SAM2Predictor):
             masks (np.ndarray, optional): Low-resolution masks from previous predictions shape (N,H,W). For SAM H=W=256.
 
         Returns:
-            pred_masks (
-            pred_scores (
+            pred_masks (torch.Tensor): The output masks in shape CxHxW, where C is the number of generated masks.
+            pred_scores (torch.Tensor): An array of length C containing quality scores predicted by the model for each mask.
         """
         # Override prompts if any stored in self.prompts
         bboxes = self.prompts.pop("bboxes", bboxes)
@@ -912,7 +966,9 @@ class SAM2VideoPredictor(SAM2Predictor):
         self.inference_state["im"] = im
         output_dict = self.inference_state["output_dict"]
         if len(output_dict["cond_frame_outputs"]) == 0:  # initialize prompts
-            points, labels, masks = self._prepare_prompts(
+            points, labels, masks = self._prepare_prompts(
+                im.shape[2:], self.batch[1][0].shape[:2], bboxes, points, labels, masks
+            )
             if points is not None:
                 for i in range(len(points)):
                     self.add_new_prompts(obj_id=i, points=points[[i]], labels=labels[[i]], frame_idx=frame)
@@ -966,7 +1022,7 @@ class SAM2VideoPredictor(SAM2Predictor):
         the masks do not overlap, which can be useful for certain applications.
 
         Args:
-            preds (
+            preds (Tuple[torch.Tensor, torch.Tensor]): The predicted masks and scores from the model.
             img (torch.Tensor): The processed image tensor.
             orig_imgs (List[np.ndarray]): The original images before processing.
```
ultralytics/models/yolo/model.py CHANGED

```diff
@@ -3,6 +3,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+import torch
+
 from ultralytics.data.build import load_inference_source
 from ultralytics.engine.model import Model
 from ultralytics.models import yolo
@@ -315,7 +317,7 @@ class YOLOE(Model):
         assert isinstance(self.model, YOLOEModel)
         return self.model.get_vocab(names)
 
-    def set_classes(self, classes: List[str], embeddings) -> None:
+    def set_classes(self, classes: List[str], embeddings: Optional[torch.Tensor] = None) -> None:
         """
         Set the model's class names and embeddings for detection.
 
@@ -324,6 +326,8 @@ class YOLOE(Model):
             embeddings (torch.Tensor): Embeddings corresponding to the classes.
         """
         assert isinstance(self.model, YOLOEModel)
+        if embeddings is None:
+            embeddings = self.get_text_pe(classes)  # generate text embeddings if not provided
         self.model.set_classes(classes, embeddings)
         # Verify no background class is present
         assert " " not in classes
```
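`embeddings` is now optional: when omitted, `set_classes()` derives text prompt embeddings itself via `get_text_pe()`, which the new code calls internally. A short sketch (the checkpoint name is illustrative):

```python
from ultralytics import YOLOE

model = YOLOE("yoloe-11s-seg.pt")  # illustrative checkpoint name

# Previously required: precompute embeddings and pass them explicitly
embeddings = model.get_text_pe(["person", "bus"])
model.set_classes(["person", "bus"], embeddings)

# Now equivalent: embeddings default to None and are generated internally
model.set_classes(["person", "bus"])
```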
ultralytics/utils/checks.py CHANGED

```diff
@@ -694,7 +694,7 @@ def collect_system_info():
         "GPU count": torch.cuda.device_count() if cuda else None,
         "CUDA": torch.version.cuda if cuda else None,
     }
-    LOGGER.info("\n" + "\n".join(f"{k:<
+    LOGGER.info("\n" + "\n".join(f"{k:<23}{v}" for k, v in info_dict.items()) + "\n")
 
     package_info = {}
     for r in parse_requirements(package="ultralytics"):
@@ -705,7 +705,7 @@ def collect_system_info():
             current = "(not installed)"
             is_met = "❌ "
         package_info[r.name] = f"{is_met}{current}{r.specifier}"
-        LOGGER.info(f"{r.name:<
+        LOGGER.info(f"{r.name:<23}{package_info[r.name]}")
 
     info_dict["Package Info"] = package_info
```
ultralytics/utils/downloads.py CHANGED

```diff
@@ -501,7 +501,9 @@ def download(
     """
     dir = Path(dir)
     dir.mkdir(parents=True, exist_ok=True)  # make directory
+    urls = [url] if isinstance(url, (str, Path)) else url
     if threads > 1:
+        LOGGER.info(f"Downloading {len(urls)} file(s) with {threads} threads to {dir}...")
         with ThreadPool(threads) as pool:
             pool.map(
                 lambda x: safe_download(
@@ -512,12 +514,12 @@ def download(
                     curl=curl,
                     retry=retry,
                     exist_ok=exist_ok,
-                    progress=
+                    progress=True,
                 ),
-                zip(
+                zip(urls, repeat(dir)),
             )
         pool.close()
         pool.join()
     else:
-        for u in
+        for u in urls:
             safe_download(url=u, dir=dir, unzip=unzip, delete=delete, curl=curl, retry=retry, exist_ok=exist_ok)
```
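With the new `urls` normalization, both the threaded and serial branches iterate the same list, so a bare string URL behaves consistently regardless of `threads`. A minimal sketch:

```python
from ultralytics.utils.downloads import download

# A single URL string is normalized to a one-element list before either branch
# runs, so it is safe even with threads > 1:
download(
    "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt",
    dir="weights",
    threads=2,  # also logs the file count and thread count before starting
)
```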
{dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/WHEEL
File without changes

{dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/entry_points.txt
File without changes

{dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/licenses/LICENSE
File without changes

{dgenerate_ultralytics_headless-8.3.178.dist-info → dgenerate_ultralytics_headless-8.3.180.dist-info}/top_level.txt
File without changes