dgenerate-ultralytics-headless 8.3.194-py3-none-any.whl → 8.3.195-py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/METADATA +1 -2
- {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/RECORD +97 -96
- tests/test_python.py +1 -1
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +8 -8
- ultralytics/data/annotator.py +1 -1
- ultralytics/data/augment.py +75 -75
- ultralytics/data/base.py +12 -12
- ultralytics/data/converter.py +4 -4
- ultralytics/data/dataset.py +7 -7
- ultralytics/data/loaders.py +15 -15
- ultralytics/data/split_dota.py +10 -10
- ultralytics/data/utils.py +12 -12
- ultralytics/engine/model.py +13 -13
- ultralytics/engine/predictor.py +13 -13
- ultralytics/engine/results.py +21 -21
- ultralytics/hub/google/__init__.py +2 -2
- ultralytics/hub/session.py +7 -7
- ultralytics/models/fastsam/model.py +5 -5
- ultralytics/models/fastsam/predict.py +11 -11
- ultralytics/models/nas/model.py +1 -1
- ultralytics/models/rtdetr/predict.py +2 -2
- ultralytics/models/rtdetr/val.py +4 -4
- ultralytics/models/sam/amg.py +6 -6
- ultralytics/models/sam/build.py +9 -9
- ultralytics/models/sam/model.py +7 -7
- ultralytics/models/sam/modules/blocks.py +6 -6
- ultralytics/models/sam/modules/decoders.py +1 -1
- ultralytics/models/sam/modules/encoders.py +27 -27
- ultralytics/models/sam/modules/sam.py +4 -4
- ultralytics/models/sam/modules/tiny_encoder.py +18 -18
- ultralytics/models/sam/modules/utils.py +8 -8
- ultralytics/models/sam/predict.py +63 -63
- ultralytics/models/utils/loss.py +22 -22
- ultralytics/models/utils/ops.py +8 -8
- ultralytics/models/yolo/classify/predict.py +2 -2
- ultralytics/models/yolo/classify/train.py +8 -8
- ultralytics/models/yolo/classify/val.py +4 -4
- ultralytics/models/yolo/detect/predict.py +3 -3
- ultralytics/models/yolo/detect/train.py +6 -6
- ultralytics/models/yolo/detect/val.py +32 -32
- ultralytics/models/yolo/model.py +6 -6
- ultralytics/models/yolo/obb/train.py +1 -1
- ultralytics/models/yolo/obb/val.py +13 -13
- ultralytics/models/yolo/pose/val.py +11 -11
- ultralytics/models/yolo/segment/predict.py +4 -4
- ultralytics/models/yolo/segment/train.py +1 -1
- ultralytics/models/yolo/segment/val.py +14 -14
- ultralytics/models/yolo/world/train.py +9 -9
- ultralytics/models/yolo/world/train_world.py +1 -1
- ultralytics/models/yolo/yoloe/predict.py +4 -4
- ultralytics/models/yolo/yoloe/train.py +4 -4
- ultralytics/nn/autobackend.py +2 -2
- ultralytics/nn/modules/block.py +6 -6
- ultralytics/nn/modules/conv.py +2 -2
- ultralytics/nn/modules/head.py +4 -4
- ultralytics/nn/tasks.py +13 -13
- ultralytics/nn/text_model.py +3 -3
- ultralytics/solutions/ai_gym.py +2 -2
- ultralytics/solutions/analytics.py +3 -3
- ultralytics/solutions/config.py +5 -5
- ultralytics/solutions/distance_calculation.py +2 -2
- ultralytics/solutions/heatmap.py +1 -1
- ultralytics/solutions/instance_segmentation.py +4 -4
- ultralytics/solutions/object_counter.py +4 -4
- ultralytics/solutions/parking_management.py +7 -7
- ultralytics/solutions/queue_management.py +3 -3
- ultralytics/solutions/region_counter.py +4 -4
- ultralytics/solutions/similarity_search.py +2 -2
- ultralytics/solutions/solutions.py +48 -48
- ultralytics/solutions/streamlit_inference.py +1 -1
- ultralytics/solutions/trackzone.py +4 -4
- ultralytics/solutions/vision_eye.py +1 -1
- ultralytics/trackers/byte_tracker.py +11 -11
- ultralytics/trackers/utils/gmc.py +3 -3
- ultralytics/trackers/utils/matching.py +5 -5
- ultralytics/utils/autodevice.py +2 -2
- ultralytics/utils/benchmarks.py +10 -10
- ultralytics/utils/callbacks/clearml.py +1 -1
- ultralytics/utils/callbacks/comet.py +5 -5
- ultralytics/utils/checks.py +5 -5
- ultralytics/utils/cpu.py +90 -0
- ultralytics/utils/dist.py +1 -1
- ultralytics/utils/downloads.py +2 -2
- ultralytics/utils/export.py +5 -5
- ultralytics/utils/instance.py +2 -2
- ultralytics/utils/metrics.py +35 -35
- ultralytics/utils/nms.py +4 -4
- ultralytics/utils/ops.py +1 -1
- ultralytics/utils/patches.py +2 -2
- ultralytics/utils/plotting.py +9 -9
- ultralytics/utils/torch_utils.py +2 -6
- ultralytics/utils/triton.py +5 -5
- {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.194.dist-info → dgenerate_ultralytics_headless-8.3.195.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/predict.py CHANGED
@@ -51,7 +51,7 @@ class Predictor(BasePredictor):
         device (torch.device): The device (CPU or GPU) on which the model is loaded.
         im (torch.Tensor): The preprocessed input image.
         features (torch.Tensor): Extracted image features.
-        prompts (
+        prompts (dict[str, Any]): Dictionary to store various types of prompts (e.g., bboxes, points, masks).
         segment_all (bool): Flag to indicate if full image segmentation should be performed.
         mean (torch.Tensor): Mean values for image normalization.
         std (torch.Tensor): Standard deviation values for image normalization.
@@ -116,7 +116,7 @@ class Predictor(BasePredictor):
         torch.Tensor and list of np.ndarray as input formats.

         Args:
-            im (torch.Tensor |
+            im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or list of HWC numpy arrays.

         Returns:
             (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.
@@ -149,10 +149,10 @@ class Predictor(BasePredictor):
         Currently, batched inference is not supported; hence the list length should be 1.

         Args:
-            im (
+            im (list[np.ndarray]): List containing a single image in HWC numpy array format.

         Returns:
-            (
+            (list[np.ndarray]): List containing the transformed image.

         Raises:
             AssertionError: If the input list contains more than one image.
@@ -177,9 +177,9 @@ class Predictor(BasePredictor):

         Args:
             im (torch.Tensor): The preprocessed input image in tensor format, with shape (N, C, H, W).
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
+            bboxes (np.ndarray | list | None): Bounding boxes with shape (N, 4), in XYXY format.
+            points (np.ndarray | list | None): Points indicating object locations with shape (N, 2), in pixels.
+            labels (np.ndarray | list | None): Labels for point prompts, shape (N,). 1 = foreground, 0 = background.
             masks (np.ndarray | None): Low-resolution masks from previous predictions, shape (N, H, W). For SAM H=W=256.
             multimask_output (bool): Flag to return multiple masks. Helpful for ambiguous prompts.
             *args (Any): Additional positional arguments.
@@ -215,9 +215,9 @@ class Predictor(BasePredictor):

         Args:
             im (torch.Tensor): Preprocessed input image tensor with shape (N, C, H, W).
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
+            bboxes (np.ndarray | list | None): Bounding boxes in XYXY format with shape (N, 4).
+            points (np.ndarray | list | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
+            labels (np.ndarray | list | None): Point prompt labels with shape (N) or (N, num_points). 1 for foreground, 0 for background.
             masks (np.ndarray | None): Low-res masks from previous predictions with shape (N, H, W). For SAM, H=W=256.
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.

@@ -250,10 +250,10 @@ class Predictor(BasePredictor):

         Args:
             features (torch.Tensor): Extracted image features with shape (B, C, H, W) from the SAM model image encoder.
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
-            masks (
+            bboxes (np.ndarray | list[list[float]] | None): Bounding boxes in XYXY format with shape (N, 4).
+            points (np.ndarray | list[list[float]] | None): Object location points with shape (N, 2), in pixels.
+            labels (np.ndarray | list[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
+            masks (list[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.

         Returns:
@@ -282,12 +282,12 @@ class Predictor(BasePredictor):
         Prepare and transform the input prompts for processing based on the destination shape.

         Args:
-            dst_shape (
-            src_shape (
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
-            masks (
+            dst_shape (tuple[int, int]): The target shape (height, width) for the prompts.
+            src_shape (tuple[int, int]): The source shape (height, width) of the input image.
+            bboxes (np.ndarray | list | None): Bounding boxes in XYXY format with shape (N, 4).
+            points (np.ndarray | list | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
+            labels (np.ndarray | list | None): Point prompt labels with shape (N) or (N, num_points). 1 for foreground, 0 for background.
+            masks (list[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array with shape (H, W).

         Returns:
             bboxes (torch.Tensor | None): Transformed bounding boxes.
@@ -351,7 +351,7 @@ class Predictor(BasePredictor):
             crop_n_layers (int): Number of layers for additional mask predictions on image crops.
             crop_overlap_ratio (float): Overlap between crops, scaled down in subsequent layers.
             crop_downscale_factor (int): Scaling factor for sampled points-per-side in each layer.
-            point_grids (
+            point_grids (list[np.ndarray] | None): Custom grids for point sampling normalized to [0,1].
             points_stride (int): Number of points to sample along each side of the image.
             points_batch_size (int): Batch size for the number of points processed simultaneously.
             conf_thres (float): Confidence threshold [0,1] for filtering based on mask quality prediction.
@@ -490,10 +490,10 @@ class Predictor(BasePredictor):
                 - pred_scores (torch.Tensor): Confidence scores for each mask with shape (N, 1).
                 - pred_bboxes (torch.Tensor, optional): Predicted bounding boxes if segment_all is True.
             img (torch.Tensor): The processed input image tensor with shape (C, H, W).
-            orig_imgs (
+            orig_imgs (list[np.ndarray] | torch.Tensor): The original, unprocessed images.

         Returns:
-            (
+            (list[Results]): List of Results objects containing detection masks, bounding boxes, and other
                 metadata for each processed image.

         Examples:
@@ -623,7 +623,7 @@ class Predictor(BasePredictor):

         Returns:
             new_masks (torch.Tensor): Processed masks with small regions removed, shape (N, H, W).
-            keep (
+            keep (list[int]): Indices of remaining masks after NMS, for filtering corresponding boxes.

         Examples:
             >>> masks = torch.rand(5, 640, 640) > 0.5  # 5 random binary masks
@@ -673,13 +673,13 @@ class Predictor(BasePredictor):
         Perform prompts preprocessing and inference on provided image features using the SAM model.

         Args:
-            features (torch.Tensor |
-            src_shape (
-            dst_shape (
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
-            masks (
+            features (torch.Tensor | dict[str, Any]): Extracted image features from the SAM/SAM2 model image encoder.
+            src_shape (tuple[int, int]): The source shape (height, width) of the input image.
+            dst_shape (tuple[int, int] | None): The target shape (height, width) for the prompts. If None, defaults to (imgsz, imgsz).
+            bboxes (np.ndarray | list[list[float]] | None): Bounding boxes in xyxy format with shape (N, 4).
+            points (np.ndarray | list[list[float]] | None): Points indicating object locations with shape (N, 2), in pixels.
+            labels (np.ndarray | list[int] | None): Point prompt labels with shape (N, ).
+            masks (list[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.

         Returns:
@@ -688,7 +688,7 @@ class Predictor(BasePredictor):
                 Each box is in xyxy format with additional columns for score and class.

         Notes:
-            - The input features is a torch.Tensor of shape (B, C, H, W) if performing on SAM, or a
+            - The input features is a torch.Tensor of shape (B, C, H, W) if performing on SAM, or a dict[str, Any] if performing on SAM2.
         """
         dst_shape = dst_shape or (self.args.imgsz, self.args.imgsz)
         prompts = self._prepare_prompts(dst_shape, src_shape, bboxes, points, labels, masks)
@@ -714,12 +714,12 @@ class SAM2Predictor(Predictor):
     prompt-based inference.

     Attributes:
-        _bb_feat_sizes (
+        _bb_feat_sizes (list[tuple]): Feature sizes for different backbone levels.
         model (torch.nn.Module): The loaded SAM2 model.
         device (torch.device): The device (CPU or GPU) on which the model is loaded.
         features (dict): Cached image features for efficient inference.
         segment_all (bool): Flag to indicate if all segments should be predicted.
-        prompts (
+        prompts (dict[str, Any]): Dictionary to store various types of prompts for inference.

     Methods:
         get_model: Retrieve and initialize the SAM2 model.
@@ -752,12 +752,12 @@ class SAM2Predictor(Predictor):
         Prepare and transform the input prompts for processing based on the destination shape.

         Args:
-            dst_shape (
-            src_shape (
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
-            masks (
+            dst_shape (tuple[int, int]): The target shape (height, width) for the prompts.
+            src_shape (tuple[int, int]): The source shape (height, width) of the input image.
+            bboxes (np.ndarray | list | None): Bounding boxes in XYXY format with shape (N, 4).
+            points (np.ndarray | list | None): Points indicating object locations with shape (N, 2) or (N, num_points, 2), in pixels.
+            labels (np.ndarray | list | None): Point prompt labels with shape (N,) or (N, num_points). 1 for foreground, 0 for background.
+            masks (list | np.ndarray | None): Masks for the objects, where each mask is a 2D array.

         Returns:
             points (torch.Tensor | None): Transformed points.
@@ -842,13 +842,13 @@ class SAM2Predictor(Predictor):
         Perform inference on image features using the SAM2 model.

         Args:
-            features (torch.Tensor |
+            features (torch.Tensor | dict[str, Any]): Extracted image features with shape (B, C, H, W) from the SAM2 model image encoder, it
                 could also be a dictionary including:
                 - image_embed (torch.Tensor): Image embedding with shape (B, C, H, W).
-                - high_res_feats (
-            points (np.ndarray |
-            labels (np.ndarray |
-            masks (
+                - high_res_feats (list[torch.Tensor]): List of high-resolution feature maps from the backbone, each with shape (B, C, H, W).
+            points (np.ndarray | list[list[float]] | None): Object location points with shape (N, 2), in pixels.
+            labels (np.ndarray | list[int] | None): Point prompt labels with shape (N,). 1 = foreground, 0 = background.
+            masks (list[np.ndarray] | np.ndarray | None): Masks for the objects, where each mask is a 2D array.
             multimask_output (bool): Flag to return multiple masks for ambiguous prompts.
             img_idx (int): Index of the image in the batch to process.

@@ -962,9 +962,9 @@ class SAM2VideoPredictor(SAM2Predictor):

         Args:
             im (torch.Tensor): The preprocessed input image in tensor format, with shape (N, C, H, W).
-            bboxes (np.ndarray |
-            points (np.ndarray |
-            labels (np.ndarray |
+            bboxes (np.ndarray | list, optional): Bounding boxes with shape (N, 4), in XYXY format.
+            points (np.ndarray | list, optional): Points indicating object locations with shape (N, 2), in pixels.
+            labels (np.ndarray | list, optional): Labels for point prompts, shape (N, ). 1 = foreground, 0 = background.
             masks (np.ndarray, optional): Low-resolution masks from previous predictions shape (N,H,W). For SAM H=W=256.

         Returns:
@@ -1036,9 +1036,9 @@ class SAM2VideoPredictor(SAM2Predictor):
         the masks do not overlap, which can be useful for certain applications.

         Args:
-            preds (
+            preds (tuple[torch.Tensor, torch.Tensor]): The predicted masks and scores from the model.
             img (torch.Tensor): The processed image tensor.
-            orig_imgs (
+            orig_imgs (list[np.ndarray]): The original images before processing.

         Returns:
             (list): The post-processed predictions.
@@ -1286,7 +1286,7 @@ class SAM2VideoPredictor(SAM2Predictor):
         Returns:
             vis_feats (torch.Tensor): The visual features extracted from the image.
             vis_pos_embed (torch.Tensor): The positional embeddings for the visual features.
-            feat_sizes (
+            feat_sizes (list[tuple]): A list containing the sizes of the extracted features.

         Note:
             - If `batch` is greater than 1, the features are expanded to fit the batch size.
@@ -1442,11 +1442,11 @@ class SAM2VideoPredictor(SAM2Predictor):
         the current batch size.

         Args:
-            out_maskmem_pos_enc (
+            out_maskmem_pos_enc (list[torch.Tensor] | None): The positional encoding for mask memory.
                 Should be a list of tensors or None.

         Returns:
-            (
+            (list[torch.Tensor]): The positional encoding for mask memory, either cached or expanded.

         Note:
             - The method assumes that `out_maskmem_pos_enc` is a list of tensors or None.
@@ -1730,10 +1730,10 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         specified overrides

         Args:
-            cfg (
-            overrides (
+            cfg (dict[str, Any]): Configuration dictionary containing default settings.
+            overrides (dict[str, Any] | None): Dictionary of values to override default configuration.
             max_obj_num (int): Maximum number of objects to track. Default is 3. this is set to keep fix feature size for the model.
-            _callbacks (
+            _callbacks (dict[str, Any] | None): Dictionary of callback functions to customize behavior.

         Examples:
             >>> predictor = SAM2DynamicInteractivePredictor(cfg=DEFAULT_CFG)
@@ -1778,11 +1778,11 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):

         Args:
             im (torch.Tensor | np.ndarray): The input image tensor or numpy array.
-            bboxes (
-            masks (
-            points (
-            labels (
-            obj_ids (
+            bboxes (list[list[float]] | None): Optional list of bounding boxes to update the memory.
+            masks (list[torch.Tensor | np.ndarray] | None): Optional masks to update the memory.
+            points (list[list[float]] | None): Optional list of points to update the memory, each point is [x, y].
+            labels (list[int] | None): Optional list of object IDs corresponding to the points (>0 for positive, 0 for negative).
+            obj_ids (list[int] | None): Optional list of object IDs corresponding to the prompts.
             update_memory (bool): Flag to indicate whether to update the memory with new objects.

         Returns:
@@ -1855,7 +1855,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         Append the imgState to the memory_bank and update the memory for the model.

         Args:
-            obj_ids (
+            obj_ids (list[int]): List of object IDs corresponding to the prompts.
             points (torch.Tensor | None): Tensor of shape (B, N, 2) representing the input points for N objects.
             labels (torch.Tensor | None): Tensor of shape (B, N) representing the labels for the input points.
             masks (torch.Tensor | None): Optional tensor of shape (N, H, W) representing the input masks for N objects.
@@ -2009,7 +2009,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             mask (torch.Tensor | None): The mask input for the object with shape (H, W).

         Returns:
-            current_out (
+            current_out (dict[str, Any]): A dictionary containing the current output with mask predictions and object pointers.
                 Keys include 'point_inputs', 'mask_inputs', 'pred_masks', 'pred_masks_high_res', 'obj_ptr', 'object_score_logits'.
         """
         if mask is not None and self.model.use_mask_input_as_output_without_sam:
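These docstring updates pin down the prompt formats the SAM predictors accept: XYXY pixel boxes with shape (N, 4), pixel-coordinate points with shape (N, 2), and 1/0 point labels. A minimal usage sketch of that interface (the checkpoint name and image path are illustrative):

```python
from ultralytics import SAM

model = SAM("sam2_b.pt")  # illustrative checkpoint name

# Box prompt: XYXY pixels with shape (N, 4)
results = model("image.jpg", bboxes=[[100, 100, 400, 400]])

# Point prompt: (N, 2) pixel coordinates; labels use 1 = foreground, 0 = background
results = model("image.jpg", points=[[250, 250]], labels=[1])

for r in results:  # list[Results], as documented in postprocess above
    print(r.masks.data.shape)  # (num_masks, H, W)
```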
ultralytics/models/utils/loss.py CHANGED
@@ -23,7 +23,7 @@ class DETRLoss(nn.Module):

     Attributes:
         nc (int): Number of classes.
-        loss_gain (
+        loss_gain (dict[str, float]): Coefficients for different loss components.
         aux_loss (bool): Whether to compute auxiliary losses.
         use_fl (bool): Whether to use FocalLoss.
         use_vfl (bool): Whether to use VarifocalLoss.
@@ -55,7 +55,7 @@ class DETRLoss(nn.Module):

         Args:
             nc (int): Number of classes.
-            loss_gain (
+            loss_gain (dict[str, float], optional): Coefficients for different loss components.
             aux_loss (bool): Whether to use auxiliary losses from each decoder layer.
             use_fl (bool): Whether to use FocalLoss.
             use_vfl (bool): Whether to use VarifocalLoss.
@@ -93,7 +93,7 @@ class DETRLoss(nn.Module):
             postfix (str, optional): String to append to the loss name for identification in multi-loss scenarios.

         Returns:
-            (
+            (dict[str, torch.Tensor]): Dictionary containing classification loss value.

         Notes:
             The function supports different classification loss types:
@@ -133,7 +133,7 @@ class DETRLoss(nn.Module):
             postfix (str, optional): String to append to the loss names for identification in multi-loss scenarios.

         Returns:
-            (
+            (dict[str, torch.Tensor]): Dictionary containing:
                 - loss_bbox{postfix}: L1 loss between predicted and ground truth boxes, scaled by the bbox loss gain.
                 - loss_giou{postfix}: GIoU loss between predicted and ground truth boxes, scaled by the giou loss gain.

@@ -207,14 +207,14 @@ class DETRLoss(nn.Module):
             pred_scores (torch.Tensor): Predicted scores from auxiliary layers.
             gt_bboxes (torch.Tensor): Ground truth bounding boxes.
             gt_cls (torch.Tensor): Ground truth classes.
-            gt_groups (
-            match_indices (
+            gt_groups (list[int]): Number of ground truths per image.
+            match_indices (list[tuple], optional): Pre-computed matching indices.
             postfix (str, optional): String to append to loss names.
             masks (torch.Tensor, optional): Predicted masks if using segmentation.
             gt_mask (torch.Tensor, optional): Ground truth masks if using segmentation.

         Returns:
-            (
+            (dict[str, torch.Tensor]): Dictionary of auxiliary losses.
         """
         # NOTE: loss class, bbox, giou, mask, dice
         loss = torch.zeros(5 if masks is not None else 3, device=pred_bboxes.device)
@@ -265,10 +265,10 @@ class DETRLoss(nn.Module):
         Extract batch indices, source indices, and destination indices from match indices.

         Args:
-            match_indices (
+            match_indices (list[tuple]): List of tuples containing matched indices.

         Returns:
-            batch_idx (
+            batch_idx (tuple[torch.Tensor, torch.Tensor]): Tuple containing (batch_idx, src_idx).
             dst_idx (torch.Tensor): Destination indices.
         """
         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
@@ -285,7 +285,7 @@ class DETRLoss(nn.Module):
         Args:
             pred_bboxes (torch.Tensor): Predicted bounding boxes.
             gt_bboxes (torch.Tensor): Ground truth bounding boxes.
-            match_indices (
+            match_indices (list[tuple]): List of tuples containing matched indices.

         Returns:
             pred_assigned (torch.Tensor): Assigned predicted bounding boxes.
@@ -325,14 +325,14 @@ class DETRLoss(nn.Module):
             pred_scores (torch.Tensor): Predicted class scores.
             gt_bboxes (torch.Tensor): Ground truth bounding boxes.
             gt_cls (torch.Tensor): Ground truth classes.
-            gt_groups (
+            gt_groups (list[int]): Number of ground truths per image.
             masks (torch.Tensor, optional): Predicted masks if using segmentation.
             gt_mask (torch.Tensor, optional): Ground truth masks if using segmentation.
             postfix (str, optional): String to append to loss names.
-            match_indices (
+            match_indices (list[tuple], optional): Pre-computed matching indices.

         Returns:
-            (
+            (dict[str, torch.Tensor]): Dictionary of losses.
         """
         if match_indices is None:
             match_indices = self.matcher(
@@ -370,12 +370,12 @@ class DETRLoss(nn.Module):
         Args:
             pred_bboxes (torch.Tensor): Predicted bounding boxes, shape (L, B, N, 4).
             pred_scores (torch.Tensor): Predicted class scores, shape (L, B, N, C).
-            batch (
+            batch (dict[str, Any]): Batch information containing cls, bboxes, and gt_groups.
             postfix (str, optional): Postfix for loss names.
             **kwargs (Any): Additional arguments, may include 'match_indices'.

         Returns:
-            (
+            (dict[str, torch.Tensor]): Computed losses, including main and auxiliary (if enabled).

         Notes:
             Uses last elements of pred_bboxes and pred_scores for main loss, and the rest for auxiliary losses if
@@ -419,14 +419,14 @@ class RTDETRDetectionLoss(DETRLoss):
         Forward pass to compute detection loss with optional denoising loss.

         Args:
-            preds (
-            batch (
+            preds (tuple[torch.Tensor, torch.Tensor]): Tuple containing predicted bounding boxes and scores.
+            batch (dict[str, Any]): Batch data containing ground truth information.
             dn_bboxes (torch.Tensor, optional): Denoising bounding boxes.
             dn_scores (torch.Tensor, optional): Denoising scores.
-            dn_meta (
+            dn_meta (dict[str, Any], optional): Metadata for denoising.

         Returns:
-            (
+            (dict[str, torch.Tensor]): Dictionary containing total loss and denoising loss if applicable.
         """
         pred_bboxes, pred_scores = preds
         total_loss = super().forward(pred_bboxes, pred_scores, batch)
@@ -456,12 +456,12 @@ class RTDETRDetectionLoss(DETRLoss):
         Get match indices for denoising.

         Args:
-            dn_pos_idx (
+            dn_pos_idx (list[torch.Tensor]): List of tensors containing positive indices for denoising.
             dn_num_group (int): Number of denoising groups.
-            gt_groups (
+            gt_groups (list[int]): List of integers representing number of ground truths per image.

         Returns:
-            (
+            (list[tuple[torch.Tensor, torch.Tensor]]): List of tuples containing matched indices for denoising.
         """
         dn_match_indices = []
         idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
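The `gt_groups` and `match_indices` structures that recur in these docstrings are plain containers; a short sketch with made-up values, reusing the flattening logic visible as context in the `_get_index` hunk above:

```python
import torch

# gt_groups (list[int]): number of ground-truth boxes per image in the batch.
gt_groups = [2, 0, 3]

# match_indices (list[tuple[torch.Tensor, torch.Tensor]]): one (pred_idx, gt_idx)
# pair per image, as produced by the Hungarian matcher.
match_indices = [
    (torch.tensor([5, 17]), torch.tensor([0, 1])),  # image 0: 2 matches
    (torch.empty(0, dtype=torch.long), torch.empty(0, dtype=torch.long)),  # image 1: none
    (torch.tensor([3, 8, 42]), torch.tensor([2, 0, 1])),  # image 2: 3 matches
]

# Flatten into (batch_idx, src_idx), mirroring the context line shown above.
batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
src_idx = torch.cat([src for src, _ in match_indices])
print(batch_idx.tolist(), src_idx.tolist())  # [0, 0, 2, 2, 2] [5, 17, 3, 8, 42]
```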
ultralytics/models/utils/ops.py CHANGED
@@ -22,7 +22,7 @@ class HungarianMatcher(nn.Module):
     used in end-to-end object detection models like DETR.

     Attributes:
-        cost_gain (
+        cost_gain (dict[str, float]): Dictionary of cost coefficients for 'class', 'bbox', 'giou', 'mask', and 'dice'
             components.
         use_fl (bool): Whether to use Focal Loss for classification cost calculation.
         with_mask (bool): Whether the model makes mask predictions.
@@ -60,7 +60,7 @@ class HungarianMatcher(nn.Module):
         Initialize HungarianMatcher for optimal assignment of predicted and ground truth bounding boxes.

         Args:
-            cost_gain (
+            cost_gain (dict[str, float], optional): Dictionary of cost coefficients for different matching cost
                 components. Should contain keys 'class', 'bbox', 'giou', 'mask', and 'dice'.
             use_fl (bool): Whether to use Focal Loss for classification cost calculation.
             with_mask (bool): Whether the model makes mask predictions.
@@ -100,12 +100,12 @@ class HungarianMatcher(nn.Module):
                 num_classes).
             gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape (num_gts, 4).
             gt_cls (torch.Tensor): Ground truth class labels with shape (num_gts,).
-            gt_groups (
+            gt_groups (list[int]): Number of ground truth boxes for each image in the batch.
             masks (torch.Tensor, optional): Predicted masks with shape (batch_size, num_queries, height, width).
-            gt_mask (
+            gt_mask (list[torch.Tensor], optional): Ground truth masks, each with shape (num_masks, Height, Width).

         Returns:
-            (
+            (list[tuple[torch.Tensor, torch.Tensor]]): A list of size batch_size, each element is a tuple
                 (index_i, index_j), where index_i is the tensor of indices of the selected predictions (in order)
                 and index_j is the tensor of indices of the corresponding selected ground truth targets (in order).
                 For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
@@ -205,8 +205,8 @@ def get_cdn_group(
     bounding boxes and class labels. It generates both positive and negative samples to improve model robustness.

     Args:
-        batch (
-            'gt_bboxes' (torch.Tensor with shape (num_gts, 4)), and 'gt_groups' (
+        batch (dict[str, Any]): Batch dictionary containing 'gt_cls' (torch.Tensor with shape (num_gts,)),
+            'gt_bboxes' (torch.Tensor with shape (num_gts, 4)), and 'gt_groups' (list[int]) indicating number of
             ground truths per image.
         num_classes (int): Total number of object classes.
         num_queries (int): Number of object queries.
@@ -220,7 +220,7 @@ def get_cdn_group(
         padding_cls (torch.Tensor | None): Modified class embeddings for denoising with shape (bs, num_dn, embed_dim).
         padding_bbox (torch.Tensor | None): Modified bounding boxes for denoising with shape (bs, num_dn, 4).
         attn_mask (torch.Tensor | None): Attention mask for denoising with shape (tgt_size, tgt_size).
-        dn_meta (
+        dn_meta (dict[str, Any] | None): Meta information dictionary containing denoising parameters.

     Examples:
         Generate denoising group for training
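`HungarianMatcher.forward` documents its return as per-image `(index_i, index_j)` pairs with `len(index_i) == len(index_j) == min(num_queries, num_target_boxes)`. The same format is what `scipy.optimize.linear_sum_assignment` produces; a standalone sketch on a random cost matrix (not the library's actual class/bbox/giou cost computation):

```python
import torch
from scipy.optimize import linear_sum_assignment

cost = torch.rand(8, 3)  # (num_queries, num_gts) cost matrix for one image
index_i, index_j = linear_sum_assignment(cost.numpy())
print(index_i, index_j)  # 3 matched (prediction, ground-truth) index pairs
```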
ultralytics/models/yolo/classify/predict.py CHANGED
@@ -78,10 +78,10 @@ class ClassificationPredictor(BasePredictor):
         Args:
             preds (torch.Tensor): Raw predictions from the model.
             img (torch.Tensor): Input images after preprocessing.
-            orig_imgs (
+            orig_imgs (list[np.ndarray] | torch.Tensor): Original images before preprocessing.

         Returns:
-            (
+            (list[Results]): List of Results objects containing classification results for each image.
         """
         if not isinstance(orig_imgs, list):  # Input images are a torch.Tensor, not a list
             orig_imgs = ops.convert_torch2numpy_batch(orig_imgs)

ultralytics/models/yolo/classify/train.py CHANGED
@@ -25,8 +25,8 @@ class ClassificationTrainer(BaseTrainer):

     Attributes:
         model (ClassificationModel): The classification model to be trained.
-        data (
-        loss_names (
+        data (dict[str, Any]): Dictionary containing dataset information including class names and number of classes.
+        loss_names (list[str]): Names of the loss functions used during training.
         validator (ClassificationValidator): Validator instance for model evaluation.

     Methods:
@@ -59,9 +59,9 @@ class ClassificationTrainer(BaseTrainer):
         image size if not specified.

         Args:
-            cfg (
-            overrides (
-            _callbacks (
+            cfg (dict[str, Any], optional): Default configuration dictionary containing training parameters.
+            overrides (dict[str, Any], optional): Dictionary of parameter overrides for the default configuration.
+            _callbacks (list[Any], optional): List of callback functions to be executed during training.

         Examples:
             Create a trainer with custom configuration
@@ -196,8 +196,8 @@ class ClassificationTrainer(BaseTrainer):
             prefix (str, optional): Prefix to prepend to loss names.

         Returns:
-            keys (
-            loss_dict (
+            keys (list[str]): List of loss keys if loss_items is None.
+            loss_dict (dict[str, float]): Dictionary of loss items if loss_items is provided.
         """
         keys = [f"{prefix}/{x}" for x in self.loss_names]
         if loss_items is None:
@@ -227,7 +227,7 @@ class ClassificationTrainer(BaseTrainer):
         Plot training samples with their annotations.

         Args:
-            batch (
+            batch (dict[str, torch.Tensor]): Batch containing images and class labels.
             ni (int): Number of iterations.
         """
         batch["batch_idx"] = torch.arange(len(batch["img"]))  # add batch index for plotting

ultralytics/models/yolo/classify/val.py CHANGED
@@ -22,8 +22,8 @@ class ClassificationValidator(BaseValidator):
     confusion matrix generation, and visualization of results.

     Attributes:
-        targets (
-        pred (
+        targets (list[torch.Tensor]): Ground truth class labels.
+        pred (list[torch.Tensor]): Model predictions.
         metrics (ClassifyMetrics): Object to calculate and store classification metrics.
         names (dict): Mapping of class indices to class names.
         nc (int): Number of classes.
@@ -170,7 +170,7 @@ class ClassificationValidator(BaseValidator):
         Plot validation image samples with their ground truth labels.

         Args:
-            batch (
+            batch (dict[str, Any]): Dictionary containing batch data with 'img' (images) and 'cls' (class labels).
             ni (int): Batch index used for naming the output file.

         Examples:
@@ -191,7 +191,7 @@ class ClassificationValidator(BaseValidator):
         Plot images with their predicted class labels and save the visualization.

         Args:
-            batch (
+            batch (dict[str, Any]): Batch data containing images and other information.
             preds (torch.Tensor): Model predictions with shape (batch_size, num_classes).
             ni (int): Batch index used for naming the output file.

ultralytics/models/yolo/detect/predict.py CHANGED
@@ -96,12 +96,12 @@ class DetectionPredictor(BasePredictor):
         Construct a list of Results objects from model predictions.

         Args:
-            preds (
+            preds (list[torch.Tensor]): List of predicted bounding boxes and scores for each image.
             img (torch.Tensor): Batch of preprocessed images used for inference.
-            orig_imgs (
+            orig_imgs (list[np.ndarray]): List of original images before preprocessing.

         Returns:
-            (
+            (list[Results]): List of Results objects containing detection information for each image.
         """
         return [
             self.construct_result(pred, img, orig_img, img_path)
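All of the postprocess methods touched here return `(list[Results])`, one entry per input image; a brief consumer-side sketch (weights and image path are illustrative):

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")
results = model("bus.jpg")  # list[Results], one Results object per input image
for r in results:
    print(r.boxes.xyxy.shape, r.boxes.conf.shape, r.boxes.cls.shape)
```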