ultralytics 8.3.36__py3-none-any.whl → 8.3.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +7 -6
- ultralytics/cfg/default.yaml +1 -2
- ultralytics/data/augment.py +4 -5
- ultralytics/data/loaders.py +1 -1
- ultralytics/engine/exporter.py +5 -4
- ultralytics/engine/model.py +17 -0
- ultralytics/models/sam/__init__.py +2 -2
- ultralytics/models/sam/model.py +1 -1
- ultralytics/models/sam/modules/sam.py +16 -39
- ultralytics/models/sam/predict.py +817 -28
- ultralytics/nn/modules/block.py +2 -2
- ultralytics/nn/modules/conv.py +1 -1
- ultralytics/solutions/parking_management.py +1 -1
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/utils/matching.py +3 -4
- ultralytics/utils/__init__.py +8 -6
- ultralytics/utils/loss.py +2 -3
- ultralytics/utils/metrics.py +12 -13
- ultralytics/utils/ops.py +20 -14
- ultralytics/utils/plotting.py +14 -14
- {ultralytics-8.3.36.dist-info → ultralytics-8.3.38.dist-info}/METADATA +3 -3
- {ultralytics-8.3.36.dist-info → ultralytics-8.3.38.dist-info}/RECORD +27 -27
- {ultralytics-8.3.36.dist-info → ultralytics-8.3.38.dist-info}/LICENSE +0 -0
- {ultralytics-8.3.36.dist-info → ultralytics-8.3.38.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.36.dist-info → ultralytics-8.3.38.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.36.dist-info → ultralytics-8.3.38.dist-info}/top_level.txt +0 -0
ultralytics/__init__.py
CHANGED
ultralytics/cfg/__init__.py
CHANGED
@@ -83,13 +83,13 @@ SOLUTIONS_HELP_MSG = f"""
     See all ARGS at https://docs.ultralytics.com/usage/cfg or with 'yolo cfg'

     1. Call object counting solution
-        yolo solutions count source="path/to/video/file.mp4" region=[(20, 400), (1080,
+        yolo solutions count source="path/to/video/file.mp4" region=[(20, 400), (1080, 400), (1080, 360), (20, 360)]

     2. Call heatmaps solution
         yolo solutions heatmap colormap=cv2.COLORMAP_PARULA model=yolo11n.pt

     3. Call queue management solution
-        yolo solutions queue region=[(20, 400), (1080,
+        yolo solutions queue region=[(20, 400), (1080, 400), (1080, 360), (20, 360)] model=yolo11n.pt

     4. Call workouts monitoring solution for push-ups
         yolo solutions workout model=yolo11n-pose.pt kpts=[6, 8, 10]

@@ -160,7 +160,6 @@ CFG_FRACTION_KEYS = { # fractional float arguments with 0.0<=values<=1.0
     "weight_decay",
     "warmup_momentum",
     "warmup_bias_lr",
-    "label_smoothing",
     "hsv_h",
     "hsv_s",
     "hsv_v",

@@ -436,6 +435,9 @@ def _handle_deprecation(custom):
         if key == "line_thickness":
             deprecation_warn(key, "line_width")
             custom["line_width"] = custom.pop("line_thickness")
+        if key == "label_smoothing":
+            deprecation_warn(key)
+            custom.pop("label_smoothing")

     return custom

@@ -738,9 +740,8 @@ def parse_key_value_pair(pair: str = "key=value"):
         pair (str): A string containing a key-value pair in the format "key=value".

     Returns:
-        (
-
-        - value (str): The parsed value.
+        key (str): The parsed key.
+        value (str): The parsed value.

     Raises:
         AssertionError: If the value is missing or empty.
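The practical effect of the `_handle_deprecation` hunk is that a leftover `label_smoothing` override no longer fails config validation: it is warned about and dropped before training starts. A minimal sketch of that behaviour, assuming the standard `YOLO(...).train(...)` entry point (checkpoint, dataset, and value are illustrative):

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")  # any detection checkpoint

# In 8.3.38 this emits a deprecation warning and silently drops the key;
# it no longer maps to the removed `label_smoothing` hyperparameter.
model.train(data="coco8.yaml", epochs=1, label_smoothing=0.1)
```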
ultralytics/cfg/default.yaml
CHANGED
@@ -83,7 +83,7 @@ int8: False # (bool) CoreML/TF INT8 quantization
 dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes
 simplify: True # (bool) ONNX: simplify model using `onnxslim`
 opset: # (int, optional) ONNX: opset version
-workspace:
+workspace: None # (float, optional) TensorRT: workspace size (GiB), `None` will let TensorRT auto-allocate memory
 nms: False # (bool) CoreML: add NMS

 # Hyperparameters ------------------------------------------------------------------------------------------------------

@@ -99,7 +99,6 @@ cls: 0.5 # (float) cls loss gain (scale with pixels)
 dfl: 1.5 # (float) dfl loss gain
 pose: 12.0 # (float) pose loss gain
 kobj: 1.0 # (float) keypoint obj loss gain
-label_smoothing: 0.0 # (float) label smoothing (fraction)
 nbs: 64 # (int) nominal batch size
 hsv_h: 0.015 # (float) image HSV-Hue augmentation (fraction)
 hsv_s: 0.7 # (float) image HSV-Saturation augmentation (fraction)
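With `workspace` now defaulting to `None`, TensorRT engine exports only apply an explicit memory-pool limit when a size is supplied. A hedged sketch of both paths via the Python export API (checkpoint name is illustrative; a CUDA device and TensorRT install are assumed):

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")

# Default in 8.3.38: workspace=None, TensorRT auto-allocates its workspace memory.
model.export(format="engine")

# Explicit cap, e.g. 2 GiB, restores the previous bounded-workspace behaviour.
model.export(format="engine", workspace=2)
```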
ultralytics/data/augment.py
CHANGED
@@ -1591,7 +1591,7 @@ class LetterBox:
             labels["ratio_pad"] = (labels["ratio_pad"], (left, top))  # for evaluation

         if len(labels):
-            labels = self._update_labels(labels, ratio,
+            labels = self._update_labels(labels, ratio, left, top)
         labels["img"] = img
         labels["resized_shape"] = new_shape
         return labels

@@ -2111,10 +2111,9 @@ class Format:
             h (int): Height of the image.

         Returns:
-            (
-
-
-            cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.
+            masks (numpy.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
+            instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
+            cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.

         Notes:
             - If self.mask_overlap is True, masks are overlapped and sorted by area.
ultralytics/data/loaders.py
CHANGED
@@ -354,7 +354,7 @@ class LoadImagesAndVideos:
         self.nf = ni + nv  # number of files
         self.ni = ni  # number of images
         self.video_flag = [False] * ni + [True] * nv
-        self.mode = "image"
+        self.mode = "video" if ni == 0 else "image"  # default to video if no images
         self.vid_stride = vid_stride  # video frame-rate stride
         self.bs = batch
         if any(videos):
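The loader change above only affects the initial `mode` reported before the first item is read: a video-only source now starts out as `"video"` instead of `"image"`. A small sketch, assuming the class is constructed directly with a local video file (the filename is a placeholder):

```python
from ultralytics.data.loaders import LoadImagesAndVideos

dataset = LoadImagesAndVideos("clip.mp4")  # hypothetical local video file
print(dataset.mode)  # "video" in 8.3.38; previously "image" until the first frame was read
```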
ultralytics/engine/exporter.py
CHANGED
@@ -220,6 +220,7 @@ class Exporter:
             self.args.device = "0"
         if fmt == "engine" and "dla" in str(self.args.device):  # convert int/list to str first
             dla = self.args.device.split(":")[-1]
+            self.args.device = "0"  # update device to "0"
             assert dla in {"0", "1"}, f"Expected self.args.device='dla:0' or 'dla:1, but got {self.args.device}."
         self.device = select_device("cpu" if self.args.device is None else self.args.device)

@@ -781,10 +782,10 @@
         # Engine builder
         builder = trt.Builder(logger)
         config = builder.create_builder_config()
-        workspace = int(self.args.workspace * (1 << 30))
-        if is_trt10:
+        workspace = int(self.args.workspace * (1 << 30)) if self.args.workspace is not None else 0
+        if is_trt10 and workspace > 0:
             config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
-
+        elif workspace > 0 and not is_trt10:  # TensorRT versions 7, 8
             config.max_workspace_size = workspace
         flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
         network = builder.create_network(flag)

@@ -823,7 +824,7 @@
             LOGGER.warning(f"{prefix} WARNING ⚠️ 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
             profile = builder.create_optimization_profile()
             min_shape = (1, shape[1], 32, 32)  # minimum input shape
-            max_shape = (*shape[:2], *(int(max(1,
+            max_shape = (*shape[:2], *(int(max(1, workspace) * d) for d in shape[2:]))  # max input shape
             for inp in inputs:
                 profile.set_shape(inp.name, min=min_shape, opt=shape, max=max_shape)
             config.add_optimization_profile(profile)
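The DLA branch touched in the first hunk now also rewrites `args.device` to GPU `"0"` after extracting the DLA core index. A hedged usage sketch of that export path (it only runs on a Jetson-class device with TensorRT; the FP16 flag reflects the exporter's requirement that DLA use half or int8 precision):

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")  # illustrative checkpoint

# "dla:0" selects DLA core 0; the exporter parses the core id, then resets device to "0".
model.export(format="engine", device="dla:0", half=True)
```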
ultralytics/engine/model.py
CHANGED
@@ -1126,3 +1126,20 @@ class Model(nn.Module):
         description of the expected behavior and structure.
         """
         raise NotImplementedError("Please provide task map for your model!")
+
+    def eval(self):
+        """
+        Sets the model to evaluation mode.
+
+        This method changes the model's mode to evaluation, which affects layers like dropout and batch normalization
+        that behave differently during training and evaluation.
+
+        Returns:
+            (Model): The model instance with evaluation mode set.
+
+        Examples:
+            >>> model = YOLO("yolo11n.pt")
+            >>> model.eval()
+        """
+        self.model.eval()
+        return self
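The new `eval()` passthrough is mostly useful when the wrapped `nn.Module` is driven directly, e.g. in a custom inference loop; it returns the `Model` wrapper so the call can be chained. A short usage sketch (checkpoint and input size are illustrative):

```python
import torch
from ultralytics import YOLO

model = YOLO("yolo11n.pt").eval()  # chainable: returns the Model wrapper

with torch.no_grad():
    dummy = torch.zeros(1, 3, 640, 640)
    preds = model.model(dummy)  # run the underlying nn.Module directly in eval mode
```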
ultralytics/models/sam/__init__.py
CHANGED
@@ -1,6 +1,6 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

 from .model import SAM
-from .predict import Predictor, SAM2Predictor
+from .predict import Predictor, SAM2Predictor, SAM2VideoPredictor

-__all__ = "SAM", "Predictor", "SAM2Predictor"  # tuple or list
+__all__ = "SAM", "Predictor", "SAM2Predictor", "SAM2VideoPredictor"  # tuple or list
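`SAM2VideoPredictor` (the bulk of the `predict.py` additions listed at the top) is now part of the package's public surface. A hedged construction sketch, assuming it follows the usual Ultralytics predictor `overrides` pattern; the exact argument names and prompt options should be checked against the 8.3.38 docs:

```python
from ultralytics.models.sam import SAM2VideoPredictor

# Assumed overrides-style construction, mirroring other Ultralytics predictors.
overrides = dict(task="segment", mode="predict", model="sam2_b.pt", imgsz=1024)
predictor = SAM2VideoPredictor(overrides=overrides)

# Assumed prompt interface: track a single positive point through the video.
results = predictor(source="video.mp4", points=[920, 470], labels=[1])
```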
ultralytics/models/sam/model.py
CHANGED
@@ -148,7 +148,7 @@ class SAM(Model):
             verbose (bool): If True, prints the information to the console.

         Returns:
-            (
+            (tuple): A tuple containing the model's information (string representations of the model).

         Examples:
             >>> sam = SAM("sam_b.pt")
ultralytics/models/sam/modules/sam.py
CHANGED
@@ -36,8 +36,6 @@ class SAMModel(nn.Module):
         image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
         prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
         mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
-        pixel_mean (torch.Tensor): Mean pixel values for image normalization, shape (3, 1, 1).
-        pixel_std (torch.Tensor): Standard deviation values for image normalization, shape (3, 1, 1).

     Methods:
         __init__: Initializes the SAMModel with encoders, decoder, and normalization parameters.

@@ -349,8 +347,7 @@ class SAM2Model(torch.nn.Module):
         self.sam_prompt_embed_dim = self.hidden_dim
         self.sam_image_embedding_size = self.image_size // self.backbone_stride

-        #
-        # (their hyperparameters like `mask_in_chans=16` are from SAM code)
+        # Build PromptEncoder and MaskDecoder from SAM (hyperparameters like `mask_in_chans=16` are from SAM code)
         self.sam_prompt_encoder = PromptEncoder(
             embed_dim=self.sam_prompt_embed_dim,
             image_embedding_size=(

@@ -425,8 +422,8 @@
             low_res_multimasks: Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
             high_res_multimasks: Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
             ious: Tensor of shape (B, M) with estimated IoU for each output mask.
-            low_res_masks: Tensor of shape (B, 1, H*4, W*4) with best low-resolution mask.
-            high_res_masks: Tensor of shape (B, 1, H*16, W*16) with best high-resolution mask.
+            low_res_masks: Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
+            high_res_masks: Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
             obj_ptr: Tensor of shape (B, C) with object pointer vector for the output mask.
             object_score_logits: Tensor of shape (B,) with object score logits.

@@ -488,12 +485,7 @@
             boxes=None,
             masks=sam_mask_prompt,
         )
-        (
-            low_res_multimasks,
-            ious,
-            sam_output_tokens,
-            object_score_logits,
-        ) = self.sam_mask_decoder(
+        low_res_multimasks, ious, sam_output_tokens, object_score_logits = self.sam_mask_decoder(
             image_embeddings=backbone_features,
             image_pe=self.sam_prompt_encoder.get_dense_pe(),
             sparse_prompt_embeddings=sparse_embeddings,

@@ -505,13 +497,8 @@
         if self.pred_obj_scores:
             is_obj_appearing = object_score_logits > 0

-            #
-
-            low_res_multimasks = torch.where(
-                is_obj_appearing[:, None, None],
-                low_res_multimasks,
-                NO_OBJ_SCORE,
-            )
+            # Spatial memory mask is a *hard* choice between obj and no obj, consistent with actual mask prediction
+            low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks, NO_OBJ_SCORE)

         # convert masks from possibly bfloat16 (or float16) to float32
         # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)

@@ -617,7 +604,6 @@

     def _prepare_backbone_features(self, backbone_out):
         """Prepares and flattens visual features from the image backbone output for further processing."""
-        backbone_out = backbone_out.copy()
         assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
         assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels

@@ -826,11 +812,7 @@
             mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
         if self.sigmoid_bias_for_mem_enc != 0.0:
             mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
-        maskmem_out = self.memory_encoder(
-            pix_feat,
-            mask_for_mem,
-            skip_mask_sigmoid=True,  # sigmoid already applied
-        )
+        maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True)  # sigmoid already applied
         maskmem_features = maskmem_out["vision_features"]
         maskmem_pos_enc = maskmem_out["vision_pos_enc"]
         # add a no-object embedding to the spatial memory to indicate that the frame

@@ -965,16 +947,7 @@
             track_in_reverse,
             prev_sam_mask_logits,
         )
-
-        (
-            _,
-            _,
-            _,
-            low_res_masks,
-            high_res_masks,
-            obj_ptr,
-            object_score_logits,
-        ) = sam_outputs
+        _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

         current_out["pred_masks"] = low_res_masks
         current_out["pred_masks_high_res"] = high_res_masks

@@ -984,8 +957,7 @@
         # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
         current_out["object_score_logits"] = object_score_logits

-        #
-        # it into a new memory feature (that can be used in future frames)
+        # Run memory encoder on the predicted mask to encode it into a new memory feature (for use in future frames)
         self._encode_memory_in_output(
             current_vision_feats,
             feat_sizes,

@@ -1007,8 +979,9 @@
             and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
         )

-
-
+    @staticmethod
+    def _apply_non_overlapping_constraints(pred_masks):
+        """Applies non-overlapping constraints to masks, keeping the highest scoring object per location."""
         batch_size = pred_masks.size(0)
         if batch_size == 1:
             return pred_masks

@@ -1024,6 +997,10 @@
         pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
         return pred_masks

+    def set_binarize(self, binarize=False):
+        """Set binarize for VideoPredictor."""
+        self.binarize_mask_from_pts_for_mem_enc = binarize
+
     def set_imgsz(self, imgsz):
         """
         Set image size to make model compatible with different image sizes.