ultralytics 8.3.36__py3-none-any.whl → 8.3.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ultralytics/__init__.py CHANGED
@@ -1,6 +1,6 @@
  # Ultralytics YOLO 🚀, AGPL-3.0 license
 
- __version__ = "8.3.36"
+ __version__ = "8.3.38"
 
  import os
 
@@ -83,13 +83,13 @@ SOLUTIONS_HELP_MSG = f"""
  See all ARGS at https://docs.ultralytics.com/usage/cfg or with 'yolo cfg'
 
  1. Call object counting solution
- yolo solutions count source="path/to/video/file.mp4" region=[(20, 400), (1080, 404), (1080, 360), (20, 360)]
+ yolo solutions count source="path/to/video/file.mp4" region=[(20, 400), (1080, 400), (1080, 360), (20, 360)]
 
  2. Call heatmaps solution
  yolo solutions heatmap colormap=cv2.COLORMAP_PARULA model=yolo11n.pt
 
  3. Call queue management solution
- yolo solutions queue region=[(20, 400), (1080, 404), (1080, 360), (20, 360)] model=yolo11n.pt
+ yolo solutions queue region=[(20, 400), (1080, 400), (1080, 360), (20, 360)] model=yolo11n.pt
 
  4. Call workouts monitoring solution for push-ups
  yolo solutions workout model=yolo11n-pose.pt kpts=[6, 8, 10]
@@ -160,7 +160,6 @@ CFG_FRACTION_KEYS = { # fractional float arguments with 0.0<=values<=1.0
  "weight_decay",
  "warmup_momentum",
  "warmup_bias_lr",
- "label_smoothing",
  "hsv_h",
  "hsv_s",
  "hsv_v",
@@ -436,6 +435,9 @@ def _handle_deprecation(custom):
  if key == "line_thickness":
  deprecation_warn(key, "line_width")
  custom["line_width"] = custom.pop("line_thickness")
+ if key == "label_smoothing":
+ deprecation_warn(key)
+ custom.pop("label_smoothing")
 
  return custom
 
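In practice, the new `_handle_deprecation` branch means a leftover `label_smoothing` override is warned about and dropped rather than rejected. A minimal sketch, assuming training overrides are routed through `_handle_deprecation` like other config overrides:

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")
# After 8.3.38, "label_smoothing" is no longer a config key: it should emit a
# deprecation warning and be discarded instead of raising a validation error.
model.train(data="coco8.yaml", epochs=1, label_smoothing=0.1)
```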
@@ -738,9 +740,8 @@ def parse_key_value_pair(pair: str = "key=value"):
  pair (str): A string containing a key-value pair in the format "key=value".
 
  Returns:
- (tuple): A tuple containing two elements:
- - key (str): The parsed key.
- - value (str): The parsed value.
+ key (str): The parsed key.
+ value (str): The parsed value.
 
  Raises:
  AssertionError: If the value is missing or empty.
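For reference, a hedged sketch of the documented behavior of this helper:

```python
from ultralytics.cfg import parse_key_value_pair

key, value = parse_key_value_pair("imgsz=640")  # splits on the first "=" and strips whitespace
print(key, value)  # -> imgsz 640
```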
@@ -83,7 +83,7 @@ int8: False # (bool) CoreML/TF INT8 quantization
  dynamic: False # (bool) ONNX/TF/TensorRT: dynamic axes
  simplify: True # (bool) ONNX: simplify model using `onnxslim`
  opset: # (int, optional) ONNX: opset version
- workspace: 4 # (int) TensorRT: workspace size (GB)
+ workspace: None # (float, optional) TensorRT: workspace size (GiB), `None` will let TensorRT auto-allocate memory
  nms: False # (bool) CoreML: add NMS
 
  # Hyperparameters ------------------------------------------------------------------------------------------------------
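What the new default means for users, as a usage sketch with the public `model.export()` API:

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")
model.export(format="engine")               # workspace=None: TensorRT auto-allocates its workspace
model.export(format="engine", workspace=2)  # explicit 2 GiB workspace limit, as before
```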
@@ -99,7 +99,6 @@ cls: 0.5 # (float) cls loss gain (scale with pixels)
  dfl: 1.5 # (float) dfl loss gain
  pose: 12.0 # (float) pose loss gain
  kobj: 1.0 # (float) keypoint obj loss gain
- label_smoothing: 0.0 # (float) label smoothing (fraction)
  nbs: 64 # (int) nominal batch size
  hsv_h: 0.015 # (float) image HSV-Hue augmentation (fraction)
  hsv_s: 0.7 # (float) image HSV-Saturation augmentation (fraction)
@@ -1591,7 +1591,7 @@ class LetterBox:
  labels["ratio_pad"] = (labels["ratio_pad"], (left, top)) # for evaluation
 
  if len(labels):
- labels = self._update_labels(labels, ratio, dw, dh)
+ labels = self._update_labels(labels, ratio, left, top)
  labels["img"] = img
  labels["resized_shape"] = new_shape
  return labels
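The label update now uses the same rounded integer offsets that were applied to the image border, rather than the fractional half-padding. An illustrative calculation (values made up; the rounding mirrors `LetterBox`):

```python
dw, dh = 3.5, 0.0                             # half-padding, may be fractional
left, top = round(dw - 0.1), round(dh - 0.1)  # 3, 0 -> what is actually added to the image border
# Boxes must therefore be shifted by (left, top) == (3, 0), not by (dw, dh) == (3.5, 0.0).
```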
@@ -2111,10 +2111,9 @@ class Format:
  h (int): Height of the image.
 
  Returns:
- (tuple): Tuple containing:
- masks (numpy.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
- instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
- cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.
+ masks (numpy.ndarray): Bitmap masks with shape (N, H, W) or (1, H, W) if mask_overlap is True.
+ instances (Instances): Updated instances object with sorted segments if mask_overlap is True.
+ cls (numpy.ndarray): Updated class labels, sorted if mask_overlap is True.
 
  Notes:
  - If self.mask_overlap is True, masks are overlapped and sorted by area.
@@ -354,7 +354,7 @@ class LoadImagesAndVideos:
  self.nf = ni + nv # number of files
  self.ni = ni # number of images
  self.video_flag = [False] * ni + [True] * nv
- self.mode = "image"
+ self.mode = "video" if ni == 0 else "image" # default to video if no images
  self.vid_stride = vid_stride # video frame-rate stride
  self.bs = batch
  if any(videos):
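A hedged illustration of the changed default (the path is hypothetical):

```python
from ultralytics.data.loaders import LoadImagesAndVideos

loader = LoadImagesAndVideos("path/to/video.mp4")  # a source containing only videos
print(loader.mode)  # "video" after this change; previously the loader started in "image" mode
```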
@@ -220,6 +220,7 @@ class Exporter:
  self.args.device = "0"
  if fmt == "engine" and "dla" in str(self.args.device): # convert int/list to str first
  dla = self.args.device.split(":")[-1]
+ self.args.device = "0" # update device to "0"
  assert dla in {"0", "1"}, f"Expected self.args.device='dla:0' or 'dla:1', but got {self.args.device}."
  self.device = select_device("cpu" if self.args.device is None else self.args.device)
 
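This touches the Jetson DLA path; a usage sketch of the export call it affects (the DLA core index is parsed out before the device string is rewritten to "0"):

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt")
model.export(format="engine", device="dla:0", half=True)  # DLA core 0; requires a Jetson device
```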
@@ -781,10 +782,10 @@ class Exporter:
  # Engine builder
  builder = trt.Builder(logger)
  config = builder.create_builder_config()
- workspace = int(self.args.workspace * (1 << 30))
- if is_trt10:
+ workspace = int(self.args.workspace * (1 << 30)) if self.args.workspace is not None else 0
+ if is_trt10 and workspace > 0:
  config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace)
- else: # TensorRT versions 7, 8
+ elif workspace > 0 and not is_trt10: # TensorRT versions 7, 8
  config.max_workspace_size = workspace
  flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
  network = builder.create_network(flag)
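The guard can be read in isolation as follows (a standalone restatement, not library code): the GiB value is converted to bytes only when set, and the TensorRT memory-pool limit is left untouched otherwise.

```python
workspace_gib = None  # self.args.workspace default after this release
workspace = int(workspace_gib * (1 << 30)) if workspace_gib is not None else 0  # GiB -> bytes
set_limit = workspace > 0  # only constrain TensorRT when a positive size was requested
```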
@@ -823,7 +824,7 @@ class Exporter:
  LOGGER.warning(f"{prefix} WARNING ⚠️ 'dynamic=True' model requires max batch size, i.e. 'batch=16'")
  profile = builder.create_optimization_profile()
  min_shape = (1, shape[1], 32, 32) # minimum input shape
- max_shape = (*shape[:2], *(int(max(1, self.args.workspace) * d) for d in shape[2:])) # max input shape
+ max_shape = (*shape[:2], *(int(max(1, workspace) * d) for d in shape[2:])) # max input shape
  for inp in inputs:
  profile.set_shape(inp.name, min=min_shape, opt=shape, max=max_shape)
  config.add_optimization_profile(profile)
@@ -1126,3 +1126,20 @@ class Model(nn.Module):
  description of the expected behavior and structure.
  """
  raise NotImplementedError("Please provide task map for your model!")
+
+ def eval(self):
+ """
+ Sets the model to evaluation mode.
+
+ This method changes the model's mode to evaluation, which affects layers like dropout and batch normalization
+ that behave differently during training and evaluation.
+
+ Returns:
+ (Model): The model instance with evaluation mode set.
+
+ Examples:
+ >>> model = YOLO("yolo11n.pt")
+ >>> model.eval()
+ """
+ self.model.eval()
+ return self
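Since `eval()` returns `self`, it chains like `nn.Module.eval()`; a short sketch:

```python
from ultralytics import YOLO

model = YOLO("yolo11n.pt").eval()  # switches the underlying nn.Module to evaluation mode
assert model.model.training is False
```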
@@ -1,6 +1,6 @@
  # Ultralytics YOLO 🚀, AGPL-3.0 license
 
  from .model import SAM
- from .predict import Predictor, SAM2Predictor
+ from .predict import Predictor, SAM2Predictor, SAM2VideoPredictor
 
- __all__ = "SAM", "Predictor", "SAM2Predictor" # tuple or list
+ __all__ = "SAM", "Predictor", "SAM2Predictor", "SAM2VideoPredictor" # tuple or list
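`SAM2VideoPredictor` is now part of the public `ultralytics.models.sam` namespace. A hedged construction sketch (the `overrides` keys and source arguments shown are assumptions based on the usual predictor pattern):

```python
from ultralytics.models.sam import SAM2VideoPredictor

# Assumed to follow the standard BasePredictor pattern of passing config overrides.
predictor = SAM2VideoPredictor(overrides=dict(task="segment", mode="predict", imgsz=1024, model="sam2_b.pt"))
results = predictor(source="path/to/video.mp4", points=[920, 470], labels=[1])
```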
@@ -148,7 +148,7 @@ class SAM(Model):
  verbose (bool): If True, prints the information to the console.
 
  Returns:
- (Tuple): A tuple containing the model's information (string representations of the model).
+ (tuple): A tuple containing the model's information (string representations of the model).
 
  Examples:
  >>> sam = SAM("sam_b.pt")
@@ -36,8 +36,6 @@ class SAMModel(nn.Module):
  image_encoder (ImageEncoderViT): Backbone for encoding images into embeddings.
  prompt_encoder (PromptEncoder): Encoder for various types of input prompts.
  mask_decoder (MaskDecoder): Predicts object masks from image and prompt embeddings.
- pixel_mean (torch.Tensor): Mean pixel values for image normalization, shape (3, 1, 1).
- pixel_std (torch.Tensor): Standard deviation values for image normalization, shape (3, 1, 1).
 
  Methods:
  __init__: Initializes the SAMModel with encoders, decoder, and normalization parameters.
@@ -349,8 +347,7 @@ class SAM2Model(torch.nn.Module):
  self.sam_prompt_embed_dim = self.hidden_dim
  self.sam_image_embedding_size = self.image_size // self.backbone_stride
 
- # build PromptEncoder and MaskDecoder from SAM
- # (their hyperparameters like `mask_in_chans=16` are from SAM code)
+ # Build PromptEncoder and MaskDecoder from SAM (hyperparameters like `mask_in_chans=16` are from SAM code)
  self.sam_prompt_encoder = PromptEncoder(
  embed_dim=self.sam_prompt_embed_dim,
  image_embedding_size=(
@@ -425,8 +422,8 @@ class SAM2Model(torch.nn.Module):
  low_res_multimasks: Tensor of shape (B, M, H*4, W*4) with SAM output mask logits.
  high_res_multimasks: Tensor of shape (B, M, H*16, W*16) with upsampled mask logits.
  ious: Tensor of shape (B, M) with estimated IoU for each output mask.
- low_res_masks: Tensor of shape (B, 1, H*4, W*4) with best low-resolution mask.
- high_res_masks: Tensor of shape (B, 1, H*16, W*16) with best high-resolution mask.
+ low_res_masks: Tensor of shape (B, 1, H*4, W*4) with the best low-resolution mask.
+ high_res_masks: Tensor of shape (B, 1, H*16, W*16) with the best high-resolution mask.
  obj_ptr: Tensor of shape (B, C) with object pointer vector for the output mask.
  object_score_logits: Tensor of shape (B,) with object score logits.
 
@@ -488,12 +485,7 @@ class SAM2Model(torch.nn.Module):
  boxes=None,
  masks=sam_mask_prompt,
  )
- (
- low_res_multimasks,
- ious,
- sam_output_tokens,
- object_score_logits,
- ) = self.sam_mask_decoder(
+ low_res_multimasks, ious, sam_output_tokens, object_score_logits = self.sam_mask_decoder(
  image_embeddings=backbone_features,
  image_pe=self.sam_prompt_encoder.get_dense_pe(),
  sparse_prompt_embeddings=sparse_embeddings,
@@ -505,13 +497,8 @@ class SAM2Model(torch.nn.Module):
  if self.pred_obj_scores:
  is_obj_appearing = object_score_logits > 0
 
- # Mask used for spatial memories is always a *hard* choice between obj and no obj,
- # consistent with the actual mask prediction
- low_res_multimasks = torch.where(
- is_obj_appearing[:, None, None],
- low_res_multimasks,
- NO_OBJ_SCORE,
- )
+ # Spatial memory mask is a *hard* choice between obj and no obj, consistent with actual mask prediction
+ low_res_multimasks = torch.where(is_obj_appearing[:, None, None], low_res_multimasks, NO_OBJ_SCORE)
 
  # convert masks from possibly bfloat16 (or float16) to float32
  # (older PyTorch versions before 2.1 don't support `interpolate` on bf16)
@@ -617,7 +604,6 @@ class SAM2Model(torch.nn.Module):
 
  def _prepare_backbone_features(self, backbone_out):
  """Prepares and flattens visual features from the image backbone output for further processing."""
- backbone_out = backbone_out.copy()
  assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
  assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
 
@@ -826,11 +812,7 @@ class SAM2Model(torch.nn.Module):
  mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc
  if self.sigmoid_bias_for_mem_enc != 0.0:
  mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc
- maskmem_out = self.memory_encoder(
- pix_feat,
- mask_for_mem,
- skip_mask_sigmoid=True, # sigmoid already applied
- )
+ maskmem_out = self.memory_encoder(pix_feat, mask_for_mem, skip_mask_sigmoid=True) # sigmoid already applied
  maskmem_features = maskmem_out["vision_features"]
  maskmem_pos_enc = maskmem_out["vision_pos_enc"]
  # add a no-object embedding to the spatial memory to indicate that the frame
@@ -965,16 +947,7 @@ class SAM2Model(torch.nn.Module):
  track_in_reverse,
  prev_sam_mask_logits,
  )
-
- (
- _,
- _,
- _,
- low_res_masks,
- high_res_masks,
- obj_ptr,
- object_score_logits,
- ) = sam_outputs
+ _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs
 
  current_out["pred_masks"] = low_res_masks
  current_out["pred_masks_high_res"] = high_res_masks
@@ -984,8 +957,7 @@ class SAM2Model(torch.nn.Module):
  # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
  current_out["object_score_logits"] = object_score_logits
 
- # Finally run the memory encoder on the predicted mask to encode
- # it into a new memory feature (that can be used in future frames)
+ # Run memory encoder on the predicted mask to encode it into a new memory feature (for use in future frames)
  self._encode_memory_in_output(
  current_vision_feats,
  feat_sizes,
@@ -1007,8 +979,9 @@ class SAM2Model(torch.nn.Module):
  and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num)
  )
 
- def _apply_non_overlapping_constraints(self, pred_masks):
- """Applies non-overlapping constraints to masks, keeping highest scoring object per location."""
+ @staticmethod
+ def _apply_non_overlapping_constraints(pred_masks):
+ """Applies non-overlapping constraints to masks, keeping the highest scoring object per location."""
  batch_size = pred_masks.size(0)
  if batch_size == 1:
  return pred_masks
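The constraint itself, now a `@staticmethod`, keeps only the highest-scoring object at each spatial location. An illustration of the idea outside the class:

```python
import torch

pred_masks = torch.randn(3, 1, 4, 4)  # (num_objects, 1, H, W) mask logits
max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)  # winning object index per pixel
batch_obj_inds = torch.arange(pred_masks.size(0))[:, None, None, None]
keep = max_obj_inds == batch_obj_inds
# Losing objects are clamped to a strongly negative logit, mirroring the library's clamp to -10.0.
pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
```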
@@ -1024,6 +997,10 @@ class SAM2Model(torch.nn.Module):
  pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))
  return pred_masks
 
+ def set_binarize(self, binarize=False):
+ """Set binarize for VideoPredictor."""
+ self.binarize_mask_from_pts_for_mem_enc = binarize
+
  def set_imgsz(self, imgsz):
  """
  Set image size to make model compatible with different image sizes.