dgenerate-ultralytics-headless 8.3.237__py3-none-any.whl → 8.3.240__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106)
  1. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/METADATA +2 -1
  2. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/RECORD +105 -106
  3. tests/test_exports.py +3 -1
  4. tests/test_python.py +2 -2
  5. tests/test_solutions.py +6 -6
  6. ultralytics/__init__.py +1 -1
  7. ultralytics/cfg/__init__.py +4 -4
  8. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  9. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  10. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  11. ultralytics/cfg/datasets/VOC.yaml +15 -16
  12. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  13. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  14. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  15. ultralytics/cfg/datasets/dota8.yaml +2 -2
  16. ultralytics/cfg/datasets/kitti.yaml +1 -1
  17. ultralytics/cfg/datasets/xView.yaml +16 -16
  18. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  19. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  20. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  21. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  22. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  23. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  24. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  25. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  26. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  27. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  28. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  29. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  30. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  31. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  32. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  33. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  34. ultralytics/data/augment.py +1 -1
  35. ultralytics/data/base.py +4 -2
  36. ultralytics/data/build.py +4 -4
  37. ultralytics/data/loaders.py +17 -12
  38. ultralytics/data/utils.py +4 -4
  39. ultralytics/engine/exporter.py +24 -16
  40. ultralytics/engine/predictor.py +5 -4
  41. ultralytics/engine/results.py +12 -13
  42. ultralytics/engine/trainer.py +2 -2
  43. ultralytics/engine/tuner.py +2 -3
  44. ultralytics/engine/validator.py +2 -2
  45. ultralytics/models/fastsam/model.py +2 -2
  46. ultralytics/models/fastsam/predict.py +2 -3
  47. ultralytics/models/fastsam/val.py +4 -4
  48. ultralytics/models/rtdetr/predict.py +2 -3
  49. ultralytics/models/rtdetr/val.py +5 -4
  50. ultralytics/models/sam/build.py +5 -5
  51. ultralytics/models/sam/build_sam3.py +9 -6
  52. ultralytics/models/sam/model.py +1 -1
  53. ultralytics/models/sam/modules/sam.py +10 -5
  54. ultralytics/models/sam/modules/utils.py +8 -3
  55. ultralytics/models/sam/predict.py +53 -62
  56. ultralytics/models/sam/sam3/encoder.py +4 -4
  57. ultralytics/models/sam/sam3/geometry_encoders.py +3 -3
  58. ultralytics/models/sam/sam3/necks.py +17 -17
  59. ultralytics/models/sam/sam3/sam3_image.py +3 -21
  60. ultralytics/models/sam/sam3/vl_combiner.py +1 -6
  61. ultralytics/models/yolo/classify/val.py +1 -1
  62. ultralytics/models/yolo/detect/train.py +1 -1
  63. ultralytics/models/yolo/detect/val.py +7 -7
  64. ultralytics/models/yolo/obb/val.py +1 -1
  65. ultralytics/models/yolo/pose/val.py +1 -1
  66. ultralytics/models/yolo/segment/val.py +1 -1
  67. ultralytics/nn/autobackend.py +9 -9
  68. ultralytics/nn/modules/block.py +1 -1
  69. ultralytics/nn/tasks.py +3 -3
  70. ultralytics/nn/text_model.py +2 -7
  71. ultralytics/solutions/ai_gym.py +1 -1
  72. ultralytics/solutions/analytics.py +6 -6
  73. ultralytics/solutions/config.py +1 -1
  74. ultralytics/solutions/distance_calculation.py +1 -1
  75. ultralytics/solutions/object_counter.py +1 -1
  76. ultralytics/solutions/object_cropper.py +3 -6
  77. ultralytics/solutions/parking_management.py +21 -17
  78. ultralytics/solutions/queue_management.py +5 -5
  79. ultralytics/solutions/region_counter.py +2 -2
  80. ultralytics/solutions/security_alarm.py +1 -1
  81. ultralytics/solutions/solutions.py +45 -22
  82. ultralytics/solutions/speed_estimation.py +1 -1
  83. ultralytics/trackers/basetrack.py +1 -1
  84. ultralytics/trackers/bot_sort.py +4 -3
  85. ultralytics/trackers/byte_tracker.py +4 -4
  86. ultralytics/trackers/utils/gmc.py +6 -7
  87. ultralytics/trackers/utils/kalman_filter.py +2 -1
  88. ultralytics/trackers/utils/matching.py +4 -3
  89. ultralytics/utils/__init__.py +12 -3
  90. ultralytics/utils/benchmarks.py +2 -2
  91. ultralytics/utils/callbacks/tensorboard.py +19 -25
  92. ultralytics/utils/checks.py +2 -1
  93. ultralytics/utils/downloads.py +1 -1
  94. ultralytics/utils/export/tensorflow.py +16 -2
  95. ultralytics/utils/files.py +13 -12
  96. ultralytics/utils/logger.py +62 -27
  97. ultralytics/utils/metrics.py +1 -1
  98. ultralytics/utils/ops.py +6 -6
  99. ultralytics/utils/patches.py +3 -3
  100. ultralytics/utils/plotting.py +18 -23
  101. ultralytics/utils/tuner.py +1 -1
  102. ultralytics/models/sam/sam3/tokenizer_ve.py +0 -242
  103. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/WHEEL +0 -0
  104. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/entry_points.txt +0 -0
  105. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/licenses/LICENSE +0 -0
  106. {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/top_level.txt +0 -0
@@ -110,10 +110,12 @@ class Predictor(BasePredictor):
 """Preprocess the input image for model inference.

 This method prepares the input image by applying transformations and normalization. It supports both
- torch.Tensor and list of np.ndarray as input formats.
+ torch.Tensor and list of np.ndarray as input formats. For OpenCV-loaded images, the input is typically BGR and
+ is converted to RGB during preprocessing.

 Args:
- im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or list of HWC numpy arrays.
+ im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or a list of HWC NumPy arrays.
+ NumPy arrays are expected to be in BGR order (as returned by OpenCV) and will be converted to RGB.

 Returns:
 (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.
@@ -534,7 +536,7 @@ class Predictor(BasePredictor):

 Args:
 image (str | np.ndarray): Path to the image file as a string, or a numpy array representing an image read by
- cv2.
+ cv2 (BGR channel order).

 Raises:
 AssertionError: If more than one image is attempted to be set.
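Note: the BGR clarification above only matters when feeding raw OpenCV frames to the predictor. A minimal usage sketch of what the updated docstring describes, assuming the high-level `ultralytics.SAM` wrapper and a locally available `sam2.1_b.pt` checkpoint (both names are illustrative, not taken from this diff):

```python
import cv2
from ultralytics import SAM

# cv2.imread returns an HWC uint8 array in BGR order; per the docstring above,
# the predictor's preprocess step converts BGR to RGB internally.
frame = cv2.imread("bus.jpg")  # hypothetical input image

model = SAM("sam2.1_b.pt")  # assumed checkpoint name
results = model(frame, points=[[450, 300]], labels=[1])  # single positive point prompt
```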
@@ -876,6 +878,7 @@ class SAM2VideoPredictor(SAM2Predictor):
 self.clear_non_cond_mem_around_input = False
 self.clear_non_cond_mem_for_multi_obj = False
 self.callbacks["on_predict_start"].append(self.init_state)
+ self.clear_non_cond_mem = True  # Whether to clear non-conditioning memory periodically

 def get_model(self):
 """Retrieve and configure the model with binarization enabled.
@@ -950,6 +953,7 @@ class SAM2VideoPredictor(SAM2Predictor):
 run_mem_encoder=True,
 )
 output_dict[storage_key][frame] = current_out
+ self._prune_non_cond_memory(frame)
 # Create slices of per-object outputs for subsequent interaction with each
 # individual object after tracking.
 self._add_output_per_object(frame, current_out, storage_key)
@@ -1244,14 +1248,11 @@ class SAM2VideoPredictor(SAM2Predictor):
 - If `batch` is greater than 1, the features are expanded to fit the batch size.
 - The method leverages the model's `_prepare_backbone_features` method to prepare the backbone features.
 """
- backbone_out = self.model.forward_image(im)
- if batch > 1:  # expand features if there's more than one prompt
- for i, feat in enumerate(backbone_out["backbone_fpn"]):
- backbone_out["backbone_fpn"][i] = feat.expand(batch, -1, -1, -1)
- for i, pos in enumerate(backbone_out["vision_pos_enc"]):
- pos = pos.expand(batch, -1, -1, -1)
- backbone_out["vision_pos_enc"][i] = pos
- _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
+ # check if there's precomputed backbone output
+ backbone_out = getattr(self, "backbone_out", None)
+ if backbone_out is None:
+ backbone_out = self.model.forward_image(im)
+ _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out, batch=batch)
 return vis_feats, vis_pos_embed, feat_sizes

 def _obj_id_to_idx(self, obj_id, inference_state: dict[str, Any] | None = None):
@@ -1831,6 +1832,25 @@ class SAM2VideoPredictor(SAM2Predictor):
 inference_state["frames_already_tracked"].clear()
 inference_state["first_ann_frame_idx"] = None

+ def _prune_non_cond_memory(self, frame_idx, inference_state=None):
+ """Prune old non-conditioning frames to bound memory usage."""
+ if not self.clear_non_cond_mem:
+ return
+ inference_state = inference_state or self.inference_state
+
+ # Determine window size
+ min_frame = frame_idx - self.model.num_maskmem * self.model.memory_temporal_stride_for_eval
+ output_dict = inference_state["output_dict"]
+
+ # Prune global non_cond_frame_outputs
+ for f in [k for k in output_dict["non_cond_frame_outputs"] if k < min_frame]:
+ output_dict["non_cond_frame_outputs"].pop(f, None)
+
+ # Prune per-object non_cond_frame_outputs
+ for obj_output_dict in inference_state.get("output_dict_per_obj", {}).values():
+ for f in [k for k in obj_output_dict["non_cond_frame_outputs"] if k < min_frame]:
+ obj_output_dict["non_cond_frame_outputs"].pop(f, None)
+

 class SAM2DynamicInteractivePredictor(SAM2Predictor):
 """SAM2DynamicInteractivePredictor extends SAM2Predictor to support dynamic interactions with video frames or a
@@ -2055,11 +2075,12 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 self.memory_bank.append(consolidated_out)

 def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
- """Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
- prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features
- for all objects in the image. If there is no memory, it will directly add a no-memory embedding to the
- current vision features. If there is memory, it will use the memory features from previous frames to
- condition the current vision features using a transformer attention mechanism.
+ """Prepare memory-conditioned features for the current image state.
+
+ If ``obj_idx`` is provided, features are prepared for a specific prompted object in the image. If ``obj_idx`` is
+ None, features are prepared for all objects. If no memory is available, a no-memory embedding is added to the
+ current vision features. Otherwise, memory from previous frames is used to condition the current vision features
+ via a transformer attention mechanism.

 Args:
 obj_idx (int | None): The index of the object for which to prepare the features.
@@ -2068,8 +2089,8 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 pix_feat_with_mem (torch.Tensor): The memory-conditioned pixel features.
 """
 if len(self.memory_bank) == 0 or isinstance(obj_idx, int):
- # for initial conditioning frames with, encode them without using any previous memory
- # directly add no-mem embedding (instead of using the transformer encoder)
+ # For initial conditioning frames, encode without using any previous memory.
+ # Directly add the no-memory embedding (instead of using the transformer encoder).
 pix_feat_with_mem = self.vision_feats[-1] + self.model.no_mem_embed
 else:
 # for inference frames, use the memory features from previous frames
@@ -2081,7 +2102,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 memory_pos=memory_pos_embed,
 num_obj_ptr_tokens=0, # num_obj_ptr_tokens
 )
- # reshape the output (HW)BC => BCHW
+ # Reshape output (HW)BC => BCHW
 return pix_feat_with_mem.permute(1, 2, 0).view(
 self._max_obj_num,
 self.model.memory_attention.d_model,
@@ -2145,9 +2166,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 pix_feat = pix_feat.view(-1, self.model.memory_attention.d_model, *self.feat_sizes[-1])
 _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._use_mask_as_output(mask)
 else:
- # fused the visual feature with previous memory features in the memory bank
+ # Fuse visual features with previous memory features in the memory bank.
 pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
- # calculate the first feature if adding obj_idx exists(means adding prompts)
+ # If ``obj_idx`` is provided (i.e., prompts are being added), keep only the first feature map.
 pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
 _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
 backbone_features=pix_feat_with_mem,
@@ -2182,7 +2203,7 @@ class SAM3Predictor(SAM2Predictor):
 self.std = torch.tensor([127.5, 127.5, 127.5]).view(-1, 1, 1).to(self.device)

 def get_model(self):
- """Retrieve and initialize the Segment Anything Model 2 (SAM2) for image segmentation tasks."""
+ """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
 from .build_sam3 import build_interactive_sam3 # slow import

 return build_interactive_sam3(self.args.model, compile=self.args.compile)
@@ -2191,16 +2212,11 @@
 class SAM3SemanticPredictor(SAM3Predictor):
 """Segment Anything Model 3 (SAM3) Predictor for image segmentation tasks."""

- def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None, bpe_path=None):
- """Initialize the SAM3SemanticPredictor with configuration and optional overrides."""
- super().__init__(cfg, overrides, _callbacks)
- self.bpe_path = bpe_path
-
 def get_model(self):
 """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
 from .build_sam3 import build_sam3_image_model # slow import

- return build_sam3_image_model(self.args.model, bpe_path=self.bpe_path, compile=self.args.compile)
+ return build_sam3_image_model(self.args.model, compile=self.args.compile)

 @smart_inference_mode()
 def get_im_features(self, im):
@@ -2428,6 +2444,7 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):
 inference_state=inference_state,
 )
 output_dict[storage_key][frame] = current_out
+ self._prune_non_cond_memory(frame, inference_state=inference_state)
 # Create slices of per-object outputs for subsequent interaction with each
 # individual object after tracking.
 self._add_output_per_object(frame, current_out, storage_key, inference_state=inference_state)
@@ -2437,24 +2454,6 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):

 return obj_ids, pred_masks, obj_scores

- def get_im_features(self, im, batch=1):
- """A wrapper to get image features, supporting pre-extracted backbone outputs."""
- if getattr(self, "backbone_out", None):
- backbone_out = self.backbone_out
- if batch > 1: # expand features if there's more than one prompt
- backbone_out = {
- "backbone_fpn": backbone_out["backbone_fpn"].copy(),
- "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
- }
- for i, feat in enumerate(backbone_out["backbone_fpn"]):
- backbone_out["backbone_fpn"][i] = feat.expand(batch, -1, -1, -1)
- for i, pos in enumerate(backbone_out["vision_pos_enc"]):
- pos = pos.expand(batch, -1, -1, -1)
- backbone_out["vision_pos_enc"][i] = pos
- _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
- return vis_feats, vis_pos_embed, feat_sizes
- return super().get_im_features(im, batch)
-

 class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 """Segment Anything Model 3 (SAM3) Video Semantic Predictor."""
@@ -2479,7 +2478,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 cfg=DEFAULT_CFG,
 overrides=None,
 _callbacks=None,
- bpe_path="bpe_simple_vocab_16e6.txt.gz",
 # prob threshold for detection outputs -- only keep detections above this threshold
 # enters NMS and det-to-track matching
 score_threshold_detection=0.5,
@@ -2499,14 +2497,12 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 hotstart_delay=0,
 hotstart_unmatch_thresh=3,
 hotstart_dup_thresh=3,
- # Whether to suppress masks only within hotstart. If False, we can suppress masks even if they start before hotstart period.
- suppress_unmatched_only_within_hotstart=True,
- init_trk_keep_alive=0,
- max_trk_keep_alive=8,
+ init_trk_keep_alive=30,
+ max_trk_keep_alive=30,
 min_trk_keep_alive=-4,
 # Threshold for suppressing overlapping objects based on recent occlusion
 suppress_overlapping_based_on_recent_occlusion_threshold=0.0,
- decrease_trk_keep_alive_for_empty_masklets=False,
+ decrease_trk_keep_alive_for_empty_masklets=True,
 o2o_matching_masklets_enable=False, # Enable hungarian matching to match existing masklets
 suppress_det_close_to_boundary=False,
 fill_hole_area=16,
@@ -2523,7 +2519,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 reconstruction_bbox_det_score=0.0,
 ):
 """Initialize the SAM3VideoSemanticPredictor with configuration and optional overrides."""
- super().__init__(cfg, overrides, _callbacks, bpe_path=bpe_path)
+ super().__init__(cfg, overrides, _callbacks)
 self.score_threshold_detection = score_threshold_detection
 self.det_nms_thresh = det_nms_thresh
 self.assoc_iou_thresh = assoc_iou_thresh
@@ -2537,7 +2533,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 self.hotstart_delay = hotstart_delay
 self.hotstart_unmatch_thresh = hotstart_unmatch_thresh
 self.hotstart_dup_thresh = hotstart_dup_thresh
- self.suppress_unmatched_only_within_hotstart = suppress_unmatched_only_within_hotstart
 self.init_trk_keep_alive = init_trk_keep_alive
 self.max_trk_keep_alive = max_trk_keep_alive
 self.min_trk_keep_alive = min_trk_keep_alive
@@ -2662,7 +2657,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 ) > 0

 # names = getattr(self.model, "names", [str(i) for i in range(pred_scores.shape[0])])
- names = dict(enumerate(str(i) for i in range(pred_masks.shape[0])))
+ names = dict(enumerate(str(i) for i in range(pred_boxes.shape[0])))
 results = []
 for masks, boxes, orig_img, img_path in zip([pred_masks], [pred_boxes], orig_imgs, self.batch[0]):
 results.append(Results(orig_img, path=img_path, names=names, masks=masks, boxes=boxes))
@@ -2713,7 +2708,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 metadata = tracker_metadata_new["metadata"]
 removed_obj_ids = metadata["removed_obj_ids"]
 out["removed_obj_ids"] = removed_obj_ids
- out["suppressed_obj_ids"] = metadata["suppressed_obj_ids"][frame_idx]
 out["frame_stats"] = frame_stats
 if self.masklet_confirmation_enable:
 status = metadata["masklet_confirmation"]["status"]
@@ -3621,7 +3615,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 overlap_pair_to_frame_inds = metadata["overlap_pair_to_frame_inds"]
 # removed_obj_ids: object IDs that are suppressed via hot-start
 removed_obj_ids = metadata["removed_obj_ids"]
- suppressed_obj_ids = metadata["suppressed_obj_ids"][frame_idx]

 obj_ids_newly_removed = set() # object IDs to be newly removed on this frame
 hotstart_diff = frame_idx - self.hotstart_delay if not reverse else frame_idx + self.hotstart_delay
@@ -3671,12 +3664,12 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 )
 if (
 trk_keep_alive[obj_id] <= 0 # Object has not been matched for too long
- and not self.suppress_unmatched_only_within_hotstart
 and obj_id not in removed_obj_ids
 and obj_id not in obj_ids_newly_removed
 ):
- LOGGER.debug(f"Suppressing object {obj_id} at frame {frame_idx}, due to being unmatched")
- suppressed_obj_ids.add(obj_id)
+ LOGGER.debug(f"Removing object {obj_id} at frame {frame_idx}, due to being unmatched")
+ # directly removed the object instead of suppressing it
+ obj_ids_newly_removed.add(obj_id)

 # Step 3: removed tracks that overlaps with another track for `hotstart_dup_thresh` frames
 # a) find overlaps tracks -- we consider overlap if they match to the same detection
@@ -3855,8 +3848,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
 "trk_keep_alive": defaultdict(int), # This is used only for object suppression not for removal
 "overlap_pair_to_frame_inds": defaultdict(list),
 "removed_obj_ids": set(),
- # frame_idx --> set of objects with suppressed outputs, but still continue to be tracked
- "suppressed_obj_ids": defaultdict(set),
 }
 if self.masklet_confirmation_enable:
 # all the following are np.ndarray with the same shape as `obj_ids_all_gpu`
@@ -171,7 +171,7 @@ class TransformerEncoderLayer(nn.Module):
 assert tgt.shape[0] % 2 == 0
 other_tgt = tgt[tgt.shape[0] // 2 :]
 tgt = tgt[: tgt.shape[0] // 2]
- tgt2 = self.norm1(tgt)
+ tgt2 = self.norm1(tgt).contiguous()
 q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
 tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
 tgt = tgt + self.dropout1(tgt2)
@@ -179,13 +179,13 @@ class TransformerEncoderLayer(nn.Module):
 # Recombine
 tgt = torch.cat((tgt, other_tgt), dim=0)
 tgt2 = self.norm2(tgt)
+ memory = memory.to(tgt2.dtype).contiguous()
 tgt2 = self.cross_attn_image(
 query=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
- key=memory.to(tgt2.dtype) + pos if self.pos_enc_at_cross_attn_keys else memory.to(tgt2.dtype),
- value=memory.to(tgt2.dtype),
+ key=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+ value=memory,
 attn_mask=memory_mask,
 key_padding_mask=memory_key_padding_mask,
- # attn_bias=attn_bias,
 )[0]
 tgt = tgt + self.dropout2(tgt2)
 tgt2 = self.norm3(tgt)
@@ -42,8 +42,8 @@ def concat_padded_sequences(seq1, mask1, seq2, mask2, return_index: bool = False
 assert seq1_length == mask1.size(1)
 assert seq2_length == mask2.size(1)

- torch._assert_async(is_right_padded(mask1))
- torch._assert_async(is_right_padded(mask2))
+ torch._assert(is_right_padded(mask1), "Mask is not right padded")
+ torch._assert(is_right_padded(mask2), "Mask is not right padded")

 actual_seq1_lengths = (~mask1).sum(dim=-1)
 actual_seq2_lengths = (~mask2).sum(dim=-1)
@@ -288,7 +288,7 @@ class SequenceGeometryEncoder(nn.Module):
 # Convert boxes to xyxy format and denormalize
 boxes_xyxy = xywh2xyxy(boxes.to(img_feats.dtype))
 scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype)
- scale = scale.pin_memory().to(device=boxes_xyxy.device, non_blocking=True)
+ scale = scale.to(device=boxes_xyxy.device, non_blocking=True)
 scale = scale.view(1, 1, 4)
 boxes_xyxy = boxes_xyxy * scale
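Dropping `pin_memory()` only changes how the scale tensor is staged for the device copy; the box math is unchanged. A quick sketch of the denormalization step with illustrative values and an assumed 1008x1008 input resolution:

```python
import torch

from ultralytics.utils.ops import xywh2xyxy

H = W = 1008                                    # assumed input resolution
boxes = torch.tensor([[[0.5, 0.5, 0.2, 0.4]]])  # normalized xywh, shape (B, N, 4)
boxes_xyxy = xywh2xyxy(boxes)                   # still normalized: [[[0.4, 0.3, 0.6, 0.7]]]
scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype).view(1, 1, 4)
print(boxes_xyxy * scale)                       # pixel-space xyxy: [[[403.2, 302.4, 604.8, 705.6]]]
```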
@@ -103,27 +103,27 @@ class Sam3DualViTDetNeck(nn.Module):

 def forward(
 self, tensor_list: list[torch.Tensor]
- ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
- """Get the feature maps and positional encodings from the neck."""
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor] | None, list[torch.Tensor] | None]:
+ """Get feature maps and positional encodings from the neck."""
 xs = self.trunk(tensor_list)
- sam3_out, sam3_pos = [], []
- sam2_out, sam2_pos = None, None
- if self.sam2_convs is not None:
- sam2_out, sam2_pos = [], []
 x = xs[-1] # simpleFPN
- for i in range(len(self.convs)):
- sam3_x_out = self.convs[i](x)
- sam3_pos_out = self.position_encoding(sam3_x_out).to(sam3_x_out.dtype)
- sam3_out.append(sam3_x_out)
- sam3_pos.append(sam3_pos_out)
-
- if self.sam2_convs is not None:
- sam2_x_out = self.sam2_convs[i](x)
- sam2_pos_out = self.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
- sam2_out.append(sam2_x_out)
- sam2_pos.append(sam2_pos_out)
+ sam3_out, sam3_pos = self.sam_forward_feature_levels(x, self.convs)
+ if self.sam2_convs is None:
+ return sam3_out, sam3_pos, None, None
+ sam2_out, sam2_pos = self.sam_forward_feature_levels(x, self.sam2_convs)
 return sam3_out, sam3_pos, sam2_out, sam2_pos

+ def sam_forward_feature_levels(
+ self, x: torch.Tensor, convs: nn.ModuleList
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+ """Run neck convolutions and compute positional encodings for each feature level."""
+ outs, poss = [], []
+ for conv in convs:
+ feat = conv(x)
+ outs.append(feat)
+ poss.append(self.position_encoding(feat).to(feat.dtype))
+ return outs, poss
+
 def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
 """Set the image size for the trunk backbone."""
 self.trunk.set_imgsz(imgsz)
@@ -11,6 +11,7 @@ import torch
 from ultralytics.nn.modules.utils import inverse_sigmoid
 from ultralytics.utils.ops import xywh2xyxy

+ from ..modules.sam import SAM2Model
 from .geometry_encoders import Prompt
 from .vl_combiner import SAM3VLBackbone

@@ -93,25 +94,6 @@ class SAM3SemanticModel(torch.nn.Module):
 self.text_embeddings = {}
 self.names = []

- def _prepare_backbone_features(self, backbone_out, num_prompts=1):
- """Prepare and flatten visual features from the image backbone output for further processing."""
- if num_prompts > 1: # expand features if there's more than one prompt
- for i, feat in enumerate(backbone_out["backbone_fpn"]):
- backbone_out["backbone_fpn"][i] = feat.expand(num_prompts, -1, -1, -1)
- for i, pos in enumerate(backbone_out["vision_pos_enc"]):
- pos = pos.expand(num_prompts, -1, -1, -1)
- backbone_out["vision_pos_enc"][i] = pos
- assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
- assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
-
- feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
- vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
- feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
- # flatten NxCxHxW to HWxNxC
- vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
- vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
- return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
-
 def _encode_prompt(
 self,
 img_feats,
@@ -304,8 +286,8 @@ class SAM3SemanticModel(torch.nn.Module):
 self, backbone_out: dict[str, torch.Tensor], text_ids: torch.Tensor, geometric_prompt: Prompt = None
 ):
 """Forward pass for grounding (detection + segmentation) given input images and text."""
- backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = self._prepare_backbone_features(
- backbone_out, num_prompts=len(text_ids)
+ backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = SAM2Model._prepare_backbone_features(
+ self, backbone_out, batch=len(text_ids)
 )
 backbone_out.update({k: v for k, v in self.text_embeddings.items()})
 with torch.profiler.record_function("SAM3Image._encode_prompt"):
@@ -110,15 +110,10 @@ class SAM3VLBackbone(nn.Module):
 def forward_image_sam2(self, samples: torch.Tensor):
 """Forward pass of the vision backbone to get SAM2 features only."""
 xs = self.vision_backbone.trunk(samples)
- sam2_features, sam2_pos = [], []
 x = xs[-1] # simpleFPN

 assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
- for i in range(len(self.vision_backbone.sam2_convs)):
- sam2_x_out = self.vision_backbone.sam2_convs[i](x)
- sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
- sam2_features.append(sam2_x_out)
- sam2_pos.append(sam2_pos_out)
+ sam2_features, sam2_pos = self.vision_backbone.sam_forward_feature_levels(x, self.vision_backbone.sam2_convs)

 if self.scalp > 0:
 # Discard the lowest resolution features
@@ -57,7 +57,7 @@ class ClassificationValidator(BaseValidator):
 """Initialize ClassificationValidator with dataloader, save directory, and other parameters.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
 save_dir (str | Path, optional): Directory to save results.
 args (dict, optional): Arguments containing model and validation configuration.
 _callbacks (list, optional): List of callback functions to be called during validation.
@@ -53,7 +53,7 @@ class DetectionTrainer(BaseTrainer):
 """

 def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
- """Initialize a DetectionTrainer object for training YOLO object detection model training.
+ """Initialize a DetectionTrainer object for training YOLO object detection models.

 Args:
 cfg (dict, optional): Default configuration dictionary containing training parameters.
@@ -46,7 +46,7 @@ class DetectionValidator(BaseValidator):
 """Initialize detection validator with necessary variables and settings.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
 save_dir (Path, optional): Directory to save results.
 args (dict[str, Any], optional): Arguments for the validator.
 _callbacks (list[Any], optional): List of callback functions.
@@ -256,7 +256,7 @@ class DetectionValidator(BaseValidator):
 pf = "%22s" + "%11i" * 2 + "%11.3g" * len(self.metrics.keys) # print format
 LOGGER.info(pf % ("all", self.seen, self.metrics.nt_per_class.sum(), *self.metrics.mean_results()))
 if self.metrics.nt_per_class.sum() == 0:
- LOGGER.warning(f"no labels found in {self.args.task} set, can not compute metrics without labels")
+ LOGGER.warning(f"no labels found in {self.args.task} set, cannot compute metrics without labels")

 # Print results per class
 if self.args.verbose and not self.training and self.nc > 1 and len(self.metrics.stats):
@@ -308,7 +308,7 @@ class DetectionValidator(BaseValidator):
 batch_size (int): Size of each batch.

 Returns:
- (torch.utils.data.DataLoader): Dataloader for validation.
+ (torch.utils.data.DataLoader): DataLoader for validation.
 """
 dataset = self.build_dataset(dataset_path, batch=batch_size, mode="val")
 return build_dataloader(
@@ -460,11 +460,11 @@ class DetectionValidator(BaseValidator):

 Args:
 stats (dict[str, Any]): Dictionary to store computed metrics and statistics.
- pred_json (str | Path]): Path to JSON file containing predictions in COCO format.
- anno_json (str | Path]): Path to JSON file containing ground truth annotations in COCO format.
- iou_types (str | list[str]]): IoU type(s) for evaluation. Can be single string or list of strings. Common
+ pred_json (str | Path): Path to JSON file containing predictions in COCO format.
+ anno_json (str | Path): Path to JSON file containing ground truth annotations in COCO format.
+ iou_types (str | list[str]): IoU type(s) for evaluation. Can be single string or list of strings. Common
 values include "bbox", "segm", "keypoints". Defaults to "bbox".
- suffix (str | list[str]]): Suffix to append to metric names in stats dictionary. Should correspond to
+ suffix (str | list[str]): Suffix to append to metric names in stats dictionary. Should correspond to
 iou_types if multiple types provided. Defaults to "Box".

 Returns:
@@ -50,7 +50,7 @@ class OBBValidator(DetectionValidator):
 extends the DetectionValidator class and configures it specifically for the OBB task.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to be used for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
 save_dir (str | Path, optional): Directory to save results.
 args (dict | SimpleNamespace, optional): Arguments containing validation parameters.
 _callbacks (list, optional): List of callback functions to be called during validation.
@@ -59,7 +59,7 @@ class PoseValidator(DetectionValidator):
 specialized metrics for pose evaluation.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to be used for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
 save_dir (Path | str, optional): Directory to save results.
 args (dict, optional): Arguments for the validator including task set to "pose".
 _callbacks (list, optional): List of callback functions to be executed during validation.
@@ -39,7 +39,7 @@ class SegmentationValidator(DetectionValidator):
 """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics.

 Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
 save_dir (Path, optional): Directory to save results.
 args (namespace, optional): Arguments for the validator.
 _callbacks (list, optional): List of callback functions.
@@ -127,7 +127,7 @@ class AutoBackend(nn.Module):

 Methods:
 forward: Run inference on an input image.
- from_numpy: Convert numpy array to tensor.
+ from_numpy: Convert NumPy arrays to tensors on the model device.
 warmup: Warm up the model with a dummy input.
 _model_type: Determine the model type from file path.

@@ -182,7 +182,7 @@ class AutoBackend(nn.Module):
 triton,
 ) = self._model_type("" if nn_module else model)
 fp16 &= pt or jit or onnx or xml or engine or nn_module or triton # FP16
- nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn # BHWC formats (vs torch BCWH)
+ nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn # BHWC formats (vs torch BCHW)
 stride, ch = 32, 3 # default stride and channels
 end2end, dynamic = False, False
 metadata, task = None, None
@@ -894,14 +894,14 @@ class AutoBackend(nn.Module):
 else:
 return self.from_numpy(y)

- def from_numpy(self, x: np.ndarray) -> torch.Tensor:
- """Convert a numpy array to a tensor.
+ def from_numpy(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+ """Convert a NumPy array to a torch tensor on the model device.

 Args:
- x (np.ndarray): The array to be converted.
+ x (np.ndarray | torch.Tensor): Input array or tensor.

 Returns:
- (torch.Tensor): The converted tensor
+ (torch.Tensor): Tensor on `self.device`.
 """
 return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x

@@ -909,7 +909,7 @@
 """Warm up the model by running one forward pass with a dummy input.

 Args:
- imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width)
+ imgsz (tuple[int, int, int, int]): Dummy input shape in (batch, channels, height, width) format.
 """
 warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
 if any(warmup_types) and (self.device.type != "cpu" or self.triton):
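A minimal sketch of how `warmup()` is typically driven, assuming a local `yolo11n.pt` checkpoint (the file name and constructor arguments are illustrative, not part of this diff):

```python
import torch

from ultralytics.nn.autobackend import AutoBackend

backend = AutoBackend("yolo11n.pt")       # assumed local checkpoint
backend.warmup(imgsz=(1, 3, 640, 640))    # dummy BCHW pass; skipped on CPU per the guard above
y = backend(torch.zeros(1, 3, 640, 640))  # NumPy outputs come back as tensors via from_numpy
```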
@@ -931,8 +931,8 @@
 (list[bool]): List of booleans indicating the model type.

 Examples:
- >>> model = AutoBackend(model="path/to/model.onnx")
- >>> model_type = model._model_type() # returns "onnx"
+ >>> types = AutoBackend._model_type("path/to/model.onnx")
+ >>> assert types[2] # onnx
 """
 from ultralytics.engine.exporter import export_formats

@@ -1812,7 +1812,7 @@ class A2C2f(nn.Module):
 """
 super().__init__()
 c_ = int(c2 * e) # hidden channels
- assert c_ % 32 == 0, "Dimension of ABlock be a multiple of 32."
+ assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."

 self.cv1 = Conv(c1, c_, 1, 1)
 self.cv2 = Conv((1 + n) * c_, c2, 1)
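The corrected assertion message reflects a real constraint: the hidden width `c_ = int(c2 * e)` must be divisible by 32. A quick arithmetic check with illustrative values:

```python
c2, e = 512, 0.5
c_ = int(c2 * e)  # 256 -> 256 % 32 == 0, assertion passes
c2, e = 512, 0.3
c_ = int(c2 * e)  # 153 -> 153 % 32 != 0, trips "Dimension of ABlock must be a multiple of 32."
```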
ultralytics/nn/tasks.py CHANGED
@@ -866,7 +866,7 @@ class WorldModel(DetectionModel):
 self.model[-1].nc = len(text)

 def get_text_pe(self, text, batch=80, cache_clip_model=True):
- """Set classes in advance so that model could do offline-inference without clip model.
+ """Get text positional embeddings for offline inference without CLIP model.

 Args:
 text (list[str]): List of class names.
@@ -987,13 +987,13 @@ class YOLOEModel(DetectionModel):

 @smart_inference_mode()
 def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
- """Set classes in advance so that model could do offline-inference without clip model.
+ """Get text positional embeddings for offline inference without CLIP model.

 Args:
 text (list[str]): List of class names.
 batch (int): Batch size for processing text tokens.
 cache_clip_model (bool): Whether to cache the CLIP model.
- without_reprta (bool): Whether to return text embeddings cooperated with reprta module.
+ without_reprta (bool): Whether to return text embeddings without reprta module processing.

 Returns:
 (torch.Tensor): Text positional embeddings.
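The reworded docstrings above describe the offline text-embedding path. A short usage sketch, assuming the documented YOLOE interface and a local `yoloe-11s-seg.pt` checkpoint (names are illustrative):

```python
from ultralytics import YOLOE

model = YOLOE("yoloe-11s-seg.pt")                   # assumed checkpoint
names = ["person", "bus"]
model.set_classes(names, model.get_text_pe(names))  # cache text embeddings for offline inference
results = model.predict("bus.jpg")                  # hypothetical image
```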