dgenerate-ultralytics-headless 8.3.237__py3-none-any.whl → 8.3.240__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/METADATA +2 -1
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/RECORD +105 -106
- tests/test_exports.py +3 -1
- tests/test_python.py +2 -2
- tests/test_solutions.py +6 -6
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +4 -4
- ultralytics/cfg/datasets/Argoverse.yaml +7 -6
- ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
- ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
- ultralytics/cfg/datasets/VOC.yaml +15 -16
- ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
- ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
- ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
- ultralytics/cfg/datasets/dota8.yaml +2 -2
- ultralytics/cfg/datasets/kitti.yaml +1 -1
- ultralytics/cfg/datasets/xView.yaml +16 -16
- ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
- ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
- ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
- ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
- ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
- ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
- ultralytics/data/augment.py +1 -1
- ultralytics/data/base.py +4 -2
- ultralytics/data/build.py +4 -4
- ultralytics/data/loaders.py +17 -12
- ultralytics/data/utils.py +4 -4
- ultralytics/engine/exporter.py +24 -16
- ultralytics/engine/predictor.py +5 -4
- ultralytics/engine/results.py +12 -13
- ultralytics/engine/trainer.py +2 -2
- ultralytics/engine/tuner.py +2 -3
- ultralytics/engine/validator.py +2 -2
- ultralytics/models/fastsam/model.py +2 -2
- ultralytics/models/fastsam/predict.py +2 -3
- ultralytics/models/fastsam/val.py +4 -4
- ultralytics/models/rtdetr/predict.py +2 -3
- ultralytics/models/rtdetr/val.py +5 -4
- ultralytics/models/sam/build.py +5 -5
- ultralytics/models/sam/build_sam3.py +9 -6
- ultralytics/models/sam/model.py +1 -1
- ultralytics/models/sam/modules/sam.py +10 -5
- ultralytics/models/sam/modules/utils.py +8 -3
- ultralytics/models/sam/predict.py +53 -62
- ultralytics/models/sam/sam3/encoder.py +4 -4
- ultralytics/models/sam/sam3/geometry_encoders.py +3 -3
- ultralytics/models/sam/sam3/necks.py +17 -17
- ultralytics/models/sam/sam3/sam3_image.py +3 -21
- ultralytics/models/sam/sam3/vl_combiner.py +1 -6
- ultralytics/models/yolo/classify/val.py +1 -1
- ultralytics/models/yolo/detect/train.py +1 -1
- ultralytics/models/yolo/detect/val.py +7 -7
- ultralytics/models/yolo/obb/val.py +1 -1
- ultralytics/models/yolo/pose/val.py +1 -1
- ultralytics/models/yolo/segment/val.py +1 -1
- ultralytics/nn/autobackend.py +9 -9
- ultralytics/nn/modules/block.py +1 -1
- ultralytics/nn/tasks.py +3 -3
- ultralytics/nn/text_model.py +2 -7
- ultralytics/solutions/ai_gym.py +1 -1
- ultralytics/solutions/analytics.py +6 -6
- ultralytics/solutions/config.py +1 -1
- ultralytics/solutions/distance_calculation.py +1 -1
- ultralytics/solutions/object_counter.py +1 -1
- ultralytics/solutions/object_cropper.py +3 -6
- ultralytics/solutions/parking_management.py +21 -17
- ultralytics/solutions/queue_management.py +5 -5
- ultralytics/solutions/region_counter.py +2 -2
- ultralytics/solutions/security_alarm.py +1 -1
- ultralytics/solutions/solutions.py +45 -22
- ultralytics/solutions/speed_estimation.py +1 -1
- ultralytics/trackers/basetrack.py +1 -1
- ultralytics/trackers/bot_sort.py +4 -3
- ultralytics/trackers/byte_tracker.py +4 -4
- ultralytics/trackers/utils/gmc.py +6 -7
- ultralytics/trackers/utils/kalman_filter.py +2 -1
- ultralytics/trackers/utils/matching.py +4 -3
- ultralytics/utils/__init__.py +12 -3
- ultralytics/utils/benchmarks.py +2 -2
- ultralytics/utils/callbacks/tensorboard.py +19 -25
- ultralytics/utils/checks.py +2 -1
- ultralytics/utils/downloads.py +1 -1
- ultralytics/utils/export/tensorflow.py +16 -2
- ultralytics/utils/files.py +13 -12
- ultralytics/utils/logger.py +62 -27
- ultralytics/utils/metrics.py +1 -1
- ultralytics/utils/ops.py +6 -6
- ultralytics/utils/patches.py +3 -3
- ultralytics/utils/plotting.py +18 -23
- ultralytics/utils/tuner.py +1 -1
- ultralytics/models/sam/sam3/tokenizer_ve.py +0 -242
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.237.dist-info → dgenerate_ultralytics_headless-8.3.240.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/predict.py CHANGED

@@ -110,10 +110,12 @@ class Predictor(BasePredictor):
         """Preprocess the input image for model inference.

         This method prepares the input image by applying transformations and normalization. It supports both
-        torch.Tensor and list of np.ndarray as input formats.
+        torch.Tensor and list of np.ndarray as input formats. For OpenCV-loaded images, the input is typically BGR and
+        is converted to RGB during preprocessing.

         Args:
-            im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or list of HWC
+            im (torch.Tensor | list[np.ndarray]): Input image(s) in BCHW tensor format or a list of HWC NumPy arrays.
+                NumPy arrays are expected to be in BGR order (as returned by OpenCV) and will be converted to RGB.

         Returns:
             (torch.Tensor): The preprocessed image tensor, normalized and converted to the appropriate dtype.

@@ -534,7 +536,7 @@ class Predictor(BasePredictor):

         Args:
             image (str | np.ndarray): Path to the image file as a string, or a numpy array representing an image read by
-                cv2.
+                cv2 (BGR channel order).

         Raises:
             AssertionError: If more than one image is attempted to be set.
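The two docstring updates above both document the same behavior: images read with OpenCV arrive in BGR order and are flipped to RGB inside preprocessing. A minimal usage sketch (not part of the diff; the checkpoint name is an assumption for illustration):

```python
# Pass an OpenCV-loaded BGR frame to the SAM predictor; preprocess handles BGR -> RGB.
import cv2
from ultralytics import SAM

model = SAM("sam2.1_b.pt")           # hypothetical local checkpoint name
frame = cv2.imread("image.jpg")      # HWC, BGR order as returned by OpenCV
results = model(frame, bboxes=[100, 100, 400, 400])  # conversion to RGB happens internally
```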
@@ -876,6 +878,7 @@ class SAM2VideoPredictor(SAM2Predictor):
         self.clear_non_cond_mem_around_input = False
         self.clear_non_cond_mem_for_multi_obj = False
         self.callbacks["on_predict_start"].append(self.init_state)
+        self.clear_non_cond_mem = True  # Whether to clear non-conditioning memory periodically

     def get_model(self):
         """Retrieve and configure the model with binarization enabled.

@@ -950,6 +953,7 @@ class SAM2VideoPredictor(SAM2Predictor):
             run_mem_encoder=True,
         )
         output_dict[storage_key][frame] = current_out
+        self._prune_non_cond_memory(frame)
         # Create slices of per-object outputs for subsequent interaction with each
         # individual object after tracking.
         self._add_output_per_object(frame, current_out, storage_key)

@@ -1244,14 +1248,11 @@ class SAM2VideoPredictor(SAM2Predictor):
         - If `batch` is greater than 1, the features are expanded to fit the batch size.
         - The method leverages the model's `_prepare_backbone_features` method to prepare the backbone features.
         """
-                pos = pos.expand(batch, -1, -1, -1)
-                backbone_out["vision_pos_enc"][i] = pos
-        _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
+        # check if there's precomputed backbone output
+        backbone_out = getattr(self, "backbone_out", None)
+        if backbone_out is None:
+            backbone_out = self.model.forward_image(im)
+        _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out, batch=batch)
         return vis_feats, vis_pos_embed, feat_sizes

     def _obj_id_to_idx(self, obj_id, inference_state: dict[str, Any] | None = None):
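The rewritten `get_im_features` delegates batch expansion to `_prepare_backbone_features(..., batch=batch)` instead of expanding features inline. A standalone sketch (an assumption, not the library code) of the broadcast that call performs when one image is reused for several prompts:

```python
# Cached single-image backbone features broadcast across a batch of prompts without copying.
import torch

feat = torch.randn(1, 256, 64, 64)        # features computed once for a single image
batch = 3                                  # e.g. three prompts on the same image
expanded = feat.expand(batch, -1, -1, -1)  # broadcast view, no data copy
print(expanded.shape)                      # torch.Size([3, 256, 64, 64])
```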
@@ -1831,6 +1832,25 @@ class SAM2VideoPredictor(SAM2Predictor):
         inference_state["frames_already_tracked"].clear()
         inference_state["first_ann_frame_idx"] = None

+    def _prune_non_cond_memory(self, frame_idx, inference_state=None):
+        """Prune old non-conditioning frames to bound memory usage."""
+        if not self.clear_non_cond_mem:
+            return
+        inference_state = inference_state or self.inference_state
+
+        # Determine window size
+        min_frame = frame_idx - self.model.num_maskmem * self.model.memory_temporal_stride_for_eval
+        output_dict = inference_state["output_dict"]
+
+        # Prune global non_cond_frame_outputs
+        for f in [k for k in output_dict["non_cond_frame_outputs"] if k < min_frame]:
+            output_dict["non_cond_frame_outputs"].pop(f, None)
+
+        # Prune per-object non_cond_frame_outputs
+        for obj_output_dict in inference_state.get("output_dict_per_obj", {}).values():
+            for f in [k for k in obj_output_dict["non_cond_frame_outputs"] if k < min_frame]:
+                obj_output_dict["non_cond_frame_outputs"].pop(f, None)
+

 class SAM2DynamicInteractivePredictor(SAM2Predictor):
     """SAM2DynamicInteractivePredictor extends SAM2Predictor to support dynamic interactions with video frames or a
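The new `_prune_non_cond_memory` keeps only the frames that can still participate in memory attention, so the cache no longer grows without bound on long videos. A minimal sketch (an assumption, not the library code) of the same sliding-window idea on a plain dict:

```python
# Drop cached per-frame outputs older than the memory window; illustrative values only.
num_maskmem, stride = 7, 1                     # window parameters (hypothetical)
outputs = {i: f"out_{i}" for i in range(100)}  # frame_idx -> cached output
frame_idx = 99
min_frame = frame_idx - num_maskmem * stride   # frames older than this are no longer needed

for f in [k for k in outputs if k < min_frame]:
    outputs.pop(f, None)

print(sorted(outputs))  # [92, 93, 94, 95, 96, 97, 98, 99]
```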
@@ -2055,11 +2075,12 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             self.memory_bank.append(consolidated_out)

     def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
-        """Prepare
+        """Prepare memory-conditioned features for the current image state.
+
+        If ``obj_idx`` is provided, features are prepared for a specific prompted object in the image. If ``obj_idx`` is
+        None, features are prepared for all objects. If no memory is available, a no-memory embedding is added to the
+        current vision features. Otherwise, memory from previous frames is used to condition the current vision features
+        via a transformer attention mechanism.

         Args:
             obj_idx (int | None): The index of the object for which to prepare the features.

@@ -2068,8 +2089,8 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             pix_feat_with_mem (torch.Tensor): The memory-conditioned pixel features.
         """
         if len(self.memory_bank) == 0 or isinstance(obj_idx, int):
-            #
-            #
+            # For initial conditioning frames, encode without using any previous memory.
+            # Directly add the no-memory embedding (instead of using the transformer encoder).
             pix_feat_with_mem = self.vision_feats[-1] + self.model.no_mem_embed
         else:
             # for inference frames, use the memory features from previous frames

@@ -2081,7 +2102,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             memory_pos=memory_pos_embed,
             num_obj_ptr_tokens=0,  # num_obj_ptr_tokens
         )
-        #
+        # Reshape output (HW)BC => BCHW
         return pix_feat_with_mem.permute(1, 2, 0).view(
             self._max_obj_num,
             self.model.memory_attention.d_model,

@@ -2145,9 +2166,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             pix_feat = pix_feat.view(-1, self.model.memory_attention.d_model, *self.feat_sizes[-1])
             _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._use_mask_as_output(mask)
         else:
-            #
+            # Fuse visual features with previous memory features in the memory bank.
             pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
-            #
+            # If ``obj_idx`` is provided (i.e., prompts are being added), keep only the first feature map.
             pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
             _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
                 backbone_features=pix_feat_with_mem,

@@ -2182,7 +2203,7 @@ class SAM3Predictor(SAM2Predictor):
         self.std = torch.tensor([127.5, 127.5, 127.5]).view(-1, 1, 1).to(self.device)

     def get_model(self):
-        """Retrieve and initialize the Segment Anything Model
+        """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
         from .build_sam3 import build_interactive_sam3  # slow import

         return build_interactive_sam3(self.args.model, compile=self.args.compile)

@@ -2191,16 +2212,11 @@ class SAM3Predictor(SAM2Predictor):
 class SAM3SemanticPredictor(SAM3Predictor):
     """Segment Anything Model 3 (SAM3) Predictor for image segmentation tasks."""

-    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None, bpe_path=None):
-        """Initialize the SAM3SemanticPredictor with configuration and optional overrides."""
-        super().__init__(cfg, overrides, _callbacks)
-        self.bpe_path = bpe_path
-
     def get_model(self):
         """Retrieve and initialize the Segment Anything Model 3 (SAM3) for image segmentation tasks."""
         from .build_sam3 import build_sam3_image_model  # slow import

-        return build_sam3_image_model(self.args.model,
+        return build_sam3_image_model(self.args.model, compile=self.args.compile)

     @smart_inference_mode()
     def get_im_features(self, im):

@@ -2428,6 +2444,7 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):
             inference_state=inference_state,
         )
         output_dict[storage_key][frame] = current_out
+        self._prune_non_cond_memory(frame, inference_state=inference_state)
         # Create slices of per-object outputs for subsequent interaction with each
         # individual object after tracking.
         self._add_output_per_object(frame, current_out, storage_key, inference_state=inference_state)
@@ -2437,24 +2454,6 @@ class SAM3VideoPredictor(SAM2VideoPredictor, SAM3Predictor):

         return obj_ids, pred_masks, obj_scores

-    def get_im_features(self, im, batch=1):
-        """A wrapper to get image features, supporting pre-extracted backbone outputs."""
-        if getattr(self, "backbone_out", None):
-            backbone_out = self.backbone_out
-            if batch > 1:  # expand features if there's more than one prompt
-                backbone_out = {
-                    "backbone_fpn": backbone_out["backbone_fpn"].copy(),
-                    "vision_pos_enc": backbone_out["vision_pos_enc"].copy(),
-                }
-                for i, feat in enumerate(backbone_out["backbone_fpn"]):
-                    backbone_out["backbone_fpn"][i] = feat.expand(batch, -1, -1, -1)
-                for i, pos in enumerate(backbone_out["vision_pos_enc"]):
-                    pos = pos.expand(batch, -1, -1, -1)
-                    backbone_out["vision_pos_enc"][i] = pos
-            _, vis_feats, vis_pos_embed, feat_sizes = self.model._prepare_backbone_features(backbone_out)
-            return vis_feats, vis_pos_embed, feat_sizes
-        return super().get_im_features(im, batch)
-

 class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
     """Segment Anything Model 3 (SAM3) Video Semantic Predictor."""

@@ -2479,7 +2478,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         cfg=DEFAULT_CFG,
         overrides=None,
         _callbacks=None,
-        bpe_path="bpe_simple_vocab_16e6.txt.gz",
         # prob threshold for detection outputs -- only keep detections above this threshold
         # enters NMS and det-to-track matching
         score_threshold_detection=0.5,

@@ -2499,14 +2497,12 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         hotstart_delay=0,
         hotstart_unmatch_thresh=3,
         hotstart_dup_thresh=3,
-        init_trk_keep_alive=0,
-        max_trk_keep_alive=8,
+        init_trk_keep_alive=30,
+        max_trk_keep_alive=30,
         min_trk_keep_alive=-4,
         # Threshold for suppressing overlapping objects based on recent occlusion
         suppress_overlapping_based_on_recent_occlusion_threshold=0.0,
-        decrease_trk_keep_alive_for_empty_masklets=
+        decrease_trk_keep_alive_for_empty_masklets=True,
         o2o_matching_masklets_enable=False,  # Enable hungarian matching to match existing masklets
         suppress_det_close_to_boundary=False,
         fill_hole_area=16,

@@ -2523,7 +2519,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         reconstruction_bbox_det_score=0.0,
     ):
         """Initialize the SAM3VideoSemanticPredictor with configuration and optional overrides."""
-        super().__init__(cfg, overrides, _callbacks
+        super().__init__(cfg, overrides, _callbacks)
         self.score_threshold_detection = score_threshold_detection
         self.det_nms_thresh = det_nms_thresh
         self.assoc_iou_thresh = assoc_iou_thresh

@@ -2537,7 +2533,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         self.hotstart_delay = hotstart_delay
         self.hotstart_unmatch_thresh = hotstart_unmatch_thresh
         self.hotstart_dup_thresh = hotstart_dup_thresh
-        self.suppress_unmatched_only_within_hotstart = suppress_unmatched_only_within_hotstart
         self.init_trk_keep_alive = init_trk_keep_alive
         self.max_trk_keep_alive = max_trk_keep_alive
         self.min_trk_keep_alive = min_trk_keep_alive
@@ -2662,7 +2657,7 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         ) > 0

         # names = getattr(self.model, "names", [str(i) for i in range(pred_scores.shape[0])])
-        names = dict(enumerate(str(i) for i in range(
+        names = dict(enumerate(str(i) for i in range(pred_boxes.shape[0])))
         results = []
         for masks, boxes, orig_img, img_path in zip([pred_masks], [pred_boxes], orig_imgs, self.batch[0]):
             results.append(Results(orig_img, path=img_path, names=names, masks=masks, boxes=boxes))
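A quick illustration (not from the diff) of what the completed `names` expression evaluates to: an index-keyed mapping of string class names, one per predicted box.

```python
names = dict(enumerate(str(i) for i in range(3)))
print(names)  # {0: '0', 1: '1', 2: '2'}
```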
@@ -2713,7 +2708,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         metadata = tracker_metadata_new["metadata"]
         removed_obj_ids = metadata["removed_obj_ids"]
         out["removed_obj_ids"] = removed_obj_ids
-        out["suppressed_obj_ids"] = metadata["suppressed_obj_ids"][frame_idx]
         out["frame_stats"] = frame_stats
         if self.masklet_confirmation_enable:
             status = metadata["masklet_confirmation"]["status"]

@@ -3621,7 +3615,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
         overlap_pair_to_frame_inds = metadata["overlap_pair_to_frame_inds"]
         # removed_obj_ids: object IDs that are suppressed via hot-start
         removed_obj_ids = metadata["removed_obj_ids"]
-        suppressed_obj_ids = metadata["suppressed_obj_ids"][frame_idx]

         obj_ids_newly_removed = set()  # object IDs to be newly removed on this frame
         hotstart_diff = frame_idx - self.hotstart_delay if not reverse else frame_idx + self.hotstart_delay

@@ -3671,12 +3664,12 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
             )
             if (
                 trk_keep_alive[obj_id] <= 0  # Object has not been matched for too long
-                and not self.suppress_unmatched_only_within_hotstart
                 and obj_id not in removed_obj_ids
                 and obj_id not in obj_ids_newly_removed
             ):
-                LOGGER.debug(f"
+                LOGGER.debug(f"Removing object {obj_id} at frame {frame_idx}, due to being unmatched")
+                # directly removed the object instead of suppressing it
+                obj_ids_newly_removed.add(obj_id)

         # Step 3: removed tracks that overlaps with another track for `hotstart_dup_thresh` frames
         # a) find overlaps tracks -- we consider overlap if they match to the same detection

@@ -3855,8 +3848,6 @@ class SAM3VideoSemanticPredictor(SAM3SemanticPredictor):
             "trk_keep_alive": defaultdict(int),  # This is used only for object suppression not for removal
             "overlap_pair_to_frame_inds": defaultdict(list),
             "removed_obj_ids": set(),
-            # frame_idx --> set of objects with suppressed outputs, but still continue to be tracked
-            "suppressed_obj_ids": defaultdict(set),
         }
         if self.masklet_confirmation_enable:
             # all the following are np.ndarray with the same shape as `obj_ids_all_gpu`
ultralytics/models/sam/sam3/encoder.py CHANGED

@@ -171,7 +171,7 @@ class TransformerEncoderLayer(nn.Module):
         assert tgt.shape[0] % 2 == 0
         other_tgt = tgt[tgt.shape[0] // 2 :]
         tgt = tgt[: tgt.shape[0] // 2]
-        tgt2 = self.norm1(tgt)
+        tgt2 = self.norm1(tgt).contiguous()
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
         tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0]
         tgt = tgt + self.dropout1(tgt2)

@@ -179,13 +179,13 @@ class TransformerEncoderLayer(nn.Module):
         # Recombine
         tgt = torch.cat((tgt, other_tgt), dim=0)
         tgt2 = self.norm2(tgt)
+        memory = memory.to(tgt2.dtype).contiguous()
         tgt2 = self.cross_attn_image(
             query=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2,
-            key=memory
-            value=memory
+            key=memory + pos if self.pos_enc_at_cross_attn_keys else memory,
+            value=memory,
             attn_mask=memory_mask,
             key_padding_mask=memory_key_padding_mask,
-            # attn_bias=attn_bias,
         )[0]
         tgt = tgt + self.dropout2(tgt2)
         tgt2 = self.norm3(tgt)
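Casting `memory` to the query dtype before cross-attention matters because fused attention kernels expect query, key, and value to share a dtype, which can silently diverge under mixed precision. A minimal sketch of the failure mode this guards against (an assumption about the motivation, not the library code):

```python
import torch
import torch.nn.functional as F

q = torch.randn(2, 4, 16, 32, dtype=torch.float16)
memory = torch.randn(2, 4, 16, 32)             # float32, e.g. cached outside autocast
k = v = memory.to(q.dtype).contiguous()        # align dtype and memory layout
out = F.scaled_dot_product_attention(q, k, v)  # would raise if dtypes were mismatched
print(out.dtype)                               # torch.float16
```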
ultralytics/models/sam/sam3/geometry_encoders.py CHANGED

@@ -42,8 +42,8 @@ def concat_padded_sequences(seq1, mask1, seq2, mask2, return_index: bool = False
     assert seq1_length == mask1.size(1)
     assert seq2_length == mask2.size(1)

-    torch.
-    torch.
+    torch._assert(is_right_padded(mask1), "Mask is not right padded")
+    torch._assert(is_right_padded(mask2), "Mask is not right padded")

     actual_seq1_lengths = (~mask1).sum(dim=-1)
     actual_seq2_lengths = (~mask2).sum(dim=-1)
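A sketch of what a right-padded mask check can look like (this is an assumption about what `is_right_padded` verifies, not the library implementation): with `True` marking padding positions, a row is right padded when no valid (`False`) position appears after the first padded (`True`) position.

```python
import torch

def is_right_padded_sketch(mask: torch.Tensor) -> torch.Tensor:
    # False (valid) entries must never follow True (padding) entries along the last dimension.
    return (mask[..., :-1] <= mask[..., 1:]).all()

ok = torch.tensor([[False, False, True, True]])
bad = torch.tensor([[False, True, False, True]])
print(is_right_padded_sketch(ok), is_right_padded_sketch(bad))  # tensor(True) tensor(False)
```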
@@ -288,7 +288,7 @@ class SequenceGeometryEncoder(nn.Module):
         # Convert boxes to xyxy format and denormalize
         boxes_xyxy = xywh2xyxy(boxes.to(img_feats.dtype))
         scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype)
-        scale = scale.
+        scale = scale.to(device=boxes_xyxy.device, non_blocking=True)
         scale = scale.view(1, 1, 4)
         boxes_xyxy = boxes_xyxy * scale
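A small worked example (not from the diff) of the denormalization pattern above, using the real `xywh2xyxy` helper: normalized xywh boxes are converted to corner format and then scaled to pixel coordinates.

```python
import torch
from ultralytics.utils.ops import xywh2xyxy

W, H = 1008, 756
boxes = torch.tensor([[[0.5, 0.5, 0.2, 0.4]]])  # (batch, num_boxes, 4), normalized xywh
boxes_xyxy = xywh2xyxy(boxes)                   # still normalized, now xyxy
scale = torch.tensor([W, H, W, H], dtype=boxes_xyxy.dtype).view(1, 1, 4)
print(boxes_xyxy * scale)                       # tensor([[[403.2000, 226.8000, 604.8000, 529.2000]]])
```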
ultralytics/models/sam/sam3/necks.py CHANGED

@@ -103,27 +103,27 @@ class Sam3DualViTDetNeck(nn.Module):

     def forward(
         self, tensor_list: list[torch.Tensor]
-    ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor], list[torch.Tensor]]:
-        """Get
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor], list[torch.Tensor] | None, list[torch.Tensor] | None]:
+        """Get feature maps and positional encodings from the neck."""
         xs = self.trunk(tensor_list)
-        sam3_out, sam3_pos = [], []
-        sam2_out, sam2_pos = None, None
-        if self.sam2_convs is not None:
-            sam2_out, sam2_pos = [], []
         x = xs[-1]  # simpleFPN
-            sam3_pos.append(sam3_pos_out)
-        if self.sam2_convs is not None:
-            sam2_x_out = self.sam2_convs[i](x)
-            sam2_pos_out = self.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
-            sam2_out.append(sam2_x_out)
-            sam2_pos.append(sam2_pos_out)
+        sam3_out, sam3_pos = self.sam_forward_feature_levels(x, self.convs)
+        if self.sam2_convs is None:
+            return sam3_out, sam3_pos, None, None
+        sam2_out, sam2_pos = self.sam_forward_feature_levels(x, self.sam2_convs)
         return sam3_out, sam3_pos, sam2_out, sam2_pos

+    def sam_forward_feature_levels(
+        self, x: torch.Tensor, convs: nn.ModuleList
+    ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
+        """Run neck convolutions and compute positional encodings for each feature level."""
+        outs, poss = [], []
+        for conv in convs:
+            feat = conv(x)
+            outs.append(feat)
+            poss.append(self.position_encoding(feat).to(feat.dtype))
+        return outs, poss
+
     def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
         """Set the image size for the trunk backbone."""
         self.trunk.set_imgsz(imgsz)
ultralytics/models/sam/sam3/sam3_image.py CHANGED

@@ -11,6 +11,7 @@ import torch
 from ultralytics.nn.modules.utils import inverse_sigmoid
 from ultralytics.utils.ops import xywh2xyxy

+from ..modules.sam import SAM2Model
 from .geometry_encoders import Prompt
 from .vl_combiner import SAM3VLBackbone

@@ -93,25 +94,6 @@ class SAM3SemanticModel(torch.nn.Module):
         self.text_embeddings = {}
         self.names = []

-    def _prepare_backbone_features(self, backbone_out, num_prompts=1):
-        """Prepare and flatten visual features from the image backbone output for further processing."""
-        if num_prompts > 1:  # expand features if there's more than one prompt
-            for i, feat in enumerate(backbone_out["backbone_fpn"]):
-                backbone_out["backbone_fpn"][i] = feat.expand(num_prompts, -1, -1, -1)
-            for i, pos in enumerate(backbone_out["vision_pos_enc"]):
-                pos = pos.expand(num_prompts, -1, -1, -1)
-                backbone_out["vision_pos_enc"][i] = pos
-        assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"])
-        assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels
-
-        feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :]
-        vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :]
-        feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds]
-        # flatten NxCxHxW to HWxNxC
-        vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps]
-        vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds]
-        return backbone_out, vision_feats, vision_pos_embeds, feat_sizes
-
     def _encode_prompt(
         self,
         img_feats,

@@ -304,8 +286,8 @@ class SAM3SemanticModel(torch.nn.Module):
         self, backbone_out: dict[str, torch.Tensor], text_ids: torch.Tensor, geometric_prompt: Prompt = None
     ):
         """Forward pass for grounding (detection + segmentation) given input images and text."""
-        backbone_out, img_feats, img_pos_embeds, vis_feat_sizes =
-            backbone_out,
+        backbone_out, img_feats, img_pos_embeds, vis_feat_sizes = SAM2Model._prepare_backbone_features(
+            self, backbone_out, batch=len(text_ids)
         )
         backbone_out.update({k: v for k, v in self.text_embeddings.items()})
         with torch.profiler.record_function("SAM3Image._encode_prompt"):
ultralytics/models/sam/sam3/vl_combiner.py CHANGED

@@ -110,15 +110,10 @@ class SAM3VLBackbone(nn.Module):
     def forward_image_sam2(self, samples: torch.Tensor):
         """Forward pass of the vision backbone to get SAM2 features only."""
         xs = self.vision_backbone.trunk(samples)
-        sam2_features, sam2_pos = [], []
         x = xs[-1]  # simpleFPN

         assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
-            sam2_x_out = self.vision_backbone.sam2_convs[i](x)
-            sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
-            sam2_features.append(sam2_x_out)
-            sam2_pos.append(sam2_pos_out)
+        sam2_features, sam2_pos = self.vision_backbone.sam_forward_feature_levels(x, self.vision_backbone.sam2_convs)

         if self.scalp > 0:
             # Discard the lowest resolution features
ultralytics/models/yolo/classify/val.py CHANGED

@@ -57,7 +57,7 @@ class ClassificationValidator(BaseValidator):
         """Initialize ClassificationValidator with dataloader, save directory, and other parameters.

         Args:
-            dataloader (torch.utils.data.DataLoader, optional):
+            dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
             save_dir (str | Path, optional): Directory to save results.
             args (dict, optional): Arguments containing model and validation configuration.
             _callbacks (list, optional): List of callback functions to be called during validation.

ultralytics/models/yolo/detect/train.py CHANGED

@@ -53,7 +53,7 @@ class DetectionTrainer(BaseTrainer):
     """

     def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
-        """Initialize a DetectionTrainer object for training YOLO object detection
+        """Initialize a DetectionTrainer object for training YOLO object detection models.

         Args:
             cfg (dict, optional): Default configuration dictionary containing training parameters.

ultralytics/models/yolo/detect/val.py CHANGED

@@ -46,7 +46,7 @@ class DetectionValidator(BaseValidator):
         """Initialize detection validator with necessary variables and settings.

         Args:
-            dataloader (torch.utils.data.DataLoader, optional):
+            dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
             save_dir (Path, optional): Directory to save results.
             args (dict[str, Any], optional): Arguments for the validator.
             _callbacks (list[Any], optional): List of callback functions.

@@ -256,7 +256,7 @@ class DetectionValidator(BaseValidator):
         pf = "%22s" + "%11i" * 2 + "%11.3g" * len(self.metrics.keys)  # print format
         LOGGER.info(pf % ("all", self.seen, self.metrics.nt_per_class.sum(), *self.metrics.mean_results()))
         if self.metrics.nt_per_class.sum() == 0:
-            LOGGER.warning(f"no labels found in {self.args.task} set,
+            LOGGER.warning(f"no labels found in {self.args.task} set, cannot compute metrics without labels")

         # Print results per class
         if self.args.verbose and not self.training and self.nc > 1 and len(self.metrics.stats):

@@ -308,7 +308,7 @@ class DetectionValidator(BaseValidator):
             batch_size (int): Size of each batch.

         Returns:
-            (torch.utils.data.DataLoader):
+            (torch.utils.data.DataLoader): DataLoader for validation.
         """
         dataset = self.build_dataset(dataset_path, batch=batch_size, mode="val")
         return build_dataloader(

@@ -460,11 +460,11 @@ class DetectionValidator(BaseValidator):

         Args:
             stats (dict[str, Any]): Dictionary to store computed metrics and statistics.
-            pred_json (str | Path
-            anno_json (str | Path
-            iou_types (str | list[str]
+            pred_json (str | Path): Path to JSON file containing predictions in COCO format.
+            anno_json (str | Path): Path to JSON file containing ground truth annotations in COCO format.
+            iou_types (str | list[str]): IoU type(s) for evaluation. Can be single string or list of strings. Common
                 values include "bbox", "segm", "keypoints". Defaults to "bbox".
-            suffix (str | list[str]
+            suffix (str | list[str]): Suffix to append to metric names in stats dictionary. Should correspond to
                 iou_types if multiple types provided. Defaults to "Box".

         Returns:

ultralytics/models/yolo/obb/val.py CHANGED

@@ -50,7 +50,7 @@ class OBBValidator(DetectionValidator):
         extends the DetectionValidator class and configures it specifically for the OBB task.

         Args:
-            dataloader (torch.utils.data.DataLoader, optional):
+            dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
             save_dir (str | Path, optional): Directory to save results.
             args (dict | SimpleNamespace, optional): Arguments containing validation parameters.
             _callbacks (list, optional): List of callback functions to be called during validation.

ultralytics/models/yolo/pose/val.py CHANGED

@@ -59,7 +59,7 @@ class PoseValidator(DetectionValidator):
         specialized metrics for pose evaluation.

         Args:
-            dataloader (torch.utils.data.DataLoader, optional):
+            dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
             save_dir (Path | str, optional): Directory to save results.
             args (dict, optional): Arguments for the validator including task set to "pose".
             _callbacks (list, optional): List of callback functions to be executed during validation.

ultralytics/models/yolo/segment/val.py CHANGED

@@ -39,7 +39,7 @@ class SegmentationValidator(DetectionValidator):
         """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics.

         Args:
-            dataloader (torch.utils.data.DataLoader, optional):
+            dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
             save_dir (Path, optional): Directory to save results.
             args (namespace, optional): Arguments for the validator.
             _callbacks (list, optional): List of callback functions.
ultralytics/nn/autobackend.py CHANGED

@@ -127,7 +127,7 @@ class AutoBackend(nn.Module):

     Methods:
         forward: Run inference on an input image.
-        from_numpy: Convert
+        from_numpy: Convert NumPy arrays to tensors on the model device.
         warmup: Warm up the model with a dummy input.
         _model_type: Determine the model type from file path.

@@ -182,7 +182,7 @@ class AutoBackend(nn.Module):
             triton,
         ) = self._model_type("" if nn_module else model)
         fp16 &= pt or jit or onnx or xml or engine or nn_module or triton  # FP16
-        nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn  # BHWC formats (vs torch
+        nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn  # BHWC formats (vs torch BCHW)
         stride, ch = 32, 3  # default stride and channels
         end2end, dynamic = False, False
         metadata, task = None, None

@@ -894,14 +894,14 @@ class AutoBackend(nn.Module):
         else:
             return self.from_numpy(y)

-    def from_numpy(self, x: np.ndarray) -> torch.Tensor:
-        """Convert a
+    def from_numpy(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+        """Convert a NumPy array to a torch tensor on the model device.

         Args:
-            x (np.ndarray):
+            x (np.ndarray | torch.Tensor): Input array or tensor.

         Returns:
-            (torch.Tensor):
+            (torch.Tensor): Tensor on `self.device`.
         """
         return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x
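A hedged usage sketch for the `from_numpy` behavior documented above (the checkpoint name and array shape are assumptions for illustration): NumPy outputs from a non-PyTorch backend are moved onto the model device as tensors, while existing tensors pass through unchanged.

```python
import numpy as np
from ultralytics.nn.autobackend import AutoBackend

backend = AutoBackend("yolo11n.pt")            # loads a PyTorch checkpoint by default
y = np.zeros((1, 84, 8400), dtype=np.float32)  # e.g. a raw ONNX Runtime-style output
t = backend.from_numpy(y)
print(t.shape, t.device)                       # torch.Size([1, 84, 8400]) cpu
```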
@@ -909,7 +909,7 @@ class AutoBackend(nn.Module):
         """Warm up the model by running one forward pass with a dummy input.

         Args:
-            imgsz (tuple
+            imgsz (tuple[int, int, int, int]): Dummy input shape in (batch, channels, height, width) format.
         """
         warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
         if any(warmup_types) and (self.device.type != "cpu" or self.triton):
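A brief usage sketch of the documented signature above (shape values are illustrative; `backend` is the AutoBackend instance from the earlier sketch): run one dummy forward pass so later timed inference is not skewed by lazy initialization.

```python
backend.warmup(imgsz=(1, 3, 640, 640))  # (batch, channels, height, width)
```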
@@ -931,8 +931,8 @@ class AutoBackend(nn.Module):
             (list[bool]): List of booleans indicating the model type.

         Examples:
-            >>>
-            >>>
+            >>> types = AutoBackend._model_type("path/to/model.onnx")
+            >>> assert types[2]  # onnx
         """
         from ultralytics.engine.exporter import export_formats
ultralytics/nn/modules/block.py CHANGED

@@ -1812,7 +1812,7 @@ class A2C2f(nn.Module):
         """
         super().__init__()
         c_ = int(c2 * e)  # hidden channels
-        assert c_ % 32 == 0, "Dimension of ABlock be a multiple of 32."
+        assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."

         self.cv1 = Conv(c1, c_, 1, 1)
         self.cv2 = Conv((1 + n) * c_, c2, 1)
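A quick check of the constraint behind the corrected assertion message (numbers are illustrative): the hidden channel count `c_ = int(c2 * e)` must be a multiple of 32 for the block to pass the assert.

```python
c2, e = 512, 0.5
c_ = int(c2 * e)
print(c_, c_ % 32 == 0)  # 256 True

c2, e = 200, 0.5
c_ = int(c2 * e)
print(c_, c_ % 32 == 0)  # 100 False -> would trip the assert
```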
ultralytics/nn/tasks.py CHANGED

@@ -866,7 +866,7 @@ class WorldModel(DetectionModel):
         self.model[-1].nc = len(text)

     def get_text_pe(self, text, batch=80, cache_clip_model=True):
-        """
+        """Get text positional embeddings for offline inference without CLIP model.

         Args:
             text (list[str]): List of class names.

@@ -987,13 +987,13 @@ class YOLOEModel(DetectionModel):

     @smart_inference_mode()
     def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
-        """
+        """Get text positional embeddings for offline inference without CLIP model.

         Args:
             text (list[str]): List of class names.
             batch (int): Batch size for processing text tokens.
             cache_clip_model (bool): Whether to cache the CLIP model.
-            without_reprta (bool): Whether to return text embeddings
+            without_reprta (bool): Whether to return text embeddings without reprta module processing.

         Returns:
             (torch.Tensor): Text positional embeddings.
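A hedged usage sketch of the workflow these docstrings describe (the weight and image filenames are assumptions for illustration): precompute the text prompt embeddings once with `get_text_pe` and bake them into the model so inference can run without the CLIP text encoder.

```python
from ultralytics import YOLOE

model = YOLOE("yoloe-11s-seg.pt")
names = ["person", "bus"]
model.set_classes(names, model.get_text_pe(names))  # embeddings computed once, reusable offline
results = model.predict("bus.jpg")
```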