dgenerate-ultralytics-headless 8.3.236__py3-none-any.whl → 8.3.239__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/RECORD +117 -105
  3. tests/test_exports.py +3 -1
  4. tests/test_python.py +2 -2
  5. tests/test_solutions.py +6 -6
  6. ultralytics/__init__.py +1 -1
  7. ultralytics/cfg/__init__.py +4 -4
  8. ultralytics/cfg/datasets/Argoverse.yaml +7 -6
  9. ultralytics/cfg/datasets/DOTAv1.5.yaml +1 -1
  10. ultralytics/cfg/datasets/DOTAv1.yaml +1 -1
  11. ultralytics/cfg/datasets/VOC.yaml +15 -16
  12. ultralytics/cfg/datasets/african-wildlife.yaml +1 -1
  13. ultralytics/cfg/datasets/coco128-seg.yaml +1 -1
  14. ultralytics/cfg/datasets/dota8-multispectral.yaml +1 -1
  15. ultralytics/cfg/datasets/dota8.yaml +2 -2
  16. ultralytics/cfg/datasets/kitti.yaml +1 -1
  17. ultralytics/cfg/datasets/xView.yaml +16 -16
  18. ultralytics/cfg/models/11/yolo11-pose.yaml +1 -1
  19. ultralytics/cfg/models/11/yoloe-11-seg.yaml +2 -2
  20. ultralytics/cfg/models/11/yoloe-11.yaml +2 -2
  21. ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +9 -6
  22. ultralytics/cfg/models/v8/yoloe-v8.yaml +9 -6
  23. ultralytics/cfg/models/v8/yolov8-cls-resnet101.yaml +1 -1
  24. ultralytics/cfg/models/v8/yolov8-cls-resnet50.yaml +1 -1
  25. ultralytics/cfg/models/v8/yolov8-ghost-p2.yaml +2 -2
  26. ultralytics/cfg/models/v8/yolov8-ghost-p6.yaml +2 -2
  27. ultralytics/cfg/models/v8/yolov8-ghost.yaml +2 -2
  28. ultralytics/cfg/models/v8/yolov8-obb.yaml +1 -1
  29. ultralytics/cfg/models/v8/yolov8-p2.yaml +1 -1
  30. ultralytics/cfg/models/v8/yolov8-pose-p6.yaml +1 -1
  31. ultralytics/cfg/models/v8/yolov8-rtdetr.yaml +1 -1
  32. ultralytics/cfg/models/v8/yolov8-world.yaml +1 -1
  33. ultralytics/cfg/models/v8/yolov8-worldv2.yaml +6 -6
  34. ultralytics/data/augment.py +1 -1
  35. ultralytics/data/base.py +4 -2
  36. ultralytics/data/build.py +4 -4
  37. ultralytics/data/loaders.py +17 -12
  38. ultralytics/data/utils.py +4 -4
  39. ultralytics/engine/exporter.py +40 -25
  40. ultralytics/engine/predictor.py +8 -6
  41. ultralytics/engine/results.py +12 -13
  42. ultralytics/engine/trainer.py +10 -2
  43. ultralytics/engine/tuner.py +2 -3
  44. ultralytics/engine/validator.py +2 -2
  45. ultralytics/models/fastsam/model.py +2 -2
  46. ultralytics/models/fastsam/predict.py +2 -3
  47. ultralytics/models/fastsam/val.py +4 -4
  48. ultralytics/models/rtdetr/predict.py +2 -3
  49. ultralytics/models/rtdetr/val.py +10 -5
  50. ultralytics/models/sam/__init__.py +14 -1
  51. ultralytics/models/sam/build.py +22 -13
  52. ultralytics/models/sam/build_sam3.py +377 -0
  53. ultralytics/models/sam/model.py +13 -5
  54. ultralytics/models/sam/modules/blocks.py +20 -8
  55. ultralytics/models/sam/modules/decoders.py +2 -3
  56. ultralytics/models/sam/modules/encoders.py +4 -1
  57. ultralytics/models/sam/modules/memory_attention.py +6 -2
  58. ultralytics/models/sam/modules/sam.py +159 -10
  59. ultralytics/models/sam/modules/utils.py +134 -4
  60. ultralytics/models/sam/predict.py +2073 -139
  61. ultralytics/models/sam/sam3/__init__.py +3 -0
  62. ultralytics/models/sam/sam3/decoder.py +546 -0
  63. ultralytics/models/sam/sam3/encoder.py +535 -0
  64. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  65. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  66. ultralytics/models/sam/sam3/model_misc.py +198 -0
  67. ultralytics/models/sam/sam3/necks.py +129 -0
  68. ultralytics/models/sam/sam3/sam3_image.py +339 -0
  69. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  70. ultralytics/models/sam/sam3/vitdet.py +546 -0
  71. ultralytics/models/sam/sam3/vl_combiner.py +160 -0
  72. ultralytics/models/yolo/classify/val.py +1 -1
  73. ultralytics/models/yolo/detect/train.py +1 -1
  74. ultralytics/models/yolo/detect/val.py +7 -7
  75. ultralytics/models/yolo/obb/val.py +19 -8
  76. ultralytics/models/yolo/pose/val.py +1 -1
  77. ultralytics/models/yolo/segment/val.py +1 -1
  78. ultralytics/nn/autobackend.py +9 -9
  79. ultralytics/nn/modules/block.py +1 -1
  80. ultralytics/nn/modules/transformer.py +21 -1
  81. ultralytics/nn/tasks.py +3 -3
  82. ultralytics/nn/text_model.py +2 -7
  83. ultralytics/solutions/ai_gym.py +1 -1
  84. ultralytics/solutions/analytics.py +6 -6
  85. ultralytics/solutions/config.py +1 -1
  86. ultralytics/solutions/distance_calculation.py +1 -1
  87. ultralytics/solutions/object_counter.py +1 -1
  88. ultralytics/solutions/object_cropper.py +3 -6
  89. ultralytics/solutions/parking_management.py +21 -17
  90. ultralytics/solutions/queue_management.py +5 -5
  91. ultralytics/solutions/region_counter.py +2 -2
  92. ultralytics/solutions/security_alarm.py +1 -1
  93. ultralytics/solutions/solutions.py +45 -22
  94. ultralytics/solutions/speed_estimation.py +1 -1
  95. ultralytics/trackers/basetrack.py +1 -1
  96. ultralytics/trackers/bot_sort.py +4 -3
  97. ultralytics/trackers/byte_tracker.py +4 -4
  98. ultralytics/trackers/utils/gmc.py +6 -7
  99. ultralytics/trackers/utils/kalman_filter.py +2 -1
  100. ultralytics/trackers/utils/matching.py +4 -3
  101. ultralytics/utils/__init__.py +12 -3
  102. ultralytics/utils/benchmarks.py +2 -2
  103. ultralytics/utils/callbacks/tensorboard.py +19 -25
  104. ultralytics/utils/checks.py +4 -3
  105. ultralytics/utils/downloads.py +1 -1
  106. ultralytics/utils/export/tensorflow.py +16 -2
  107. ultralytics/utils/files.py +13 -12
  108. ultralytics/utils/logger.py +62 -27
  109. ultralytics/utils/metrics.py +1 -1
  110. ultralytics/utils/ops.py +7 -9
  111. ultralytics/utils/patches.py +3 -3
  112. ultralytics/utils/plotting.py +7 -12
  113. ultralytics/utils/tuner.py +1 -1
  114. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/WHEEL +0 -0
  115. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/entry_points.txt +0 -0
  116. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/licenses/LICENSE +0 -0
  117. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.239.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/sam3/vl_combiner.py ADDED
@@ -0,0 +1,160 @@
+ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+ """Provides utility to combine a vision backbone with a language backbone."""
+
+ from __future__ import annotations
+
+ from copy import copy
+
+ import torch
+ import torch.nn as nn
+ from torch.nn.attention import SDPBackend, sdpa_kernel
+
+ from .necks import Sam3DualViTDetNeck
+
+
+ class SAM3VLBackbone(nn.Module):
+     """This backbone combines a vision backbone and a language backbone without fusion. As such it is more of a
+     convenience wrapper to handle the two backbones together.
+
+     It adds support for activation checkpointing and compilation.
+     """
+
+     def __init__(
+         self,
+         visual: Sam3DualViTDetNeck,
+         text,
+         compile_visual: bool = False,
+         act_ckpt_whole_vision_backbone: bool = False,
+         act_ckpt_whole_language_backbone: bool = False,
+         scalp=0,
+     ):
+         """Initialize the backbone combiner.
+
+         :param visual: The vision backbone to use
+         :param text: The text encoder to use
+         """
+         super().__init__()
+         self.vision_backbone: Sam3DualViTDetNeck = torch.compile(visual) if compile_visual else visual
+         self.language_backbone = text
+         self.scalp = scalp
+         # allow running activation checkpointing on the entire vision and language backbones
+         self.act_ckpt_whole_vision_backbone = act_ckpt_whole_vision_backbone
+         self.act_ckpt_whole_language_backbone = act_ckpt_whole_language_backbone
+
+     def forward(
+         self,
+         samples: torch.Tensor,
+         captions: list[str],
+         input_boxes: torch.Tensor = None,
+         additional_text: list[str] | None = None,
+     ):
+         """Forward pass of the backbone combiner.
+
+         :param samples: The input images
+         :param captions: The input captions
+         :param input_boxes: If the text contains place-holders for boxes, this
+             parameter contains the tensor containing their spatial features
+         :param additional_text: This can be used to encode some additional text
+             (different from the captions) in the same forward of the backbone
+         :return: Output dictionary with the following keys:
+             - vision_features: The output of the vision backbone
+             - language_features: The output of the language backbone
+             - language_mask: The attention mask of the language backbone
+             - vision_pos_enc: The positional encoding of the vision backbone
+             - (optional) additional_text_features: The output of the language
+               backbone for the additional text
+             - (optional) additional_text_mask: The attention mask of the
+               language backbone for the additional text
+         """
+         output = self.forward_image(samples)
+         output.update(self.forward_text(captions, input_boxes, additional_text))
+         return output
+
+     def forward_image(self, samples: torch.Tensor):
+         """Forward pass of the vision backbone and get both SAM3 and SAM2 features."""
+         # Forward through backbone
+         sam3_features, sam3_pos, sam2_features, sam2_pos = self.vision_backbone.forward(samples)
+         if self.scalp > 0:
+             # Discard the lowest resolution features
+             sam3_features, sam3_pos = (
+                 sam3_features[: -self.scalp],
+                 sam3_pos[: -self.scalp],
+             )
+             if sam2_features is not None and sam2_pos is not None:
+                 sam2_features, sam2_pos = (
+                     sam2_features[: -self.scalp],
+                     sam2_pos[: -self.scalp],
+                 )
+
+         sam2_output = None
+
+         if sam2_features is not None and sam2_pos is not None:
+             sam2_src = sam2_features[-1]
+             sam2_output = {
+                 "vision_features": sam2_src,
+                 "vision_pos_enc": sam2_pos,
+                 "backbone_fpn": sam2_features,
+             }
+
+         sam3_src = sam3_features[-1]
+         return {
+             "vision_features": sam3_src,
+             "vision_pos_enc": sam3_pos,
+             "backbone_fpn": sam3_features,
+             "sam2_backbone_out": sam2_output,
+         }
+
+     def forward_image_sam2(self, samples: torch.Tensor):
+         """Forward pass of the vision backbone to get SAM2 features only."""
+         xs = self.vision_backbone.trunk(samples)
+         x = xs[-1]  # simpleFPN
+
+         assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
+         sam2_features, sam2_pos = self.vision_backbone.sam_forward_feature_levels(x, self.vision_backbone.sam2_convs)
+
+         if self.scalp > 0:
+             # Discard the lowest resolution features
+             sam2_features, sam2_pos = (
+                 sam2_features[: -self.scalp],
+                 sam2_pos[: -self.scalp],
+             )
+
+         return {
+             "vision_features": sam2_features[-1],
+             "vision_pos_enc": sam2_pos,
+             "backbone_fpn": sam2_features,
+         }
+
+     def forward_text(self, captions, input_boxes=None, additional_text=None):
+         """Forward pass of the text encoder."""
+         output = {}
+
+         # Forward through text_encoder
+         text_to_encode = copy(captions)
+         if additional_text is not None:
+             # if there are additional_text, we piggy-back them into this forward.
+             # They'll be used later for output alignment
+             text_to_encode += additional_text
+
+         with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]):
+             text_attention_mask, text_memory, text_embeds = self.language_backbone(text_to_encode, input_boxes)
+
+         if additional_text is not None:
+             output["additional_text_features"] = text_memory[:, -len(additional_text) :]
+             output["additional_text_mask"] = text_attention_mask[-len(additional_text) :]
+
+         text_memory = text_memory[:, : len(captions)]
+         text_attention_mask = text_attention_mask[: len(captions)]
+         text_embeds = text_embeds[:, : len(captions)]
+         output["language_features"] = text_memory
+         output["language_mask"] = text_attention_mask
+         output["language_embeds"] = text_embeds  # Text embeddings before forward to the encoder
+
+         return output
+
+     def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
+         """Set the image size for the vision backbone."""
+         self.vision_backbone.set_imgsz(imgsz)
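Note: the combiner above is a thin wrapper, so its behavior can be exercised with stub backbones. A minimal sketch; `StubVision` and `StubText` are hypothetical stand-ins (not the real `Sam3DualViTDetNeck` or text encoder), and the tensor shapes are illustrative:

```python
import torch
import torch.nn as nn

from ultralytics.models.sam.sam3.vl_combiner import SAM3VLBackbone


class StubVision(nn.Module):
    """Stand-in for Sam3DualViTDetNeck: one SAM3 feature level plus its positional encoding, no SAM2 branch."""

    def forward(self, x):
        feat = torch.randn(x.shape[0], 256, 63, 63)
        return [feat], [torch.zeros_like(feat)], None, None  # sam3_features, sam3_pos, sam2_features, sam2_pos


class StubText(nn.Module):
    """Stand-in for the language backbone: returns (mask, memory, embeds) in the layout forward_text slices."""

    def forward(self, texts, input_boxes=None):
        n, seq, dim = len(texts), 16, 256
        return torch.ones(n, seq, dtype=torch.bool), torch.randn(seq, n, dim), torch.randn(seq, n, dim)


backbone = SAM3VLBackbone(visual=StubVision(), text=StubText())
out = backbone(torch.randn(2, 3, 1008, 1008), captions=["a red car", "a dog"])
print(sorted(out))  # backbone_fpn, language_*, sam2_backbone_out, vision_features, vision_pos_enc
```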
ultralytics/models/yolo/classify/val.py CHANGED
@@ -57,7 +57,7 @@ class ClassificationValidator(BaseValidator):
  """Initialize ClassificationValidator with dataloader, save directory, and other parameters.

  Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
  save_dir (str | Path, optional): Directory to save results.
  args (dict, optional): Arguments containing model and validation configuration.
  _callbacks (list, optional): List of callback functions to be called during validation.
ultralytics/models/yolo/detect/train.py CHANGED
@@ -53,7 +53,7 @@ class DetectionTrainer(BaseTrainer):
  """

  def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
- """Initialize a DetectionTrainer object for training YOLO object detection model training.
+ """Initialize a DetectionTrainer object for training YOLO object detection models.

  Args:
  cfg (dict, optional): Default configuration dictionary containing training parameters.
ultralytics/models/yolo/detect/val.py CHANGED
@@ -46,7 +46,7 @@ class DetectionValidator(BaseValidator):
  """Initialize detection validator with necessary variables and settings.

  Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
  save_dir (Path, optional): Directory to save results.
  args (dict[str, Any], optional): Arguments for the validator.
  _callbacks (list[Any], optional): List of callback functions.
@@ -256,7 +256,7 @@ class DetectionValidator(BaseValidator):
  pf = "%22s" + "%11i" * 2 + "%11.3g" * len(self.metrics.keys)  # print format
  LOGGER.info(pf % ("all", self.seen, self.metrics.nt_per_class.sum(), *self.metrics.mean_results()))
  if self.metrics.nt_per_class.sum() == 0:
- LOGGER.warning(f"no labels found in {self.args.task} set, can not compute metrics without labels")
+ LOGGER.warning(f"no labels found in {self.args.task} set, cannot compute metrics without labels")

  # Print results per class
  if self.args.verbose and not self.training and self.nc > 1 and len(self.metrics.stats):
@@ -308,7 +308,7 @@ class DetectionValidator(BaseValidator):
  batch_size (int): Size of each batch.

  Returns:
- (torch.utils.data.DataLoader): Dataloader for validation.
+ (torch.utils.data.DataLoader): DataLoader for validation.
  """
  dataset = self.build_dataset(dataset_path, batch=batch_size, mode="val")
  return build_dataloader(
@@ -460,11 +460,11 @@ class DetectionValidator(BaseValidator):

  Args:
  stats (dict[str, Any]): Dictionary to store computed metrics and statistics.
- pred_json (str | Path]): Path to JSON file containing predictions in COCO format.
- anno_json (str | Path]): Path to JSON file containing ground truth annotations in COCO format.
- iou_types (str | list[str]]): IoU type(s) for evaluation. Can be single string or list of strings. Common
+ pred_json (str | Path): Path to JSON file containing predictions in COCO format.
+ anno_json (str | Path): Path to JSON file containing ground truth annotations in COCO format.
+ iou_types (str | list[str]): IoU type(s) for evaluation. Can be single string or list of strings. Common
  values include "bbox", "segm", "keypoints". Defaults to "bbox".
- suffix (str | list[str]]): Suffix to append to metric names in stats dictionary. Should correspond to
+ suffix (str | list[str]): Suffix to append to metric names in stats dictionary. Should correspond to
  iou_types if multiple types provided. Defaults to "Box".

  Returns:
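Note: the corrected docstring above allows `iou_types` and `suffix` to be a single string or aligned lists. A sketch of the normalization this implies (hypothetical helper, not the validator's actual code):

```python
def pair_iou_types(iou_types="bbox", suffix="Box"):
    """Normalize str-or-list inputs into aligned (iou_type, suffix) pairs."""
    iou_types = [iou_types] if isinstance(iou_types, str) else list(iou_types)
    suffix = [suffix] if isinstance(suffix, str) else list(suffix)
    assert len(iou_types) == len(suffix), "provide one suffix per IoU type"
    return list(zip(iou_types, suffix))


print(pair_iou_types())  # [('bbox', 'Box')]
print(pair_iou_types(["bbox", "segm"], ["Box", "Mask"]))  # [('bbox', 'Box'), ('segm', 'Mask')]
```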
ultralytics/models/yolo/obb/val.py CHANGED
@@ -12,6 +12,7 @@ from ultralytics.models.yolo.detect import DetectionValidator
  from ultralytics.utils import LOGGER, ops
  from ultralytics.utils.metrics import OBBMetrics, batch_probiou
  from ultralytics.utils.nms import TorchNMS
+ from ultralytics.utils.plotting import plot_images


  class OBBValidator(DetectionValidator):
@@ -49,7 +50,7 @@ class OBBValidator(DetectionValidator):
  extends the DetectionValidator class and configures it specifically for the OBB task.

  Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to be used for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
  save_dir (str | Path, optional): Directory to save results.
  args (dict | SimpleNamespace, optional): Arguments containing validation parameters.
  _callbacks (list, optional): List of callback functions to be called during validation.
@@ -141,24 +142,34 @@ class OBBValidator(DetectionValidator):
  "im_file": batch["im_file"][si],
  }

- def plot_predictions(self, batch: dict[str, Any], preds: list[torch.Tensor], ni: int) -> None:
+ def plot_predictions(self, batch: dict[str, Any], preds: list[dict[str, torch.Tensor]], ni: int) -> None:
  """Plot predicted bounding boxes on input images and save the result.

  Args:
  batch (dict[str, Any]): Batch data containing images, file paths, and other metadata.
- preds (list[torch.Tensor]): List of prediction tensors for each image in the batch.
+ preds (list[dict[str, torch.Tensor]]): List of prediction dictionaries for each image in the batch.
  ni (int): Batch index used for naming the output file.

  Examples:
  >>> validator = OBBValidator()
  >>> batch = {"img": images, "im_file": paths}
- >>> preds = [torch.rand(10, 7)]  # Example predictions for one image
+ >>> preds = [{"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}]
  >>> validator.plot_predictions(batch, preds, 0)
  """
- for p in preds:
-     # TODO: fix this duplicated `xywh2xyxy`
-     p["bboxes"][:, :4] = ops.xywh2xyxy(p["bboxes"][:, :4])  # convert to xyxy format for plotting
- super().plot_predictions(batch, preds, ni)  # plot bboxes
+ if not preds:
+     return
+ for i, pred in enumerate(preds):
+     pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
+ keys = preds[0].keys()
+ batched_preds = {k: torch.cat([x[k] for x in preds], dim=0) for k in keys}
+ plot_images(
+     images=batch["img"],
+     labels=batched_preds,
+     paths=batch["im_file"],
+     fname=self.save_dir / f"val_batch{ni}_pred.jpg",
+     names=self.names,
+     on_plot=self.on_plot,
+ )

  def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
  """Convert YOLO predictions to COCO JSON format with rotated bounding box information.
ultralytics/models/yolo/pose/val.py CHANGED
@@ -59,7 +59,7 @@ class PoseValidator(DetectionValidator):
  specialized metrics for pose evaluation.

  Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to be used for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to be used for validation.
  save_dir (Path | str, optional): Directory to save results.
  args (dict, optional): Arguments for the validator including task set to "pose".
  _callbacks (list, optional): List of callback functions to be executed during validation.
ultralytics/models/yolo/segment/val.py CHANGED
@@ -39,7 +39,7 @@ class SegmentationValidator(DetectionValidator):
  """Initialize SegmentationValidator and set task to 'segment', metrics to SegmentMetrics.

  Args:
- dataloader (torch.utils.data.DataLoader, optional): Dataloader to use for validation.
+ dataloader (torch.utils.data.DataLoader, optional): DataLoader to use for validation.
  save_dir (Path, optional): Directory to save results.
  args (namespace, optional): Arguments for the validator.
  _callbacks (list, optional): List of callback functions.
ultralytics/nn/autobackend.py CHANGED
@@ -127,7 +127,7 @@ class AutoBackend(nn.Module):

  Methods:
  forward: Run inference on an input image.
- from_numpy: Convert numpy array to tensor.
+ from_numpy: Convert NumPy arrays to tensors on the model device.
  warmup: Warm up the model with a dummy input.
  _model_type: Determine the model type from file path.

@@ -182,7 +182,7 @@ class AutoBackend(nn.Module):
  triton,
  ) = self._model_type("" if nn_module else model)
  fp16 &= pt or jit or onnx or xml or engine or nn_module or triton  # FP16
- nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn  # BHWC formats (vs torch BCWH)
+ nhwc = coreml or saved_model or pb or tflite or edgetpu or rknn  # BHWC formats (vs torch BCHW)
  stride, ch = 32, 3  # default stride and channels
  end2end, dynamic = False, False
  metadata, task = None, None
@@ -894,14 +894,14 @@ class AutoBackend(nn.Module):
  else:
  return self.from_numpy(y)

- def from_numpy(self, x: np.ndarray) -> torch.Tensor:
-     """Convert a numpy array to a tensor.
+ def from_numpy(self, x: np.ndarray | torch.Tensor) -> torch.Tensor:
+     """Convert a NumPy array to a torch tensor on the model device.

  Args:
- x (np.ndarray): The array to be converted.
+ x (np.ndarray | torch.Tensor): Input array or tensor.

  Returns:
- (torch.Tensor): The converted tensor
+ (torch.Tensor): Tensor on `self.device`.
  """
  return torch.tensor(x).to(self.device) if isinstance(x, np.ndarray) else x

@@ -909,7 +909,7 @@ class AutoBackend(nn.Module):
  """Warm up the model by running one forward pass with a dummy input.

  Args:
- imgsz (tuple): The shape of the dummy input tensor in the format (batch_size, channels, height, width)
+ imgsz (tuple[int, int, int, int]): Dummy input shape in (batch, channels, height, width) format.
  """
  warmup_types = self.pt, self.jit, self.onnx, self.engine, self.saved_model, self.pb, self.triton, self.nn_module
  if any(warmup_types) and (self.device.type != "cpu" or self.triton):
@@ -931,8 +931,8 @@ class AutoBackend(nn.Module):
  (list[bool]): List of booleans indicating the model type.

  Examples:
- >>> model = AutoBackend(model="path/to/model.onnx")
- >>> model_type = model._model_type()  # returns "onnx"
+ >>> types = AutoBackend._model_type("path/to/model.onnx")
+ >>> assert types[2]  # onnx
  """
  from ultralytics.engine.exporter import export_formats

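Note: `from_numpy` is now documented as a passthrough for tensors and a device-moving conversion for arrays. A standalone equivalent of the one-liner above (sketch; `device` defaults to CPU here for illustration):

```python
import numpy as np
import torch


def from_numpy(x, device=torch.device("cpu")):
    """Convert NumPy arrays to tensors on `device`; pass tensors through unchanged."""
    return torch.tensor(x).to(device) if isinstance(x, np.ndarray) else x


print(from_numpy(np.zeros((1, 3), dtype=np.float32)).dtype)  # torch.float32
t = torch.ones(2)
print(from_numpy(t) is t)  # True: tensors are not copied
```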
ultralytics/nn/modules/block.py CHANGED
@@ -1812,7 +1812,7 @@ class A2C2f(nn.Module):
  """
  super().__init__()
  c_ = int(c2 * e)  # hidden channels
- assert c_ % 32 == 0, "Dimension of ABlock be a multiple of 32."
+ assert c_ % 32 == 0, "Dimension of ABlock must be a multiple of 32."

  self.cv1 = Conv(c1, c_, 1, 1)
  self.cv2 = Conv((1 + n) * c_, c2, 1)
ultralytics/nn/modules/transformer.py CHANGED
@@ -359,7 +359,15 @@ class MLP(nn.Module):
  """

  def __init__(
-     self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+     self,
+     input_dim: int,
+     hidden_dim: int,
+     output_dim: int,
+     num_layers: int,
+     act=nn.ReLU,
+     sigmoid: bool = False,
+     residual: bool = False,
+     out_norm: nn.Module = None,
  ):
  """Initialize the MLP with specified input, hidden, output dimensions and number of layers.

@@ -370,6 +378,8 @@ class MLP(nn.Module):
  num_layers (int): Number of layers.
  act (nn.Module): Activation function.
  sigmoid (bool): Whether to apply sigmoid to the output.
+ residual (bool): Whether to use residual connections.
+ out_norm (nn.Module, optional): Normalization layer for the output.
  """
  super().__init__()
  self.num_layers = num_layers
@@ -377,6 +387,12 @@ class MLP(nn.Module):
  self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
  self.sigmoid = sigmoid
  self.act = act()
+ if residual and input_dim != output_dim:
+     raise ValueError("residual is only supported if input_dim == output_dim")
+ self.residual = residual
+ # whether to apply a normalization layer to the output
+ assert isinstance(out_norm, nn.Module) or out_norm is None
+ self.out_norm = out_norm or nn.Identity()

  def forward(self, x: torch.Tensor) -> torch.Tensor:
  """Forward pass for the entire MLP.
@@ -387,8 +403,12 @@ class MLP(nn.Module):
  Returns:
  (torch.Tensor): Output tensor after MLP.
  """
+ orig_x = x
  for i, layer in enumerate(self.layers):
      x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+ if getattr(self, "residual", False):
+     x = x + orig_x
+ x = getattr(self, "out_norm", nn.Identity())(x)
  return x.sigmoid() if getattr(self, "sigmoid", False) else x

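Note: the new `residual` and `out_norm` options require `input_dim == output_dim` for the skip connection, and invalid combinations now fail at construction time rather than inside forward. A usage sketch, assuming the 8.3.239 signature shown above:

```python
import torch
import torch.nn as nn

from ultralytics.nn.modules.transformer import MLP

mlp = MLP(input_dim=256, hidden_dim=512, output_dim=256, num_layers=3, residual=True, out_norm=nn.LayerNorm(256))
print(mlp(torch.randn(4, 256)).shape)  # torch.Size([4, 256])

try:
    MLP(input_dim=256, hidden_dim=512, output_dim=128, num_layers=3, residual=True)
except ValueError as e:
    print(e)  # residual is only supported if input_dim == output_dim
```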
ultralytics/nn/tasks.py CHANGED
@@ -866,7 +866,7 @@ class WorldModel(DetectionModel):
  self.model[-1].nc = len(text)

  def get_text_pe(self, text, batch=80, cache_clip_model=True):
- """Set classes in advance so that model could do offline-inference without clip model.
+ """Get text positional embeddings for offline inference without CLIP model.

  Args:
  text (list[str]): List of class names.
@@ -987,13 +987,13 @@ class YOLOEModel(DetectionModel):

  @smart_inference_mode()
  def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
- """Set classes in advance so that model could do offline-inference without clip model.
+ """Get text positional embeddings for offline inference without CLIP model.

  Args:
  text (list[str]): List of class names.
  batch (int): Batch size for processing text tokens.
  cache_clip_model (bool): Whether to cache the CLIP model.
- without_reprta (bool): Whether to return text embeddings cooperated with reprta module.
+ without_reprta (bool): Whether to return text embeddings without reprta module processing.

  Returns:
  (torch.Tensor): Text positional embeddings.
ultralytics/nn/text_model.py CHANGED
@@ -196,12 +196,7 @@ class MobileCLIP(TextModel):
  device (torch.device): Device to load the model on.
  """
  try:
-     import warnings
-
-     # Suppress 'timm.models.layers is deprecated, please import via timm.layers' warning from mobileclip usage
-     with warnings.catch_warnings():
-         warnings.filterwarnings("ignore", category=FutureWarning)
-         import mobileclip
+     import mobileclip
  except ImportError:
      # Ultralytics fork preferred since Apple MobileCLIP repo has incorrect version of torchvision
      checks.check_requirements("git+https://github.com/ultralytics/mobileclip.git")
@@ -308,7 +303,7 @@ class MobileCLIPTS(TextModel):
  (torch.Tensor): Tokenized text inputs with shape (batch_size, sequence_length).

  Examples:
- >>> model = MobileCLIPTS("cpu")
+ >>> model = MobileCLIPTS(device=torch.device("cpu"))
  >>> tokens = model.tokenize(["a photo of a cat", "a photo of a dog"])
  >>> strict_tokens = model.tokenize(
  ...     ["a very long caption"], truncate=False
ultralytics/solutions/ai_gym.py CHANGED
@@ -13,7 +13,7 @@ class AIGym(BaseSolution):
  repetitions of exercises based on predefined angle thresholds for up and down positions.

  Attributes:
- states (dict[float, int, str]): Stores per-track angle, count, and stage for workout monitoring.
+ states (dict[int, dict[str, float | int | str]]): Per-track angle, rep count, and stage for workout monitoring.
  up_angle (float): Angle threshold for considering the 'up' position of an exercise.
  down_angle (float): Angle threshold for considering the 'down' position of an exercise.
  kpts (list[int]): Indices of keypoints used for angle calculation.
ultralytics/solutions/analytics.py CHANGED
@@ -56,7 +56,7 @@ class Analytics(BaseSolution):
  from matplotlib.backends.backend_agg import FigureCanvasAgg
  from matplotlib.figure import Figure

- self.type = self.CFG["analytics_type"]  # type of analytics i.e "line", "pie", "bar" or "area" charts.
+ self.type = self.CFG["analytics_type"]  # Chart type: "line", "pie", "bar", or "area".
  self.x_label = "Classes" if self.type in {"bar", "pie"} else "Frame#"
  self.y_label = "Total Counts"

@@ -66,10 +66,10 @@ class Analytics(BaseSolution):
  self.title = "Ultralytics Solutions"  # window name
  self.max_points = 45  # maximum points to be drawn on window
  self.fontsize = 25  # text font size for display
- figsize = self.CFG["figsize"]  # set output image size i.e (12.8, 7.2) -> w = 1280, h = 720
+ figsize = self.CFG["figsize"]  # Output size, e.g. (12.8, 7.2) -> 1280x720.
  self.color_cycle = cycle(["#DD00BA", "#042AFF", "#FF4447", "#7D24FF", "#BD00FF"])

- self.total_counts = 0  # count variable for storing total counts i.e. for line
+ self.total_counts = 0  # Stores total counts for line charts.
  self.clswise_count = {}  # dictionary for class-wise counts
  self.update_every = kwargs.get("update_every", 30)  # Only update graph every 30 frames by default
  self.last_plot_im = None  # Cache of the last rendered chart
@@ -104,7 +104,7 @@ class Analytics(BaseSolution):
  and 'classwise_count' (dict, per-class object count).

  Raises:
- ModuleNotFoundError: If an unsupported chart type is specified.
+ ValueError: If an unsupported chart type is specified.

  Examples:
  >>> analytics = Analytics(analytics_type="line")
@@ -131,9 +131,9 @@ class Analytics(BaseSolution):
  )
  plot_im = self.last_plot_im
  else:
- raise ModuleNotFoundError(f"{self.type} chart is not supported ")
+ raise ValueError(f"Unsupported analytics_type='{self.type}'. Supported types: line, bar, pie, area.")

- # return output dictionary with summary for more usage
+ # Return results for downstream use.
  return SolutionResults(plot_im=plot_im, total_tracks=len(self.track_ids), classwise_count=self.clswise_count)

  def update_graph(
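Note: the switch from `ModuleNotFoundError` to `ValueError` makes an unsupported chart type read as a configuration error rather than a missing dependency. The validation pattern in isolation (sketch, not the solution's exact code):

```python
SUPPORTED_CHARTS = {"line", "bar", "pie", "area"}


def check_chart_type(analytics_type: str) -> str:
    """Raise ValueError for chart types the Analytics solution does not support."""
    if analytics_type not in SUPPORTED_CHARTS:
        raise ValueError(f"Unsupported analytics_type='{analytics_type}'. Supported: {sorted(SUPPORTED_CHARTS)}")
    return analytics_type


print(check_chart_type("line"))  # line
```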
ultralytics/solutions/config.py CHANGED
@@ -35,7 +35,7 @@ class SolutionConfig:
  vision_point (tuple[int, int]): Reference point for directional tracking or perspective drawing.
  crop_dir (str): Directory path to save cropped detection images.
  json_file (str): Path to a JSON file containing data for parking areas.
- line_width (int): Width for visual display i.e. bounding boxes, keypoints, counts.
+ line_width (int): Width for visual display, e.g. bounding boxes, keypoints, and counts.
  records (int): Number of detection records to send email alerts.
  fps (float): Frame rate (Frames Per Second) for speed estimation calculation.
  max_hist (int): Maximum number of historical points or states stored per tracked object for speed estimation.
ultralytics/solutions/distance_calculation.py CHANGED
@@ -17,7 +17,7 @@ class DistanceCalculation(BaseSolution):

  Attributes:
  left_mouse_count (int): Counter for left mouse button clicks.
- selected_boxes (dict[int, list[float]]): Dictionary to store selected bounding boxes and their track IDs.
+ selected_boxes (dict[int, Any]): Dictionary to store selected bounding boxes keyed by track ID.
  centroids (list[list[int]]): List to store centroids of selected bounding boxes.

  Methods:
ultralytics/solutions/object_counter.py CHANGED
@@ -19,7 +19,7 @@ class ObjectCounter(BaseSolution):
  in_count (int): Counter for objects moving inward.
  out_count (int): Counter for objects moving outward.
  counted_ids (list[int]): List of IDs of objects that have been counted.
- classwise_counts (dict[str, dict[str, int]]): Dictionary for counts, categorized by object class.
+ classwise_count (dict[str, dict[str, int]]): Dictionary for counts, categorized by object class.
  region_initialized (bool): Flag indicating whether the counting region has been initialized.
  show_in (bool): Flag to control display of inward count.
  show_out (bool): Flag to control display of outward count.
ultralytics/solutions/object_cropper.py CHANGED
@@ -1,6 +1,5 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- import os
  from pathlib import Path
  from typing import Any

@@ -40,12 +39,10 @@ class ObjectCropper(BaseSolution):
  super().__init__(**kwargs)

  self.crop_dir = self.CFG["crop_dir"]  # Directory for storing cropped detections
- if not os.path.exists(self.crop_dir):
-     os.mkdir(self.crop_dir)  # Create directory if it does not exist
+ Path(self.crop_dir).mkdir(parents=True, exist_ok=True)
  if self.CFG["show"]:
-     self.LOGGER.warning(
-         f"show=True disabled for crop solution, results will be saved in the directory named: {self.crop_dir}"
-     )
+     self.LOGGER.warning(f"show=True is not supported for ObjectCropper; saving crops to '{self.crop_dir}'.")
+     self.CFG["show"] = False
  self.crop_idx = 0  # Initialize counter for total cropped objects
  self.iou = self.CFG["iou"]
  self.conf = self.CFG["conf"]
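Note: the `os.mkdir` to `Path.mkdir` change above trades two failure modes for idempotent behavior: `os.mkdir` raises if parent directories are missing or if the directory already exists, while `Path.mkdir(parents=True, exist_ok=True)` creates the whole tree and is safe to call repeatedly.

```python
from pathlib import Path

crop_dir = Path("runs/solutions/crops")  # illustrative path
crop_dir.mkdir(parents=True, exist_ok=True)  # creates missing parents
crop_dir.mkdir(parents=True, exist_ok=True)  # no-op on the second call
print(crop_dir.is_dir())  # True
```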