dgenerate-ultralytics-headless 8.3.236__py3-none-any.whl → 8.3.237__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/RECORD +38 -25
  3. ultralytics/__init__.py +1 -1
  4. ultralytics/engine/exporter.py +17 -10
  5. ultralytics/engine/predictor.py +3 -2
  6. ultralytics/engine/trainer.py +8 -0
  7. ultralytics/models/rtdetr/val.py +5 -1
  8. ultralytics/models/sam/__init__.py +14 -1
  9. ultralytics/models/sam/build.py +17 -8
  10. ultralytics/models/sam/build_sam3.py +374 -0
  11. ultralytics/models/sam/model.py +12 -4
  12. ultralytics/models/sam/modules/blocks.py +20 -8
  13. ultralytics/models/sam/modules/decoders.py +2 -3
  14. ultralytics/models/sam/modules/encoders.py +4 -1
  15. ultralytics/models/sam/modules/memory_attention.py +6 -2
  16. ultralytics/models/sam/modules/sam.py +150 -6
  17. ultralytics/models/sam/modules/utils.py +134 -4
  18. ultralytics/models/sam/predict.py +2076 -118
  19. ultralytics/models/sam/sam3/__init__.py +3 -0
  20. ultralytics/models/sam/sam3/decoder.py +546 -0
  21. ultralytics/models/sam/sam3/encoder.py +535 -0
  22. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  23. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  24. ultralytics/models/sam/sam3/model_misc.py +198 -0
  25. ultralytics/models/sam/sam3/necks.py +129 -0
  26. ultralytics/models/sam/sam3/sam3_image.py +357 -0
  27. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  28. ultralytics/models/sam/sam3/tokenizer_ve.py +242 -0
  29. ultralytics/models/sam/sam3/vitdet.py +546 -0
  30. ultralytics/models/sam/sam3/vl_combiner.py +165 -0
  31. ultralytics/models/yolo/obb/val.py +18 -7
  32. ultralytics/nn/modules/transformer.py +21 -1
  33. ultralytics/utils/checks.py +2 -2
  34. ultralytics/utils/ops.py +1 -3
  35. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/WHEEL +0 -0
  36. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/entry_points.txt +0 -0
  37. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/licenses/LICENSE +0 -0
  38. {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/sam3/vl_combiner.py ADDED
@@ -0,0 +1,165 @@
+ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+ """Provides utility to combine a vision backbone with a language backbone."""
+
+ from __future__ import annotations
+
+ from copy import copy
+
+ import torch
+ import torch.nn as nn
+ from torch.nn.attention import SDPBackend, sdpa_kernel
+
+ from .necks import Sam3DualViTDetNeck
+
+
+ class SAM3VLBackbone(nn.Module):
+     """This backbone combines a vision backbone and a language backbone without fusion. As such it is more of a
+     convenience wrapper to handle the two backbones together.
+
+     It adds support for activation checkpointing and compilation.
+     """
+
+     def __init__(
+         self,
+         visual: Sam3DualViTDetNeck,
+         text,
+         compile_visual: bool = False,
+         act_ckpt_whole_vision_backbone: bool = False,
+         act_ckpt_whole_language_backbone: bool = False,
+         scalp=0,
+     ):
+         """Initialize the backbone combiner.
+
+         :param visual: The vision backbone to use
+         :param text: The text encoder to use
+         """
+         super().__init__()
+         self.vision_backbone: Sam3DualViTDetNeck = torch.compile(visual) if compile_visual else visual
+         self.language_backbone = text
+         self.scalp = scalp
+         # allow running activation checkpointing on the entire vision and language backbones
+         self.act_ckpt_whole_vision_backbone = act_ckpt_whole_vision_backbone
+         self.act_ckpt_whole_language_backbone = act_ckpt_whole_language_backbone
+
+     def forward(
+         self,
+         samples: torch.Tensor,
+         captions: list[str],
+         input_boxes: torch.Tensor = None,
+         additional_text: list[str] | None = None,
+     ):
+         """Forward pass of the backbone combiner.
+
+         :param samples: The input images
+         :param captions: The input captions
+         :param input_boxes: If the text contains place-holders for boxes, this
+             parameter contains the tensor containing their spatial features
+         :param additional_text: This can be used to encode some additional text
+             (different from the captions) in the same forward of the backbone
+         :return: Output dictionary with the following keys:
+             - vision_features: The output of the vision backbone
+             - language_features: The output of the language backbone
+             - language_mask: The attention mask of the language backbone
+             - vision_pos_enc: The positional encoding of the vision backbone
+             - (optional) additional_text_features: The output of the language
+               backbone for the additional text
+             - (optional) additional_text_mask: The attention mask of the
+               language backbone for the additional text
+         """
+         output = self.forward_image(samples)
+         output.update(self.forward_text(captions, input_boxes, additional_text))
+         return output
+
+     def forward_image(self, samples: torch.Tensor):
+         """Forward pass of the vision backbone and get both SAM3 and SAM2 features."""
+         # Forward through backbone
+         sam3_features, sam3_pos, sam2_features, sam2_pos = self.vision_backbone.forward(samples)
+         if self.scalp > 0:
+             # Discard the lowest resolution features
+             sam3_features, sam3_pos = (
+                 sam3_features[: -self.scalp],
+                 sam3_pos[: -self.scalp],
+             )
+             if sam2_features is not None and sam2_pos is not None:
+                 sam2_features, sam2_pos = (
+                     sam2_features[: -self.scalp],
+                     sam2_pos[: -self.scalp],
+                 )
+
+         sam2_output = None
+
+         if sam2_features is not None and sam2_pos is not None:
+             sam2_src = sam2_features[-1]
+             sam2_output = {
+                 "vision_features": sam2_src,
+                 "vision_pos_enc": sam2_pos,
+                 "backbone_fpn": sam2_features,
+             }
+
+         sam3_src = sam3_features[-1]
+         return {
+             "vision_features": sam3_src,
+             "vision_pos_enc": sam3_pos,
+             "backbone_fpn": sam3_features,
+             "sam2_backbone_out": sam2_output,
+         }
+
+     def forward_image_sam2(self, samples: torch.Tensor):
+         """Forward pass of the vision backbone to get SAM2 features only."""
+         xs = self.vision_backbone.trunk(samples)
+         sam2_features, sam2_pos = [], []
+         x = xs[-1]  # simpleFPN
+
+         assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
+         for i in range(len(self.vision_backbone.sam2_convs)):
+             sam2_x_out = self.vision_backbone.sam2_convs[i](x)
+             sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
+             sam2_features.append(sam2_x_out)
+             sam2_pos.append(sam2_pos_out)
+
+         if self.scalp > 0:
+             # Discard the lowest resolution features
+             sam2_features, sam2_pos = (
+                 sam2_features[: -self.scalp],
+                 sam2_pos[: -self.scalp],
+             )
+
+         return {
+             "vision_features": sam2_features[-1],
+             "vision_pos_enc": sam2_pos,
+             "backbone_fpn": sam2_features,
+         }
+
+     def forward_text(self, captions, input_boxes=None, additional_text=None):
+         """Forward pass of the text encoder."""
+         output = {}
+
+         # Forward through text_encoder
+         text_to_encode = copy(captions)
+         if additional_text is not None:
+             # if there are additional_text, we piggy-back them into this forward.
+             # They'll be used later for output alignment
+             text_to_encode += additional_text
+
+         with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]):
+             text_attention_mask, text_memory, text_embeds = self.language_backbone(text_to_encode, input_boxes)
+
+         if additional_text is not None:
+             output["additional_text_features"] = text_memory[:, -len(additional_text) :]
+             output["additional_text_mask"] = text_attention_mask[-len(additional_text) :]
+
+         text_memory = text_memory[:, : len(captions)]
+         text_attention_mask = text_attention_mask[: len(captions)]
+         text_embeds = text_embeds[:, : len(captions)]
+         output["language_features"] = text_memory
+         output["language_mask"] = text_attention_mask
+         output["language_embeds"] = text_embeds  # Text embeddings before forward to the encoder
+
+         return output
+
+     def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
+         """Set the image size for the vision backbone."""
+         self.vision_backbone.set_imgsz(imgsz)
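For orientation, the sketch below exercises the new SAM3VLBackbone end to end. It is a minimal sketch, not package code: StubVision and StubText are hypothetical stand-ins for Sam3DualViTDetNeck and the text encoder, shaped only to satisfy the interfaces visible in this diff (the vision backbone returns SAM3/SAM2 feature and positional-encoding lists; the text encoder returns an attention mask, a sequence-first memory, and token embeddings). It assumes a PyTorch recent enough to provide torch.nn.attention (>= 2.3).

import torch
import torch.nn as nn

from ultralytics.models.sam.sam3.vl_combiner import SAM3VLBackbone

class StubVision(nn.Module):
    """Hypothetical stand-in for Sam3DualViTDetNeck: FPN features plus positional encodings."""

    def forward(self, x):
        b = x.shape[0]
        feats = [torch.randn(b, 256, s, s) for s in (72, 36, 18)]
        return feats, [torch.randn_like(f) for f in feats], None, None  # no SAM2 branch in this stub

class StubText(nn.Module):
    """Hypothetical stand-in text encoder returning (attention_mask, memory, embeds)."""

    def forward(self, texts, input_boxes=None):
        n, seq, dim = len(texts), 16, 256
        # memory/embeds are sequence-first, matching the [:, : len(captions)] slicing in forward_text
        return torch.ones(n, seq, dtype=torch.bool), torch.randn(seq, n, dim), torch.randn(seq, n, dim)

backbone = SAM3VLBackbone(visual=StubVision(), text=StubText())
out = backbone(torch.randn(2, 3, 1008, 1008), captions=["a red car", "two dogs"])
print(out["vision_features"].shape)    # lowest-resolution SAM3 FPN level, here (2, 256, 18, 18)
print(out["language_features"].shape)  # (seq_len, num_captions, dim) = (16, 2, 256)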
ultralytics/models/yolo/obb/val.py CHANGED
@@ -12,6 +12,7 @@ from ultralytics.models.yolo.detect import DetectionValidator
  from ultralytics.utils import LOGGER, ops
  from ultralytics.utils.metrics import OBBMetrics, batch_probiou
  from ultralytics.utils.nms import TorchNMS
+ from ultralytics.utils.plotting import plot_images
 
 
  class OBBValidator(DetectionValidator):
@@ -141,24 +142,34 @@ class OBBValidator(DetectionValidator):
              "im_file": batch["im_file"][si],
          }
 
-     def plot_predictions(self, batch: dict[str, Any], preds: list[torch.Tensor], ni: int) -> None:
+     def plot_predictions(self, batch: dict[str, Any], preds: list[dict[str, torch.Tensor]], ni: int) -> None:
          """Plot predicted bounding boxes on input images and save the result.
 
          Args:
              batch (dict[str, Any]): Batch data containing images, file paths, and other metadata.
-             preds (list[torch.Tensor]): List of prediction tensors for each image in the batch.
+             preds (list[dict[str, torch.Tensor]]): List of prediction dictionaries for each image in the batch.
              ni (int): Batch index used for naming the output file.
 
          Examples:
              >>> validator = OBBValidator()
              >>> batch = {"img": images, "im_file": paths}
-             >>> preds = [torch.rand(10, 7)]  # Example predictions for one image
+             >>> preds = [{"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}]
              >>> validator.plot_predictions(batch, preds, 0)
          """
-         for p in preds:
-             # TODO: fix this duplicated `xywh2xyxy`
-             p["bboxes"][:, :4] = ops.xywh2xyxy(p["bboxes"][:, :4])  # convert to xyxy format for plotting
-         super().plot_predictions(batch, preds, ni)  # plot bboxes
+         if not preds:
+             return
+         for i, pred in enumerate(preds):
+             pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
+         keys = preds[0].keys()
+         batched_preds = {k: torch.cat([x[k] for x in preds], dim=0) for k in keys}
+         plot_images(
+             images=batch["img"],
+             labels=batched_preds,
+             paths=batch["im_file"],
+             fname=self.save_dir / f"val_batch{ni}_pred.jpg",
+             names=self.names,
+             on_plot=self.on_plot,
+         )
 
      def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
          """Convert YOLO predictions to COCO JSON format with rotated bounding box information.
ultralytics/nn/modules/transformer.py CHANGED
@@ -359,7 +359,15 @@ class MLP(nn.Module):
      """
 
      def __init__(
-         self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+         self,
+         input_dim: int,
+         hidden_dim: int,
+         output_dim: int,
+         num_layers: int,
+         act=nn.ReLU,
+         sigmoid: bool = False,
+         residual: bool = False,
+         out_norm: nn.Module = None,
      ):
          """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
@@ -370,6 +378,8 @@ class MLP(nn.Module):
              num_layers (int): Number of layers.
              act (nn.Module): Activation function.
              sigmoid (bool): Whether to apply sigmoid to the output.
+             residual (bool): Whether to use residual connections.
+             out_norm (nn.Module, optional): Normalization layer for the output.
          """
          super().__init__()
          self.num_layers = num_layers
@@ -377,6 +387,12 @@ class MLP(nn.Module):
          self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
          self.sigmoid = sigmoid
          self.act = act()
+         if residual and input_dim != output_dim:
+             raise ValueError("residual is only supported if input_dim == output_dim")
+         self.residual = residual
+         # whether to apply a normalization layer to the output
+         assert isinstance(out_norm, nn.Module) or out_norm is None
+         self.out_norm = out_norm or nn.Identity()
 
      def forward(self, x: torch.Tensor) -> torch.Tensor:
          """Forward pass for the entire MLP.
@@ -387,8 +403,12 @@ class MLP(nn.Module):
          Returns:
              (torch.Tensor): Output tensor after MLP.
          """
+         orig_x = x
          for i, layer in enumerate(self.layers):
              x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+         if getattr(self, "residual", False):
+             x = x + orig_x
+         x = getattr(self, "out_norm", nn.Identity())(x)
          return x.sigmoid() if getattr(self, "sigmoid", False) else x
 
 
ultralytics/utils/checks.py CHANGED
@@ -379,8 +379,8 @@ def check_apt_requirements(requirements):
              f"{prefix} Ultralytics requirement{'s' * (len(missing_packages) > 1)} {missing_packages} not found, attempting AutoUpdate..."
          )
          # Optionally update package list first
-         if is_sudo_available():
-             subprocess.run(["sudo", "apt", "update"], check=False)
+         cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "update"]
+         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
 
          # Build and run the install command
          cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "install", "-y"] + missing_packages
ultralytics/utils/ops.py CHANGED
@@ -660,6 +660,4 @@ def clean_str(s):
 
  def empty_like(x):
      """Create empty torch.Tensor or np.ndarray with same shape as input and float32 dtype."""
-     return (
-         torch.empty_like(x, dtype=torch.float32) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=np.float32)
-     )
+     return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)
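The net effect: empty_like now preserves the input's dtype instead of always promoting to float32. A quick demonstration, with the one-liner copied out for illustration:

import numpy as np
import torch

def empty_like(x):  # behavior of ultralytics.utils.ops.empty_like after this change
    """Create an empty tensor/array with the same shape and dtype as the input."""
    return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)

print(empty_like(torch.zeros(2, 3, dtype=torch.float16)).dtype)  # torch.float16
print(empty_like(np.zeros((2, 3), dtype=np.int32)).dtype)        # int32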