dgenerate-ultralytics-headless 8.3.235__py3-none-any.whl → 8.3.237__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/RECORD +41 -28
  3. tests/test_exports.py +15 -1
  4. ultralytics/__init__.py +1 -1
  5. ultralytics/engine/exporter.py +113 -12
  6. ultralytics/engine/predictor.py +3 -2
  7. ultralytics/engine/trainer.py +8 -0
  8. ultralytics/models/rtdetr/val.py +5 -1
  9. ultralytics/models/sam/__init__.py +14 -1
  10. ultralytics/models/sam/build.py +17 -8
  11. ultralytics/models/sam/build_sam3.py +374 -0
  12. ultralytics/models/sam/model.py +12 -4
  13. ultralytics/models/sam/modules/blocks.py +20 -8
  14. ultralytics/models/sam/modules/decoders.py +2 -3
  15. ultralytics/models/sam/modules/encoders.py +4 -1
  16. ultralytics/models/sam/modules/memory_attention.py +6 -2
  17. ultralytics/models/sam/modules/sam.py +150 -6
  18. ultralytics/models/sam/modules/utils.py +134 -4
  19. ultralytics/models/sam/predict.py +2076 -118
  20. ultralytics/models/sam/sam3/__init__.py +3 -0
  21. ultralytics/models/sam/sam3/decoder.py +546 -0
  22. ultralytics/models/sam/sam3/encoder.py +535 -0
  23. ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
  24. ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
  25. ultralytics/models/sam/sam3/model_misc.py +198 -0
  26. ultralytics/models/sam/sam3/necks.py +129 -0
  27. ultralytics/models/sam/sam3/sam3_image.py +357 -0
  28. ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
  29. ultralytics/models/sam/sam3/tokenizer_ve.py +242 -0
  30. ultralytics/models/sam/sam3/vitdet.py +546 -0
  31. ultralytics/models/sam/sam3/vl_combiner.py +165 -0
  32. ultralytics/models/yolo/obb/val.py +18 -7
  33. ultralytics/nn/autobackend.py +35 -0
  34. ultralytics/nn/modules/transformer.py +21 -1
  35. ultralytics/utils/checks.py +41 -0
  36. ultralytics/utils/ops.py +1 -3
  37. ultralytics/utils/torch_utils.py +1 -0
  38. {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/WHEEL +0 -0
  39. {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/entry_points.txt +0 -0
  40. {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/licenses/LICENSE +0 -0
  41. {dgenerate_ultralytics_headless-8.3.235.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/sam3/vl_combiner.py ADDED
@@ -0,0 +1,165 @@
+ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+ """Provides utility to combine a vision backbone with a language backbone."""
+
+ from __future__ import annotations
+
+ from copy import copy
+
+ import torch
+ import torch.nn as nn
+ from torch.nn.attention import SDPBackend, sdpa_kernel
+
+ from .necks import Sam3DualViTDetNeck
+
+
+ class SAM3VLBackbone(nn.Module):
+     """This backbone combines a vision backbone and a language backbone without fusion. As such it is more of a
+     convenience wrapper to handle the two backbones together.
+
+     It adds support for activation checkpointing and compilation.
+     """
+
+     def __init__(
+         self,
+         visual: Sam3DualViTDetNeck,
+         text,
+         compile_visual: bool = False,
+         act_ckpt_whole_vision_backbone: bool = False,
+         act_ckpt_whole_language_backbone: bool = False,
+         scalp=0,
+     ):
+         """Initialize the backbone combiner.
+
+         :param visual: The vision backbone to use
+         :param text: The text encoder to use
+         """
+         super().__init__()
+         self.vision_backbone: Sam3DualViTDetNeck = torch.compile(visual) if compile_visual else visual
+         self.language_backbone = text
+         self.scalp = scalp
+         # allow running activation checkpointing on the entire vision and language backbones
+         self.act_ckpt_whole_vision_backbone = act_ckpt_whole_vision_backbone
+         self.act_ckpt_whole_language_backbone = act_ckpt_whole_language_backbone
+
+     def forward(
+         self,
+         samples: torch.Tensor,
+         captions: list[str],
+         input_boxes: torch.Tensor = None,
+         additional_text: list[str] | None = None,
+     ):
+         """Forward pass of the backbone combiner.
+
+         :param samples: The input images
+         :param captions: The input captions
+         :param input_boxes: If the text contains place-holders for boxes, this
+             parameter contains the tensor containing their spatial features
+         :param additional_text: This can be used to encode some additional text
+             (different from the captions) in the same forward of the backbone
+         :return: Output dictionary with the following keys:
+             - vision_features: The output of the vision backbone
+             - language_features: The output of the language backbone
+             - language_mask: The attention mask of the language backbone
+             - vision_pos_enc: The positional encoding of the vision backbone
+             - (optional) additional_text_features: The output of the language
+               backbone for the additional text
+             - (optional) additional_text_mask: The attention mask of the
+               language backbone for the additional text
+         """
+         output = self.forward_image(samples)
+         output.update(self.forward_text(captions, input_boxes, additional_text))
+         return output
+
+     def forward_image(self, samples: torch.Tensor):
+         """Forward pass of the vision backbone and get both SAM3 and SAM2 features."""
+         # Forward through backbone
+         sam3_features, sam3_pos, sam2_features, sam2_pos = self.vision_backbone.forward(samples)
+         if self.scalp > 0:
+             # Discard the lowest resolution features
+             sam3_features, sam3_pos = (
+                 sam3_features[: -self.scalp],
+                 sam3_pos[: -self.scalp],
+             )
+             if sam2_features is not None and sam2_pos is not None:
+                 sam2_features, sam2_pos = (
+                     sam2_features[: -self.scalp],
+                     sam2_pos[: -self.scalp],
+                 )
+
+         sam2_output = None
+
+         if sam2_features is not None and sam2_pos is not None:
+             sam2_src = sam2_features[-1]
+             sam2_output = {
+                 "vision_features": sam2_src,
+                 "vision_pos_enc": sam2_pos,
+                 "backbone_fpn": sam2_features,
+             }
+
+         sam3_src = sam3_features[-1]
+         return {
+             "vision_features": sam3_src,
+             "vision_pos_enc": sam3_pos,
+             "backbone_fpn": sam3_features,
+             "sam2_backbone_out": sam2_output,
+         }
+
+     def forward_image_sam2(self, samples: torch.Tensor):
+         """Forward pass of the vision backbone to get SAM2 features only."""
+         xs = self.vision_backbone.trunk(samples)
+         sam2_features, sam2_pos = [], []
+         x = xs[-1]  # simpleFPN
+
+         assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
+         for i in range(len(self.vision_backbone.sam2_convs)):
+             sam2_x_out = self.vision_backbone.sam2_convs[i](x)
+             sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
+             sam2_features.append(sam2_x_out)
+             sam2_pos.append(sam2_pos_out)
+
+         if self.scalp > 0:
+             # Discard the lowest resolution features
+             sam2_features, sam2_pos = (
+                 sam2_features[: -self.scalp],
+                 sam2_pos[: -self.scalp],
+             )
+
+         return {
+             "vision_features": sam2_features[-1],
+             "vision_pos_enc": sam2_pos,
+             "backbone_fpn": sam2_features,
+         }
+
+     def forward_text(self, captions, input_boxes=None, additional_text=None):
+         """Forward pass of the text encoder."""
+         output = {}
+
+         # Forward through text_encoder
+         text_to_encode = copy(captions)
+         if additional_text is not None:
+             # if there are additional_text, we piggy-back them into this forward.
+             # They'll be used later for output alignment
+             text_to_encode += additional_text
+
+         with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]):
+             text_attention_mask, text_memory, text_embeds = self.language_backbone(text_to_encode, input_boxes)
+
+         if additional_text is not None:
+             output["additional_text_features"] = text_memory[:, -len(additional_text) :]
+             output["additional_text_mask"] = text_attention_mask[-len(additional_text) :]
+
+         text_memory = text_memory[:, : len(captions)]
+         text_attention_mask = text_attention_mask[: len(captions)]
+         text_embeds = text_embeds[:, : len(captions)]
+         output["language_features"] = text_memory
+         output["language_mask"] = text_attention_mask
+         output["language_embeds"] = text_embeds  # Text embeddings before forward to the encoder
+
+         return output
+
+     def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
+         """Set the image size for the vision backbone."""
+         self.vision_backbone.set_imgsz(imgsz)
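A minimal sketch of how the new SAM3VLBackbone.forward_text() splits the piggy-backed additional_text back out of a single encoder pass. The stand-in text encoder below is hypothetical: its (seq, batch, dim) layout and sizes are assumptions inferred from the slicing in this file, not a documented contract of the real SAM3 language backbone.

import torch
import torch.nn as nn

from ultralytics.models.sam.sam3.vl_combiner import SAM3VLBackbone


class DummyTextEncoder(nn.Module):
    """Stand-in language backbone returning (mask, memory, embeds) shaped like the unpacking above."""

    def forward(self, texts, input_boxes=None):
        n, seq_len, dim = len(texts), 8, 16  # illustrative sizes only
        mask = torch.ones(n, seq_len, dtype=torch.bool)  # indexed [batch] in forward_text
        memory = torch.zeros(seq_len, n, dim)  # indexed [:, batch] in forward_text
        embeds = torch.zeros(seq_len, n, dim)
        return mask, memory, embeds


# visual is unused by forward_text, so an Identity placeholder is enough for this sketch
backbone = SAM3VLBackbone(visual=nn.Identity(), text=DummyTextEncoder())
out = backbone.forward_text(["a red car"], additional_text=["a stop sign"])
print(out["language_features"].shape)  # torch.Size([8, 1, 16]) -> caption features only
print(out["additional_text_features"].shape)  # torch.Size([8, 1, 16]) -> extra text split back out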
ultralytics/models/yolo/obb/val.py CHANGED
@@ -12,6 +12,7 @@ from ultralytics.models.yolo.detect import DetectionValidator
  from ultralytics.utils import LOGGER, ops
  from ultralytics.utils.metrics import OBBMetrics, batch_probiou
  from ultralytics.utils.nms import TorchNMS
+ from ultralytics.utils.plotting import plot_images
 
 
  class OBBValidator(DetectionValidator):
@@ -141,24 +142,34 @@ class OBBValidator(DetectionValidator):
              "im_file": batch["im_file"][si],
          }
 
-     def plot_predictions(self, batch: dict[str, Any], preds: list[torch.Tensor], ni: int) -> None:
+     def plot_predictions(self, batch: dict[str, Any], preds: list[dict[str, torch.Tensor]], ni: int) -> None:
          """Plot predicted bounding boxes on input images and save the result.
 
          Args:
              batch (dict[str, Any]): Batch data containing images, file paths, and other metadata.
-             preds (list[torch.Tensor]): List of prediction tensors for each image in the batch.
+             preds (list[dict[str, torch.Tensor]]): List of prediction dictionaries for each image in the batch.
              ni (int): Batch index used for naming the output file.
 
          Examples:
              >>> validator = OBBValidator()
              >>> batch = {"img": images, "im_file": paths}
-             >>> preds = [torch.rand(10, 7)]  # Example predictions for one image
+             >>> preds = [{"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}]
              >>> validator.plot_predictions(batch, preds, 0)
          """
-         for p in preds:
-             # TODO: fix this duplicated `xywh2xyxy`
-             p["bboxes"][:, :4] = ops.xywh2xyxy(p["bboxes"][:, :4])  # convert to xyxy format for plotting
-         super().plot_predictions(batch, preds, ni)  # plot bboxes
+         if not preds:
+             return
+         for i, pred in enumerate(preds):
+             pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
+         keys = preds[0].keys()
+         batched_preds = {k: torch.cat([x[k] for x in preds], dim=0) for k in keys}
+         plot_images(
+             images=batch["img"],
+             labels=batched_preds,
+             paths=batch["im_file"],
+             fname=self.save_dir / f"val_batch{ni}_pred.jpg",
+             names=self.names,
+             on_plot=self.on_plot,
+         )
 
      def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
          """Convert YOLO predictions to COCO JSON format with rotated bounding box information.
ultralytics/nn/autobackend.py CHANGED
@@ -95,6 +95,7 @@ class AutoBackend(nn.Module):
              | RKNN | *_rknn_model/ |
              | Triton Inference | triton://model |
              | ExecuTorch | *.pte |
+             | Axelera | *_axelera_model/ |
 
      Attributes:
          model (torch.nn.Module): The loaded YOLO model.
@@ -122,6 +123,7 @@ class AutoBackend(nn.Module):
          rknn (bool): Whether the model is an RKNN model.
          triton (bool): Whether the model is a Triton Inference Server model.
          pte (bool): Whether the model is a PyTorch ExecuTorch model.
+         axelera (bool): Whether the model is an Axelera model.
 
      Methods:
          forward: Run inference on an input image.
@@ -176,6 +178,7 @@ class AutoBackend(nn.Module):
              imx,
              rknn,
              pte,
+             axelera,
              triton,
          ) = self._model_type("" if nn_module else model)
          fp16 &= pt or jit or onnx or xml or engine or nn_module or triton  # FP16
@@ -570,6 +573,33 @@
              rknn_model.init_runtime()
              metadata = w.parent / "metadata.yaml"
 
+         # Axelera
+         elif axelera:
+             import os
+
+             if not os.environ.get("AXELERA_RUNTIME_DIR"):
+                 LOGGER.warning(
+                     "Axelera runtime environment is not activated."
+                     "\nPlease run: source /opt/axelera/sdk/latest/axelera_activate.sh"
+                     "\n\nIf this fails, verify driver installation: https://docs.ultralytics.com/integrations/axelera/#axelera-driver-installation"
+                 )
+             try:
+                 from axelera.runtime import op
+             except ImportError:
+                 check_requirements(
+                     "axelera_runtime2==0.1.2",
+                     cmds="--extra-index-url https://software.axelera.ai/artifactory/axelera-runtime-pypi",
+                 )
+                 from axelera.runtime import op
+
+             w = Path(w)
+             if (found := next(w.rglob("*.axm"), None)) is None:
+                 raise FileNotFoundError(f"No .axm file found in: {w}")
+             w = found
+
+             ax_model = op.load(str(w))
+             metadata = w.parent / "metadata.yaml"
+
          # ExecuTorch
          elif pte:
              LOGGER.info(f"Loading {w} for ExecuTorch inference...")
@@ -796,6 +826,11 @@
              im = im if isinstance(im, (list, tuple)) else [im]
              y = self.rknn_model.inference(inputs=im)
 
+         # Axelera
+         elif self.axelera:
+             im = im.cpu()
+             y = self.ax_model(im)
+
          # ExecuTorch
          elif self.pte:
              y = self.model.execute([im])
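For context, a hedged sketch of how an exported Axelera model directory (the *_axelera_model/ suffix registered above) would be consumed through the standard Ultralytics API; the path is hypothetical, and inference requires the Axelera runtime and hardware to be available.

from ultralytics import YOLO

# Hypothetical path to a previously exported model; AutoBackend locates the .axm
# file inside the directory and loads it with axelera.runtime as shown above.
model = YOLO("yolo11n_axelera_model/")
results = model("https://ultralytics.com/images/bus.jpg")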
ultralytics/nn/modules/transformer.py CHANGED
@@ -359,7 +359,15 @@ class MLP(nn.Module):
      """
 
      def __init__(
-         self, input_dim: int, hidden_dim: int, output_dim: int, num_layers: int, act=nn.ReLU, sigmoid: bool = False
+         self,
+         input_dim: int,
+         hidden_dim: int,
+         output_dim: int,
+         num_layers: int,
+         act=nn.ReLU,
+         sigmoid: bool = False,
+         residual: bool = False,
+         out_norm: nn.Module = None,
      ):
          """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
@@ -370,6 +378,8 @@ class MLP(nn.Module):
              num_layers (int): Number of layers.
              act (nn.Module): Activation function.
              sigmoid (bool): Whether to apply sigmoid to the output.
+             residual (bool): Whether to use residual connections.
+             out_norm (nn.Module, optional): Normalization layer for the output.
          """
          super().__init__()
          self.num_layers = num_layers
@@ -377,6 +387,12 @@ class MLP(nn.Module):
          self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
          self.sigmoid = sigmoid
          self.act = act()
+         if residual and input_dim != output_dim:
+             raise ValueError("residual is only supported if input_dim == output_dim")
+         self.residual = residual
+         # whether to apply a normalization layer to the output
+         assert isinstance(out_norm, nn.Module) or out_norm is None
+         self.out_norm = out_norm or nn.Identity()
 
      def forward(self, x: torch.Tensor) -> torch.Tensor:
          """Forward pass for the entire MLP.
@@ -387,8 +403,12 @@ class MLP(nn.Module):
          Returns:
              (torch.Tensor): Output tensor after MLP.
          """
+         orig_x = x
          for i, layer in enumerate(self.layers):
              x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+         if getattr(self, "residual", False):
+             x = x + orig_x
+         x = getattr(self, "out_norm", nn.Identity())(x)
          return x.sigmoid() if getattr(self, "sigmoid", False) else x
 
 
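A short usage sketch of the MLP options introduced above; the dimensions are arbitrary, and residual=True is only accepted when input_dim == output_dim (enforced in __init__).

import torch
import torch.nn as nn

from ultralytics.nn.modules.transformer import MLP

# Residual MLP with an output LayerNorm: out = LayerNorm(mlp(x) + x)
mlp = MLP(input_dim=256, hidden_dim=512, output_dim=256, num_layers=3, residual=True, out_norm=nn.LayerNorm(256))
x = torch.randn(4, 256)
print(mlp(x).shape)  # torch.Size([4, 256])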
ultralytics/utils/checks.py CHANGED
@@ -350,6 +350,46 @@ def check_python(minimum: str = "3.8.0", hard: bool = True, verbose: bool = Fals
      return check_version(PYTHON_VERSION, minimum, name="Python", hard=hard, verbose=verbose)
 
 
+ @TryExcept()
+ def check_apt_requirements(requirements):
+     """Check if apt packages are installed and install missing ones.
+
+     Args:
+         requirements: List of apt package names to check and install
+     """
+     prefix = colorstr("red", "bold", "apt requirements:")
+     # Check which packages are missing
+     missing_packages = []
+     for package in requirements:
+         try:
+             # Use dpkg -l to check if package is installed
+             result = subprocess.run(["dpkg", "-l", package], capture_output=True, text=True, check=False)
+             # Check if package is installed (look for "ii" status)
+             if result.returncode != 0 or not any(
+                 line.startswith("ii") and package in line for line in result.stdout.splitlines()
+             ):
+                 missing_packages.append(package)
+         except Exception:
+             # If check fails, assume package is not installed
+             missing_packages.append(package)
+
+     # Install missing packages if any
+     if missing_packages:
+         LOGGER.info(
+             f"{prefix} Ultralytics requirement{'s' * (len(missing_packages) > 1)} {missing_packages} not found, attempting AutoUpdate..."
+         )
+         # Optionally update package list first
+         cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "update"]
+         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+         # Build and run the install command
+         cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "install", "-y"] + missing_packages
+         result = subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+         LOGGER.info(f"{prefix} AutoUpdate success ✅")
+         LOGGER.warning(f"{prefix} {colorstr('bold', 'Restart runtime or rerun command for updates to take effect')}\n")
+
+
  @TryExcept()
  def check_requirements(requirements=ROOT.parent / "requirements.txt", exclude=(), install=True, cmds=""):
      """Check if installed dependencies meet Ultralytics YOLO models requirements and attempt to auto-update if needed.
@@ -948,6 +988,7 @@ check_torchvision() # check torch-torchvision compatibility
  # Define constants
  IS_PYTHON_3_8 = PYTHON_VERSION.startswith("3.8")
  IS_PYTHON_3_9 = PYTHON_VERSION.startswith("3.9")
+ IS_PYTHON_3_10 = PYTHON_VERSION.startswith("3.10")
  IS_PYTHON_3_12 = PYTHON_VERSION.startswith("3.12")
  IS_PYTHON_3_13 = PYTHON_VERSION.startswith("3.13")
 
ultralytics/utils/ops.py CHANGED
@@ -660,6 +660,4 @@ def clean_str(s):
 
  def empty_like(x):
      """Create empty torch.Tensor or np.ndarray with same shape as input and float32 dtype."""
-     return (
-         torch.empty_like(x, dtype=torch.float32) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=np.float32)
-     )
+     return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)
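A small illustration of the behavioral change above: empty_like now preserves the input dtype rather than always returning float32.

import numpy as np
import torch

from ultralytics.utils.ops import empty_like

print(empty_like(torch.zeros(2, 3, dtype=torch.float16)).dtype)  # torch.float16 (previously torch.float32)
print(empty_like(np.zeros((2, 3), dtype=np.float64)).dtype)  # float64 (previously float32)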
ultralytics/utils/torch_utils.py CHANGED
@@ -44,6 +44,7 @@ TORCH_1_13 = check_version(TORCH_VERSION, "1.13.0")
  TORCH_2_0 = check_version(TORCH_VERSION, "2.0.0")
  TORCH_2_1 = check_version(TORCH_VERSION, "2.1.0")
  TORCH_2_4 = check_version(TORCH_VERSION, "2.4.0")
+ TORCH_2_8 = check_version(TORCH_VERSION, "2.8.0")
  TORCH_2_9 = check_version(TORCH_VERSION, "2.9.0")
  TORCHVISION_0_10 = check_version(TORCHVISION_VERSION, "0.10.0")
  TORCHVISION_0_11 = check_version(TORCHVISION_VERSION, "0.11.0")