dgenerate-ultralytics-headless 8.3.236-py3-none-any.whl → 8.3.237-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/RECORD +38 -25
- ultralytics/__init__.py +1 -1
- ultralytics/engine/exporter.py +17 -10
- ultralytics/engine/predictor.py +3 -2
- ultralytics/engine/trainer.py +8 -0
- ultralytics/models/rtdetr/val.py +5 -1
- ultralytics/models/sam/__init__.py +14 -1
- ultralytics/models/sam/build.py +17 -8
- ultralytics/models/sam/build_sam3.py +374 -0
- ultralytics/models/sam/model.py +12 -4
- ultralytics/models/sam/modules/blocks.py +20 -8
- ultralytics/models/sam/modules/decoders.py +2 -3
- ultralytics/models/sam/modules/encoders.py +4 -1
- ultralytics/models/sam/modules/memory_attention.py +6 -2
- ultralytics/models/sam/modules/sam.py +150 -6
- ultralytics/models/sam/modules/utils.py +134 -4
- ultralytics/models/sam/predict.py +2076 -118
- ultralytics/models/sam/sam3/__init__.py +3 -0
- ultralytics/models/sam/sam3/decoder.py +546 -0
- ultralytics/models/sam/sam3/encoder.py +535 -0
- ultralytics/models/sam/sam3/geometry_encoders.py +415 -0
- ultralytics/models/sam/sam3/maskformer_segmentation.py +286 -0
- ultralytics/models/sam/sam3/model_misc.py +198 -0
- ultralytics/models/sam/sam3/necks.py +129 -0
- ultralytics/models/sam/sam3/sam3_image.py +357 -0
- ultralytics/models/sam/sam3/text_encoder_ve.py +307 -0
- ultralytics/models/sam/sam3/tokenizer_ve.py +242 -0
- ultralytics/models/sam/sam3/vitdet.py +546 -0
- ultralytics/models/sam/sam3/vl_combiner.py +165 -0
- ultralytics/models/yolo/obb/val.py +18 -7
- ultralytics/nn/modules/transformer.py +21 -1
- ultralytics/utils/checks.py +2 -2
- ultralytics/utils/ops.py +1 -3
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.236.dist-info → dgenerate_ultralytics_headless-8.3.237.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/sam3/vl_combiner.py ADDED
@@ -0,0 +1,165 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved
+
+"""Provides utility to combine a vision backbone with a language backbone."""
+
+from __future__ import annotations
+
+from copy import copy
+
+import torch
+import torch.nn as nn
+from torch.nn.attention import SDPBackend, sdpa_kernel
+
+from .necks import Sam3DualViTDetNeck
+
+
+class SAM3VLBackbone(nn.Module):
+    """This backbone combines a vision backbone and a language backbone without fusion. As such it is more of a
+    convenience wrapper to handle the two backbones together.
+
+    It adds support for activation checkpointing and compilation.
+    """
+
+    def __init__(
+        self,
+        visual: Sam3DualViTDetNeck,
+        text,
+        compile_visual: bool = False,
+        act_ckpt_whole_vision_backbone: bool = False,
+        act_ckpt_whole_language_backbone: bool = False,
+        scalp=0,
+    ):
+        """Initialize the backbone combiner.
+
+        :param visual: The vision backbone to use
+        :param text: The text encoder to use
+        """
+        super().__init__()
+        self.vision_backbone: Sam3DualViTDetNeck = torch.compile(visual) if compile_visual else visual
+        self.language_backbone = text
+        self.scalp = scalp
+        # allow running activation checkpointing on the entire vision and language backbones
+        self.act_ckpt_whole_vision_backbone = act_ckpt_whole_vision_backbone
+        self.act_ckpt_whole_language_backbone = act_ckpt_whole_language_backbone
+
+    def forward(
+        self,
+        samples: torch.Tensor,
+        captions: list[str],
+        input_boxes: torch.Tensor = None,
+        additional_text: list[str] | None = None,
+    ):
+        """Forward pass of the backbone combiner.
+
+        :param samples: The input images
+        :param captions: The input captions
+        :param input_boxes: If the text contains place-holders for boxes, this
+            parameter contains the tensor containing their spatial features
+        :param additional_text: This can be used to encode some additional text
+            (different from the captions) in the same forward of the backbone
+        :return: Output dictionary with the following keys:
+            - vision_features: The output of the vision backbone
+            - language_features: The output of the language backbone
+            - language_mask: The attention mask of the language backbone
+            - vision_pos_enc: The positional encoding of the vision backbone
+            - (optional) additional_text_features: The output of the language
+              backbone for the additional text
+            - (optional) additional_text_mask: The attention mask of the
+              language backbone for the additional text
+        """
+        output = self.forward_image(samples)
+        output.update(self.forward_text(captions, input_boxes, additional_text))
+        return output
+
+    def forward_image(self, samples: torch.Tensor):
+        """Forward pass of the vision backbone and get both SAM3 and SAM2 features."""
+        # Forward through backbone
+        sam3_features, sam3_pos, sam2_features, sam2_pos = self.vision_backbone.forward(samples)
+        if self.scalp > 0:
+            # Discard the lowest resolution features
+            sam3_features, sam3_pos = (
+                sam3_features[: -self.scalp],
+                sam3_pos[: -self.scalp],
+            )
+            if sam2_features is not None and sam2_pos is not None:
+                sam2_features, sam2_pos = (
+                    sam2_features[: -self.scalp],
+                    sam2_pos[: -self.scalp],
+                )
+
+        sam2_output = None
+
+        if sam2_features is not None and sam2_pos is not None:
+            sam2_src = sam2_features[-1]
+            sam2_output = {
+                "vision_features": sam2_src,
+                "vision_pos_enc": sam2_pos,
+                "backbone_fpn": sam2_features,
+            }
+
+        sam3_src = sam3_features[-1]
+        return {
+            "vision_features": sam3_src,
+            "vision_pos_enc": sam3_pos,
+            "backbone_fpn": sam3_features,
+            "sam2_backbone_out": sam2_output,
+        }
+
+    def forward_image_sam2(self, samples: torch.Tensor):
+        """Forward pass of the vision backbone to get SAM2 features only."""
+        xs = self.vision_backbone.trunk(samples)
+        sam2_features, sam2_pos = [], []
+        x = xs[-1]  # simpleFPN
+
+        assert self.vision_backbone.sam2_convs is not None, "SAM2 neck is not available."
+        for i in range(len(self.vision_backbone.sam2_convs)):
+            sam2_x_out = self.vision_backbone.sam2_convs[i](x)
+            sam2_pos_out = self.vision_backbone.position_encoding(sam2_x_out).to(sam2_x_out.dtype)
+            sam2_features.append(sam2_x_out)
+            sam2_pos.append(sam2_pos_out)
+
+        if self.scalp > 0:
+            # Discard the lowest resolution features
+            sam2_features, sam2_pos = (
+                sam2_features[: -self.scalp],
+                sam2_pos[: -self.scalp],
+            )
+
+        return {
+            "vision_features": sam2_features[-1],
+            "vision_pos_enc": sam2_pos,
+            "backbone_fpn": sam2_features,
+        }
+
+    def forward_text(self, captions, input_boxes=None, additional_text=None):
+        """Forward pass of the text encoder."""
+        output = {}
+
+        # Forward through text_encoder
+        text_to_encode = copy(captions)
+        if additional_text is not None:
+            # if there are additional_text, we piggy-back them into this forward.
+            # They'll be used later for output alignment
+            text_to_encode += additional_text
+
+        with sdpa_kernel([SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION]):
+            text_attention_mask, text_memory, text_embeds = self.language_backbone(text_to_encode, input_boxes)
+
+        if additional_text is not None:
+            output["additional_text_features"] = text_memory[:, -len(additional_text) :]
+            output["additional_text_mask"] = text_attention_mask[-len(additional_text) :]
+
+        text_memory = text_memory[:, : len(captions)]
+        text_attention_mask = text_attention_mask[: len(captions)]
+        text_embeds = text_embeds[:, : len(captions)]
+        output["language_features"] = text_memory
+        output["language_mask"] = text_attention_mask
+        output["language_embeds"] = text_embeds  # Text embeddings before forward to the encoder
+
+        return output
+
+    def set_imgsz(self, imgsz: list[int] = [1008, 1008]):
+        """Set the image size for the vision backbone."""
+        self.vision_backbone.set_imgsz(imgsz)
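For orientation, here is a minimal usage sketch of the new SAM3VLBackbone. It is not taken from the package: `vision_neck` and `text_encoder` are hypothetical placeholders for the Sam3DualViTDetNeck and text-encoder instances that the sam3 builders construct elsewhere, and the image size simply follows the 1008 default used by set_imgsz above.

# Minimal sketch, assuming `vision_neck` (a Sam3DualViTDetNeck) and `text_encoder`
# have already been built by the SAM3 model builders; both names are placeholders.
import torch

backbone = SAM3VLBackbone(visual=vision_neck, text=text_encoder, scalp=0)

images = torch.randn(2, 3, 1008, 1008)  # batch of two RGB images at the default SAM3 input size
out = backbone(images, captions=["a red car", "a dog"])

# forward() merges the vision and text outputs into one dictionary
print(out["vision_features"].shape)                              # last retained FPN level
print(out["language_features"].shape, out["language_mask"].shape)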
ultralytics/models/yolo/obb/val.py CHANGED
@@ -12,6 +12,7 @@ from ultralytics.models.yolo.detect import DetectionValidator
 from ultralytics.utils import LOGGER, ops
 from ultralytics.utils.metrics import OBBMetrics, batch_probiou
 from ultralytics.utils.nms import TorchNMS
+from ultralytics.utils.plotting import plot_images
 
 
 class OBBValidator(DetectionValidator):
@@ -141,24 +142,34 @@ class OBBValidator(DetectionValidator):
             "im_file": batch["im_file"][si],
         }
 
-    def plot_predictions(self, batch: dict[str, Any], preds: list[torch.Tensor], ni: int) -> None:
+    def plot_predictions(self, batch: dict[str, Any], preds: list[dict[str, torch.Tensor]], ni: int) -> None:
         """Plot predicted bounding boxes on input images and save the result.
 
         Args:
             batch (dict[str, Any]): Batch data containing images, file paths, and other metadata.
-            preds (list[torch.Tensor]): List of prediction
+            preds (list[dict[str, torch.Tensor]]): List of prediction dictionaries for each image in the batch.
             ni (int): Batch index used for naming the output file.
 
         Examples:
             >>> validator = OBBValidator()
             >>> batch = {"img": images, "im_file": paths}
-            >>> preds = [torch.rand(10,
+            >>> preds = [{"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}]
             >>> validator.plot_predictions(batch, preds, 0)
         """
-
-
-
-
+        if not preds:
+            return
+        for i, pred in enumerate(preds):
+            pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
+        keys = preds[0].keys()
+        batched_preds = {k: torch.cat([x[k] for x in preds], dim=0) for k in keys}
+        plot_images(
+            images=batch["img"],
+            labels=batched_preds,
+            paths=batch["im_file"],
+            fname=self.save_dir / f"val_batch{ni}_pred.jpg",
+            names=self.names,
+            on_plot=self.on_plot,
+        )
 
     def pred_to_json(self, predn: dict[str, torch.Tensor], pbatch: dict[str, Any]) -> None:
         """Convert YOLO predictions to COCO JSON format with rotated bounding box information.
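The reworked plot_predictions now takes one dictionary per image instead of raw tensors and batches them itself before calling plot_images. A rough sketch of that data flow, with illustrative tensor shapes borrowed from the docstring example above:

# Illustrative only: fake per-image prediction dicts in the format the new method expects.
import torch

preds = [
    {"bboxes": torch.rand(10, 5), "cls": torch.zeros(10), "conf": torch.rand(10)}  # rotated boxes (xywhr)
    for _ in range(2)  # one dict per image in the batch
]

# Mirrors the batching done inside plot_predictions: tag each dict with its image index,
# then concatenate key-by-key so plot_images receives a single flat label dict.
for i, pred in enumerate(preds):
    pred["batch_idx"] = torch.ones_like(pred["conf"]) * i
batched = {k: torch.cat([p[k] for p in preds], dim=0) for k in preds[0].keys()}
print({k: tuple(v.shape) for k, v in batched.items()})  # each key now holds 20 rows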
ultralytics/nn/modules/transformer.py CHANGED
@@ -359,7 +359,15 @@ class MLP(nn.Module):
     """
 
     def __init__(
-        self,
+        self,
+        input_dim: int,
+        hidden_dim: int,
+        output_dim: int,
+        num_layers: int,
+        act=nn.ReLU,
+        sigmoid: bool = False,
+        residual: bool = False,
+        out_norm: nn.Module = None,
     ):
         """Initialize the MLP with specified input, hidden, output dimensions and number of layers.
 
@@ -370,6 +378,8 @@ class MLP(nn.Module):
             num_layers (int): Number of layers.
             act (nn.Module): Activation function.
             sigmoid (bool): Whether to apply sigmoid to the output.
+            residual (bool): Whether to use residual connections.
+            out_norm (nn.Module, optional): Normalization layer for the output.
         """
         super().__init__()
         self.num_layers = num_layers
@@ -377,6 +387,12 @@ class MLP(nn.Module):
         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim, *h], [*h, output_dim]))
         self.sigmoid = sigmoid
         self.act = act()
+        if residual and input_dim != output_dim:
+            raise ValueError("residual is only supported if input_dim == output_dim")
+        self.residual = residual
+        # whether to apply a normalization layer to the output
+        assert isinstance(out_norm, nn.Module) or out_norm is None
+        self.out_norm = out_norm or nn.Identity()
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """Forward pass for the entire MLP.
 
@@ -387,8 +403,12 @@ class MLP(nn.Module):
         Returns:
             (torch.Tensor): Output tensor after MLP.
         """
+        orig_x = x
         for i, layer in enumerate(self.layers):
             x = getattr(self, "act", nn.ReLU())(layer(x)) if i < self.num_layers - 1 else layer(x)
+        if getattr(self, "residual", False):
+            x = x + orig_x
+        x = getattr(self, "out_norm", nn.Identity())(x)
         return x.sigmoid() if getattr(self, "sigmoid", False) else x
 
 
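The two new MLP options compose as follows; this is a small sketch against the updated module, not code from the diff. residual adds the input back onto the output (hence the input_dim == output_dim check), and out_norm is applied last.

# Small sketch of the extended MLP; assumes ultralytics 8.3.237 is installed.
import torch
import torch.nn as nn
from ultralytics.nn.modules.transformer import MLP

mlp = MLP(input_dim=256, hidden_dim=512, output_dim=256, num_layers=3, residual=True, out_norm=nn.LayerNorm(256))

x = torch.randn(4, 256)
y = mlp(x)            # y = LayerNorm(mlp_layers(x) + x)
print(y.shape)        # torch.Size([4, 256])

# Mismatched dims with residual=True raise at construction time:
# MLP(input_dim=256, hidden_dim=512, output_dim=128, num_layers=3, residual=True)  # ValueError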
ultralytics/utils/checks.py CHANGED
@@ -379,8 +379,8 @@ def check_apt_requirements(requirements):
             f"{prefix} Ultralytics requirement{'s' * (len(missing_packages) > 1)} {missing_packages} not found, attempting AutoUpdate..."
         )
         # Optionally update package list first
-        if is_sudo_available()
-
+        cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "update"]
+        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
 
         # Build and run the install command
         cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "install", "-y"] + missing_packages
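The fix replaces a malformed apt-update call with the same prepend-sudo pattern already used for the install command below it. A tiny sketch of the pattern, with is_sudo_available stubbed so it runs standalone (the real helper lives in ultralytics.utils.checks):

# Sketch of the command-construction pattern; is_sudo_available is a stub here.
def is_sudo_available() -> bool:
    return False  # pretend sudo is absent

cmd = (["sudo"] if is_sudo_available() else []) + ["apt", "update"]
print(cmd)  # ['apt', 'update'] without sudo, ['sudo', 'apt', 'update'] with it
# In the patched code this list is passed to subprocess.run(cmd, check=True, capture_output=True, text=True).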
ultralytics/utils/ops.py CHANGED
@@ -660,6 +660,4 @@ def clean_str(s):
 
 def empty_like(x):
     """Create empty torch.Tensor or np.ndarray with same shape as input and float32 dtype."""
-    return (
-        torch.empty_like(x, dtype=torch.float32) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=np.float32)
-    )
+    return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)
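The empty_like change swaps the hard-coded float32 output dtype for the input's own dtype. A self-contained sketch of the new behavior (a local copy of the patched one-liner, not an import from the package):

# Local copy of the patched empty_like logic to show the dtype difference.
import numpy as np
import torch

def empty_like(x):
    """Create an empty tensor/array with the same shape and dtype as the input."""
    return torch.empty_like(x, dtype=x.dtype) if isinstance(x, torch.Tensor) else np.empty_like(x, dtype=x.dtype)

print(empty_like(torch.zeros(3, dtype=torch.float16)).dtype)  # torch.float16 (previously float32)
print(empty_like(np.zeros(3, dtype=np.int32)).dtype)          # int32 (previously float32)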