dgenerate-ultralytics-headless 8.3.190-py3-none-any.whl → 8.3.192-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/RECORD +103 -102
- tests/test_cuda.py +6 -5
- tests/test_exports.py +1 -6
- tests/test_python.py +1 -4
- tests/test_solutions.py +1 -1
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -14
- ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
- ultralytics/cfg/datasets/VisDrone.yaml +4 -4
- ultralytics/data/annotator.py +6 -6
- ultralytics/data/augment.py +53 -51
- ultralytics/data/base.py +15 -13
- ultralytics/data/build.py +7 -4
- ultralytics/data/converter.py +9 -10
- ultralytics/data/dataset.py +24 -22
- ultralytics/data/loaders.py +13 -11
- ultralytics/data/split.py +4 -3
- ultralytics/data/split_dota.py +14 -12
- ultralytics/data/utils.py +29 -23
- ultralytics/engine/exporter.py +2 -2
- ultralytics/engine/model.py +16 -14
- ultralytics/engine/predictor.py +8 -6
- ultralytics/engine/results.py +54 -52
- ultralytics/engine/trainer.py +8 -3
- ultralytics/engine/tuner.py +230 -42
- ultralytics/hub/google/__init__.py +7 -6
- ultralytics/hub/session.py +8 -6
- ultralytics/hub/utils.py +3 -4
- ultralytics/models/fastsam/model.py +8 -6
- ultralytics/models/nas/model.py +5 -3
- ultralytics/models/rtdetr/train.py +4 -3
- ultralytics/models/rtdetr/val.py +6 -4
- ultralytics/models/sam/amg.py +13 -10
- ultralytics/models/sam/model.py +3 -2
- ultralytics/models/sam/modules/blocks.py +21 -21
- ultralytics/models/sam/modules/decoders.py +11 -11
- ultralytics/models/sam/modules/encoders.py +25 -25
- ultralytics/models/sam/modules/memory_attention.py +9 -8
- ultralytics/models/sam/modules/sam.py +8 -10
- ultralytics/models/sam/modules/tiny_encoder.py +21 -20
- ultralytics/models/sam/modules/transformer.py +6 -5
- ultralytics/models/sam/modules/utils.py +7 -5
- ultralytics/models/sam/predict.py +32 -31
- ultralytics/models/utils/loss.py +29 -27
- ultralytics/models/utils/ops.py +10 -8
- ultralytics/models/yolo/classify/train.py +9 -7
- ultralytics/models/yolo/classify/val.py +11 -9
- ultralytics/models/yolo/detect/predict.py +1 -1
- ultralytics/models/yolo/detect/train.py +8 -6
- ultralytics/models/yolo/detect/val.py +22 -20
- ultralytics/models/yolo/model.py +14 -14
- ultralytics/models/yolo/obb/train.py +5 -3
- ultralytics/models/yolo/obb/val.py +11 -9
- ultralytics/models/yolo/pose/train.py +7 -5
- ultralytics/models/yolo/pose/val.py +12 -10
- ultralytics/models/yolo/segment/train.py +4 -5
- ultralytics/models/yolo/segment/val.py +13 -11
- ultralytics/models/yolo/world/train.py +10 -8
- ultralytics/models/yolo/yoloe/train.py +10 -10
- ultralytics/models/yolo/yoloe/val.py +11 -9
- ultralytics/nn/autobackend.py +17 -19
- ultralytics/nn/modules/block.py +12 -12
- ultralytics/nn/modules/conv.py +4 -3
- ultralytics/nn/modules/head.py +41 -37
- ultralytics/nn/modules/transformer.py +22 -21
- ultralytics/nn/tasks.py +2 -2
- ultralytics/nn/text_model.py +6 -5
- ultralytics/solutions/analytics.py +7 -5
- ultralytics/solutions/config.py +12 -10
- ultralytics/solutions/distance_calculation.py +3 -3
- ultralytics/solutions/heatmap.py +4 -2
- ultralytics/solutions/object_counter.py +5 -3
- ultralytics/solutions/parking_management.py +4 -2
- ultralytics/solutions/region_counter.py +7 -5
- ultralytics/solutions/similarity_search.py +5 -3
- ultralytics/solutions/solutions.py +38 -36
- ultralytics/solutions/streamlit_inference.py +8 -7
- ultralytics/trackers/bot_sort.py +11 -9
- ultralytics/trackers/byte_tracker.py +17 -15
- ultralytics/trackers/utils/gmc.py +4 -3
- ultralytics/utils/__init__.py +16 -88
- ultralytics/utils/autobatch.py +3 -2
- ultralytics/utils/autodevice.py +10 -10
- ultralytics/utils/benchmarks.py +11 -10
- ultralytics/utils/callbacks/comet.py +9 -9
- ultralytics/utils/checks.py +17 -26
- ultralytics/utils/export.py +12 -11
- ultralytics/utils/files.py +8 -7
- ultralytics/utils/git.py +139 -0
- ultralytics/utils/instance.py +8 -7
- ultralytics/utils/loss.py +15 -13
- ultralytics/utils/metrics.py +62 -62
- ultralytics/utils/ops.py +3 -2
- ultralytics/utils/patches.py +6 -4
- ultralytics/utils/plotting.py +20 -18
- ultralytics/utils/torch_utils.py +4 -2
- ultralytics/utils/tqdm.py +18 -14
- ultralytics/utils/triton.py +3 -2
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/tiny_encoder.py CHANGED

```diff
@@ -9,8 +9,9 @@
 # Build the TinyViT Model
 # --------------------------------------------------------
 
+from __future__ import annotations
+
 import itertools
-from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -106,7 +107,7 @@ class PatchEmbed(nn.Module):
             activation (nn.Module): Activation function to use between convolutions.
         """
         super().__init__()
-        img_size: Tuple[int, int] = to_2tuple(resolution)
+        img_size: tuple[int, int] = to_2tuple(resolution)
         self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
         self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
         self.in_chans = in_chans
@@ -219,7 +220,7 @@ class PatchMerging(nn.Module):
         torch.Size([4, 3136, 128])
     """
 
-    def __init__(self, input_resolution: Tuple[int, int], dim: int, out_dim: int, activation):
+    def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
         """
         Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.
 
@@ -283,13 +284,13 @@ class ConvLayer(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution: Tuple[int, int],
+        input_resolution: tuple[int, int],
         depth: int,
         activation,
-        drop_path: Union[float, List[float]] = 0.0,
-        downsample: Optional[nn.Module] = None,
+        drop_path: float | list[float] = 0.0,
+        downsample: nn.Module | None = None,
         use_checkpoint: bool = False,
-        out_dim: Optional[int] = None,
+        out_dim: int | None = None,
         conv_expand_ratio: float = 4.0,
     ):
         """
@@ -370,8 +371,8 @@ class MLP(nn.Module):
     def __init__(
         self,
         in_features: int,
-        hidden_features: Optional[int] = None,
-        out_features: Optional[int] = None,
+        hidden_features: int | None = None,
+        out_features: int | None = None,
         activation=nn.GELU,
         drop: float = 0.0,
     ):
@@ -441,7 +442,7 @@ class Attention(torch.nn.Module):
         key_dim: int,
         num_heads: int = 8,
         attn_ratio: float = 4,
-        resolution: Tuple[int, int] = (14, 14),
+        resolution: tuple[int, int] = (14, 14),
     ):
         """
         Initialize the Attention module for multi-head attention with spatial awareness.
@@ -549,7 +550,7 @@ class TinyViTBlock(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution: Tuple[int, int],
+        input_resolution: tuple[int, int],
         num_heads: int,
         window_size: int = 7,
         mlp_ratio: float = 4.0,
@@ -690,18 +691,18 @@ class BasicLayer(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution: Tuple[int, int],
+        input_resolution: tuple[int, int],
         depth: int,
         num_heads: int,
         window_size: int,
         mlp_ratio: float = 4.0,
         drop: float = 0.0,
-        drop_path: Union[float, List[float]] = 0.0,
-        downsample: Optional[nn.Module] = None,
+        drop_path: float | list[float] = 0.0,
+        downsample: nn.Module | None = None,
         use_checkpoint: bool = False,
         local_conv_size: int = 3,
         activation=nn.GELU,
-        out_dim: Optional[int] = None,
+        out_dim: int | None = None,
     ):
         """
         Initialize a BasicLayer in the TinyViT architecture.
@@ -800,10 +801,10 @@ class TinyViT(nn.Module):
         img_size: int = 224,
         in_chans: int = 3,
         num_classes: int = 1000,
-        embed_dims: Tuple[int, int, int, int] = (96, 192, 384, 768),
-        depths: Tuple[int, int, int, int] = (2, 2, 6, 2),
-        num_heads: Tuple[int, int, int, int] = (3, 6, 12, 24),
-        window_sizes: Tuple[int, int, int, int] = (7, 7, 14, 7),
+        embed_dims: tuple[int, int, int, int] = (96, 192, 384, 768),
+        depths: tuple[int, int, int, int] = (2, 2, 6, 2),
+        num_heads: tuple[int, int, int, int] = (3, 6, 12, 24),
+        window_sizes: tuple[int, int, int, int] = (7, 7, 14, 7),
         mlp_ratio: float = 4.0,
         drop_rate: float = 0.0,
         drop_path_rate: float = 0.1,
@@ -980,7 +981,7 @@ class TinyViT(nn.Module):
         """Perform the forward pass through the TinyViT model, extracting features from the input image."""
         return self.forward_features(x)
 
-    def set_imgsz(self, imgsz: List[int] = [1024, 1024]):
+    def set_imgsz(self, imgsz: list[int] = [1024, 1024]):
         """Set image size to make model compatible with different image sizes."""
         imgsz = [s // 4 for s in imgsz]
         self.patches_resolution = imgsz
```
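Nearly every hunk in this file, and in most of the model files below, is the same mechanical modernization: `typing.Tuple`/`List`/`Optional`/`Union` annotations become builtin `tuple[...]`/`list[...]` generics and PEP 604 `X | None` unions, with `from __future__ import annotations` added so the new syntax still imports on Python 3.8 and 3.9. A minimal, hypothetical sketch of why the future import makes that safe (illustrative only, not code from this package): with postponed evaluation the annotations are stored as strings and never evaluated at runtime.

```python
# Hypothetical standalone example (not from ultralytics): PEP 563 postponed
# evaluation keeps the PEP 585/604 annotations below as plain strings, so this
# module imports cleanly even on Python 3.8/3.9 where they would otherwise fail.
from __future__ import annotations


def resize(shape: tuple[int, int], scale: int | None = None) -> tuple[int, int]:
    """Scale a (height, width) pair by an optional integer factor."""
    factor = 1 if scale is None else scale
    return (shape[0] * factor, shape[1] * factor)


print(resize((640, 480), 2))            # (1280, 960)
print(resize.__annotations__["shape"])  # 'tuple[int, int]' -- stored as a string
```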
ultralytics/models/sam/modules/transformer.py CHANGED

```diff
@@ -1,7 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 import math
-from typing import Tuple, Type
 
 import torch
 from torch import Tensor, nn
@@ -44,7 +45,7 @@ class TwoWayTransformer(nn.Module):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int,
-        activation: Type[nn.Module] = nn.ReLU,
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
     ) -> None:
         """
@@ -85,7 +86,7 @@ class TwoWayTransformer(nn.Module):
         image_embedding: torch.Tensor,
         image_pe: torch.Tensor,
         point_embedding: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Process image and point embeddings through the Two-Way Transformer.
 
@@ -162,7 +163,7 @@ class TwoWayAttentionBlock(nn.Module):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int = 2048,
-        activation: Type[nn.Module] = nn.ReLU,
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
         skip_first_layer_pe: bool = False,
     ) -> None:
@@ -198,7 +199,7 @@ class TwoWayAttentionBlock(nn.Module):
 
     def forward(
         self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Apply two-way attention to process query and key embeddings in a transformer block.
 
```

ultralytics/models/sam/modules/utils.py CHANGED

```diff
@@ -1,12 +1,14 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from typing import Any, Dict, Tuple
+from __future__ import annotations
+
+from typing import Any
 
 import torch
 import torch.nn.functional as F
 
 
-def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: Dict[int, Any], max_cond_frame_num: int):
+def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any], max_cond_frame_num: int):
     """
     Select the closest conditioning frames to a given frame index.
 
@@ -248,7 +250,7 @@ def window_partition(x: torch.Tensor, window_size: int):
     return windows, (Hp, Wp)
 
 
-def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]):
+def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[int, int], hw: tuple[int, int]):
     """
     Unpartition windowed sequences into original sequences and remove padding.
 
@@ -333,8 +335,8 @@ def add_decomposed_rel_pos(
     q: torch.Tensor,
     rel_pos_h: torch.Tensor,
     rel_pos_w: torch.Tensor,
-    q_size: Tuple[int, int],
-    k_size: Tuple[int, int],
+    q_size: tuple[int, int],
+    k_size: tuple[int, int],
 ) -> torch.Tensor:
     """
     Add decomposed Relative Positional Embeddings to the attention map.
```
ultralytics/models/sam/predict.py CHANGED

```diff
@@ -8,8 +8,10 @@ using SAM. It forms an integral part of the Ultralytics framework and is designe
 segmentation tasks.
 """
 
+from __future__ import annotations
+
 from collections import OrderedDict
-from typing import Any, Dict, List, Optional, Tuple, Union
+from typing import Any
 
 import cv2
 import numpy as np
@@ -1717,9 +1719,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     def __init__(
         self,
         cfg: Any = DEFAULT_CFG,
-        overrides: Optional[Dict[str, Any]] = None,
+        overrides: dict[str, Any] | None = None,
         max_obj_num: int = 3,
-        _callbacks: Optional[Dict[str, Any]] = None,
+        _callbacks: dict[str, Any] | None = None,
     ) -> None:
         """
         Initialize the predictor with configuration and optional overrides.
@@ -1759,14 +1761,14 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     @smart_inference_mode()
     def inference(
         self,
-        img: Union[torch.Tensor, np.ndarray],
-        bboxes: Optional[List[List[float]]] = None,
-        masks: Optional[Union[torch.Tensor, np.ndarray]] = None,
-        points: Optional[List[List[float]]] = None,
-        labels: Optional[List[int]] = None,
-        obj_ids: Optional[List[int]] = None,
+        img: torch.Tensor | np.ndarray,
+        bboxes: list[list[float]] | None = None,
+        masks: torch.Tensor | np.ndarray | None = None,
+        points: list[list[float]] | None = None,
+        labels: list[int] | None = None,
+        obj_ids: list[int] | None = None,
         update_memory: bool = False,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Perform inference on a single image with optional bounding boxes, masks, points and object IDs.
         It has two modes: one is to run inference on a single image without updating the memory,
@@ -1824,7 +1826,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         pred_scores = torch.clamp_(pred_scores / 32, min=0)
         return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
 
-    def get_im_features(self, img: Union[torch.Tensor, np.ndarray]) -> None:
+    def get_im_features(self, img: torch.Tensor | np.ndarray) -> None:
         """
         Initialize the image state by processing the input image and extracting features.
 
@@ -1844,10 +1846,10 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     @smart_inference_mode()
     def update_memory(
         self,
-        obj_ids: List[int] = None,
-        points: Optional[torch.Tensor] = None,
-        labels: Optional[torch.Tensor] = None,
-        masks: Optional[torch.Tensor] = None,
+        obj_ids: list[int] = None,
+        points: torch.Tensor | None = None,
+        labels: torch.Tensor | None = None,
+        masks: torch.Tensor | None = None,
     ) -> None:
         """
         Append the imgState to the memory_bank and update the memory for the model.
@@ -1923,7 +1925,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
         self.memory_bank.append(consolidated_out)
 
-    def _prepare_memory_conditioned_features(self, obj_idx: Optional[int]) -> torch.Tensor:
+    def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
         """
         Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
         prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features for all
@@ -1958,7 +1960,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             *self.feat_sizes[-1],
         )
 
-    def get_maskmem_enc(self) -> Tuple[torch.Tensor, torch.Tensor]:
+    def get_maskmem_enc(self) -> tuple[torch.Tensor, torch.Tensor]:
         """Get the memory and positional encoding from the memory, which is used to condition the current image
         features.
         """
@@ -1973,7 +1975,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
         return memory, memory_pos_embed
 
-    def _obj_id_to_idx(self, obj_id: int) -> Optional[int]:
+    def _obj_id_to_idx(self, obj_id: int) -> int | None:
         """
         Map client-side object id to model-side object index.
 
@@ -1987,11 +1989,11 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 
     def track_step(
         self,
-        obj_idx: Optional[int] = None,
-        point: Optional[torch.Tensor] = None,
-        label: Optional[torch.Tensor] = None,
-        mask: Optional[torch.Tensor] = None,
-    ) -> Dict[str, Any]:
+        obj_idx: int | None = None,
+        point: torch.Tensor | None = None,
+        label: torch.Tensor | None = None,
+        mask: torch.Tensor | None = None,
+    ) -> dict[str, Any]:
         """
         Tracking step for the current image state to predict masks.
 
@@ -2010,7 +2012,6 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             current_out (Dict[str, Any]): A dictionary containing the current output with mask predictions and object pointers.
                 Keys include 'point_inputs', 'mask_inputs', 'pred_masks', 'pred_masks_high_res', 'obj_ptr', 'object_score_logits'.
         """
-        current_out = {}
         if mask is not None and self.model.use_mask_input_as_output_without_sam:
             # When use_mask_input_as_output_without_sam=True, we directly output the mask input
            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
@@ -2021,7 +2022,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             # fused the visual feature with previous memory features in the memory bank
             pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
             # calculate the first feature if adding obj_idx exists(means adding prompts)
-            pix_feat_with_mem = pix_feat_with_mem[0:1] if obj_idx is not None else pix_feat_with_mem
+            pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
             _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
                 backbone_features=pix_feat_with_mem,
                 point_inputs={"point_coords": point, "point_labels": label} if obj_idx is not None else None,
@@ -2029,9 +2030,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
                 multimask_output=False,
                 high_res_features=[feat[: pix_feat_with_mem.size(0)] for feat in self.high_res_features],
             )
-        current_out["pred_masks"] = low_res_masks
-        current_out["pred_masks_high_res"] = high_res_masks
-        current_out["obj_ptr"] = obj_ptr
-        current_out["object_score_logits"] = object_score_logits
-
-        return current_out
+        return {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+            "object_score_logits": object_score_logits,
+        }
```
ultralytics/models/utils/loss.py CHANGED

```diff
@@ -1,6 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
+
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -36,7 +38,7 @@ class DETRLoss(nn.Module):
     def __init__(
         self,
         nc: int = 80,
-        loss_gain: Optional[Dict[str, float]] = None,
+        loss_gain: dict[str, float] | None = None,
         aux_loss: bool = True,
         use_fl: bool = True,
         use_vfl: bool = False,
@@ -79,7 +81,7 @@ class DETRLoss(nn.Module):
 
     def _get_loss_class(
         self, pred_scores: torch.Tensor, targets: torch.Tensor, gt_scores: torch.Tensor, num_gts: int, postfix: str = ""
-    ) -> Dict[str, torch.Tensor]:
+    ) -> dict[str, torch.Tensor]:
         """
         Compute classification loss based on predictions, target values, and ground truth scores.
 
@@ -121,7 +123,7 @@ class DETRLoss(nn.Module):
 
     def _get_loss_bbox(
         self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, postfix: str = ""
-    ) -> Dict[str, torch.Tensor]:
+    ) -> dict[str, torch.Tensor]:
         """
         Compute bounding box and GIoU losses for predicted and ground truth bounding boxes.
 
@@ -191,12 +193,12 @@ class DETRLoss(nn.Module):
         pred_scores: torch.Tensor,
         gt_bboxes: torch.Tensor,
         gt_cls: torch.Tensor,
-        gt_groups: List[int],
-        match_indices: Optional[List[Tuple]] = None,
+        gt_groups: list[int],
+        match_indices: list[tuple] | None = None,
         postfix: str = "",
-        masks: Optional[torch.Tensor] = None,
-        gt_mask: Optional[torch.Tensor] = None,
-    ) -> Dict[str, torch.Tensor]:
+        masks: torch.Tensor | None = None,
+        gt_mask: torch.Tensor | None = None,
+    ) -> dict[str, torch.Tensor]:
         """
         Get auxiliary losses for intermediate decoder layers.
 
@@ -258,7 +260,7 @@ class DETRLoss(nn.Module):
         return loss
 
     @staticmethod
-    def _get_index(match_indices: List[Tuple]) -> Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
+    def _get_index(match_indices: list[tuple]) -> tuple[tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
         """
         Extract batch indices, source indices, and destination indices from match indices.
 
@@ -275,8 +277,8 @@ class DETRLoss(nn.Module):
         return (batch_idx, src_idx), dst_idx
 
     def _get_assigned_bboxes(
-        self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, match_indices: List[Tuple]
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, match_indices: list[tuple]
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Assign predicted bounding boxes to ground truth bounding boxes based on match indices.
 
@@ -309,12 +311,12 @@ class DETRLoss(nn.Module):
         pred_scores: torch.Tensor,
         gt_bboxes: torch.Tensor,
         gt_cls: torch.Tensor,
-        gt_groups: List[int],
-        masks: Optional[torch.Tensor] = None,
-        gt_mask: Optional[torch.Tensor] = None,
+        gt_groups: list[int],
+        masks: torch.Tensor | None = None,
+        gt_mask: torch.Tensor | None = None,
         postfix: str = "",
-        match_indices: Optional[List[Tuple]] = None,
-    ) -> Dict[str, torch.Tensor]:
+        match_indices: list[tuple] | None = None,
+    ) -> dict[str, torch.Tensor]:
         """
         Calculate losses for a single prediction layer.
 
@@ -358,10 +360,10 @@ class DETRLoss(nn.Module):
         self,
         pred_bboxes: torch.Tensor,
         pred_scores: torch.Tensor,
-        batch: Dict[str, Any],
+        batch: dict[str, Any],
         postfix: str = "",
         **kwargs: Any,
-    ) -> Dict[str, torch.Tensor]:
+    ) -> dict[str, torch.Tensor]:
         """
         Calculate loss for predicted bounding boxes and scores.
 
@@ -407,12 +409,12 @@ class RTDETRDetectionLoss(DETRLoss):
 
     def forward(
         self,
-        preds: Tuple[torch.Tensor, torch.Tensor],
-        batch: Dict[str, Any],
-        dn_bboxes: Optional[torch.Tensor] = None,
-        dn_scores: Optional[torch.Tensor] = None,
-        dn_meta: Optional[Dict[str, Any]] = None,
-    ) -> Dict[str, torch.Tensor]:
+        preds: tuple[torch.Tensor, torch.Tensor],
+        batch: dict[str, Any],
+        dn_bboxes: torch.Tensor | None = None,
+        dn_scores: torch.Tensor | None = None,
+        dn_meta: dict[str, Any] | None = None,
+    ) -> dict[str, torch.Tensor]:
         """
         Forward pass to compute detection loss with optional denoising loss.
 
@@ -448,8 +450,8 @@ class RTDETRDetectionLoss(DETRLoss):
 
     @staticmethod
     def get_dn_match_indices(
-        dn_pos_idx: List[torch.Tensor], dn_num_group: int, gt_groups: List[int]
-    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+        dn_pos_idx: list[torch.Tensor], dn_num_group: int, gt_groups: list[int]
+    ) -> list[tuple[torch.Tensor, torch.Tensor]]:
         """
         Get match indices for denoising.
 
```
ultralytics/models/utils/ops.py CHANGED

```diff
@@ -1,6 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from typing import Any, Dict, List, Optional, Tuple
+from __future__ import annotations
+
+from typing import Any
 
 import torch
 import torch.nn as nn
@@ -47,7 +49,7 @@ class HungarianMatcher(nn.Module):
 
     def __init__(
         self,
-        cost_gain: Optional[Dict[str, float]] = None,
+        cost_gain: dict[str, float] | None = None,
         use_fl: bool = True,
         with_mask: bool = False,
         num_sample_points: int = 12544,
@@ -82,10 +84,10 @@ class HungarianMatcher(nn.Module):
         pred_scores: torch.Tensor,
         gt_bboxes: torch.Tensor,
         gt_cls: torch.Tensor,
-        gt_groups: List[int],
-        masks: Optional[torch.Tensor] = None,
-        gt_mask: Optional[List[torch.Tensor]] = None,
-    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+        gt_groups: list[int],
+        masks: torch.Tensor | None = None,
+        gt_mask: list[torch.Tensor] | None = None,
+    ) -> list[tuple[torch.Tensor, torch.Tensor]]:
         """
         Compute optimal assignment between predictions and ground truth using Hungarian algorithm.
 
@@ -187,7 +189,7 @@ class HungarianMatcher(nn.Module):
 
 
 def get_cdn_group(
-    batch: Dict[str, Any],
+    batch: dict[str, Any],
     num_classes: int,
     num_queries: int,
     class_embed: torch.Tensor,
@@ -195,7 +197,7 @@ def get_cdn_group(
     cls_noise_ratio: float = 0.5,
     box_noise_scale: float = 1.0,
     training: bool = False,
-) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor], Optional[Dict[str, Any]]]:
+) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, dict[str, Any] | None]:
     """
     Generate contrastive denoising training group with positive and negative samples from ground truths.
 
```

ultralytics/models/yolo/classify/train.py CHANGED

```diff
@@ -1,7 +1,9 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 from copy import copy
-from typing import Any, Dict, Optional
+from typing import Any
 
 import torch
 
@@ -49,7 +51,7 @@ class ClassificationTrainer(BaseTrainer):
         >>> trainer.train()
     """
 
-    def __init__(self, cfg=DEFAULT_CFG, overrides: Optional[Dict[str, Any]] = None, _callbacks=None):
+    def __init__(self, cfg=DEFAULT_CFG, overrides: dict[str, Any] | None = None, _callbacks=None):
         """
         Initialize a ClassificationTrainer object.
 
@@ -162,10 +164,10 @@ class ClassificationTrainer(BaseTrainer):
             self.model.transforms = loader.dataset.torch_transforms
         return loader
 
-    def preprocess_batch(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+    def preprocess_batch(self, batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
         """Preprocess a batch of images and classes."""
-        batch["img"] = batch["img"].to(self.device)
-        batch["cls"] = batch["cls"].to(self.device)
+        batch["img"] = batch["img"].to(self.device, non_blocking=True)
+        batch["cls"] = batch["cls"].to(self.device, non_blocking=True)
         return batch
 
     def progress_string(self) -> str:
@@ -185,7 +187,7 @@ class ClassificationTrainer(BaseTrainer):
             self.test_loader, self.save_dir, args=copy(self.args), _callbacks=self.callbacks
         )
 
-    def label_loss_items(self, loss_items: Optional[torch.Tensor] = None, prefix: str = "train"):
+    def label_loss_items(self, loss_items: torch.Tensor | None = None, prefix: str = "train"):
         """
         Return a loss dict with labelled training loss items tensor.
 
@@ -220,7 +222,7 @@ class ClassificationTrainer(BaseTrainer):
         self.metrics.pop("fitness", None)
         self.run_callbacks("on_fit_epoch_end")
 
-    def plot_training_samples(self, batch: Dict[str, torch.Tensor], ni: int):
+    def plot_training_samples(self, batch: dict[str, torch.Tensor], ni: int):
         """
         Plot training samples with their annotations.
 
```

ultralytics/models/yolo/classify/val.py CHANGED

```diff
@@ -1,7 +1,9 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 from pathlib import Path
-from typing import Any, Dict, List, Tuple, Union
+from typing import Any
 
 import torch
 
@@ -85,14 +87,14 @@ class ClassificationValidator(BaseValidator):
         self.targets = []
         self.confusion_matrix = ConfusionMatrix(names=model.names)
 
-    def preprocess(self, batch: Dict[str, Any]) -> Dict[str, Any]:
+    def preprocess(self, batch: dict[str, Any]) -> dict[str, Any]:
         """Preprocess input batch by moving data to device and converting to appropriate dtype."""
         batch["img"] = batch["img"].to(self.device, non_blocking=True)
         batch["img"] = batch["img"].half() if self.args.half else batch["img"].float()
-        batch["cls"] = batch["cls"].to(self.device)
+        batch["cls"] = batch["cls"].to(self.device, non_blocking=True)
         return batch
 
-    def update_metrics(self, preds: torch.Tensor, batch: Dict[str, Any]) -> None:
+    def update_metrics(self, preds: torch.Tensor, batch: dict[str, Any]) -> None:
         """
         Update running metrics with model predictions and batch targets.
 
@@ -131,11 +133,11 @@ class ClassificationValidator(BaseValidator):
         self.metrics.save_dir = self.save_dir
         self.metrics.confusion_matrix = self.confusion_matrix
 
-    def postprocess(self, preds: Union[torch.Tensor, List[torch.Tensor], Tuple[torch.Tensor]]) -> torch.Tensor:
+    def postprocess(self, preds: torch.Tensor | list[torch.Tensor] | tuple[torch.Tensor]) -> torch.Tensor:
         """Extract the primary prediction from model output if it's in a list or tuple format."""
         return preds[0] if isinstance(preds, (list, tuple)) else preds
 
-    def get_stats(self) -> Dict[str, float]:
+    def get_stats(self) -> dict[str, float]:
         """Calculate and return a dictionary of metrics by processing targets and predictions."""
         self.metrics.process(self.targets, self.pred)
         return self.metrics.results_dict
@@ -144,7 +146,7 @@ class ClassificationValidator(BaseValidator):
         """Create a ClassificationDataset instance for validation."""
         return ClassificationDataset(root=img_path, args=self.args, augment=False, prefix=self.args.split)
 
-    def get_dataloader(self, dataset_path: Union[Path, str], batch_size: int) -> torch.utils.data.DataLoader:
+    def get_dataloader(self, dataset_path: Path | str, batch_size: int) -> torch.utils.data.DataLoader:
         """
         Build and return a data loader for classification validation.
 
@@ -163,7 +165,7 @@ class ClassificationValidator(BaseValidator):
         pf = "%22s" + "%11.3g" * len(self.metrics.keys)  # print format
         LOGGER.info(pf % ("all", self.metrics.top1, self.metrics.top5))
 
-    def plot_val_samples(self, batch: Dict[str, Any], ni: int) -> None:
+    def plot_val_samples(self, batch: dict[str, Any], ni: int) -> None:
         """
         Plot validation image samples with their ground truth labels.
 
@@ -184,7 +186,7 @@ class ClassificationValidator(BaseValidator):
             on_plot=self.on_plot,
         )
 
-    def plot_predictions(self, batch: Dict[str, Any], preds: torch.Tensor, ni: int) -> None:
+    def plot_predictions(self, batch: dict[str, Any], preds: torch.Tensor, ni: int) -> None:
         """
         Plot images with their predicted class labels and save the visualization.
 
```
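Beyond the annotation cleanup, the classification `preprocess`/`preprocess_batch` hunks above switch the host-to-device copies to `non_blocking=True`. A hedged note on what that buys: the copy can only overlap with GPU work when the source tensor lives in pinned (page-locked) host memory, which PyTorch dataloaders opt into via `pin_memory=True`; without pinning the flag behaves like a normal blocking copy. A rough, generic PyTorch sketch of the pattern (not ultralytics code; the dataset, shapes, and batch size are made up):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy dataset; pin_memory=True keeps each batch in page-locked host memory so
# asynchronous transfers to the GPU are actually possible.
dataset = TensorDataset(torch.randn(512, 3, 32, 32), torch.randint(0, 10, (512,)))
loader = DataLoader(dataset, batch_size=64, pin_memory=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
for imgs, labels in loader:
    # With pinned sources these copies are queued on the current CUDA stream and
    # can overlap with compute; on a CPU-only machine the flag is simply ignored.
    imgs = imgs.to(device, non_blocking=True)
    labels = labels.to(device, non_blocking=True)
    # ... forward/backward pass would go here ...
```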
ultralytics/models/yolo/detect/predict.py CHANGED

```diff
@@ -85,7 +85,7 @@ class DetectionPredictor(BasePredictor):
         """Extract object features from the feature maps."""
         import torch
 
-        s = min([x.shape[1] for x in feat_maps])  # find shortest vector length
+        s = min(x.shape[1] for x in feat_maps)  # find shortest vector length
         obj_feats = torch.cat(
             [x.permute(0, 2, 3, 1).reshape(x.shape[0], -1, s, x.shape[1] // s).mean(dim=-1) for x in feat_maps], dim=1
         )  # mean reduce all vectors to same length
```