ultralytics 8.3.189__py3-none-any.whl → 8.3.191__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111)
  1. tests/test_cuda.py +6 -5
  2. tests/test_exports.py +1 -6
  3. tests/test_python.py +1 -4
  4. tests/test_solutions.py +1 -1
  5. ultralytics/__init__.py +1 -1
  6. ultralytics/cfg/__init__.py +16 -14
  7. ultralytics/cfg/datasets/VisDrone.yaml +4 -4
  8. ultralytics/data/annotator.py +6 -6
  9. ultralytics/data/augment.py +53 -51
  10. ultralytics/data/base.py +15 -13
  11. ultralytics/data/build.py +7 -4
  12. ultralytics/data/converter.py +9 -10
  13. ultralytics/data/dataset.py +24 -22
  14. ultralytics/data/loaders.py +13 -11
  15. ultralytics/data/split.py +4 -3
  16. ultralytics/data/split_dota.py +14 -12
  17. ultralytics/data/utils.py +31 -25
  18. ultralytics/engine/exporter.py +7 -4
  19. ultralytics/engine/model.py +16 -14
  20. ultralytics/engine/predictor.py +9 -7
  21. ultralytics/engine/results.py +59 -57
  22. ultralytics/engine/trainer.py +7 -0
  23. ultralytics/engine/tuner.py +4 -3
  24. ultralytics/engine/validator.py +3 -1
  25. ultralytics/hub/__init__.py +6 -2
  26. ultralytics/hub/auth.py +2 -2
  27. ultralytics/hub/google/__init__.py +9 -8
  28. ultralytics/hub/session.py +11 -11
  29. ultralytics/hub/utils.py +8 -9
  30. ultralytics/models/fastsam/model.py +8 -6
  31. ultralytics/models/nas/model.py +5 -3
  32. ultralytics/models/rtdetr/train.py +4 -3
  33. ultralytics/models/rtdetr/val.py +6 -4
  34. ultralytics/models/sam/amg.py +13 -10
  35. ultralytics/models/sam/model.py +3 -2
  36. ultralytics/models/sam/modules/blocks.py +21 -21
  37. ultralytics/models/sam/modules/decoders.py +11 -11
  38. ultralytics/models/sam/modules/encoders.py +25 -25
  39. ultralytics/models/sam/modules/memory_attention.py +9 -8
  40. ultralytics/models/sam/modules/sam.py +8 -10
  41. ultralytics/models/sam/modules/tiny_encoder.py +21 -20
  42. ultralytics/models/sam/modules/transformer.py +6 -5
  43. ultralytics/models/sam/modules/utils.py +7 -5
  44. ultralytics/models/sam/predict.py +32 -31
  45. ultralytics/models/utils/loss.py +29 -27
  46. ultralytics/models/utils/ops.py +10 -8
  47. ultralytics/models/yolo/classify/train.py +7 -5
  48. ultralytics/models/yolo/classify/val.py +10 -8
  49. ultralytics/models/yolo/detect/predict.py +3 -3
  50. ultralytics/models/yolo/detect/train.py +8 -6
  51. ultralytics/models/yolo/detect/val.py +23 -21
  52. ultralytics/models/yolo/model.py +14 -14
  53. ultralytics/models/yolo/obb/train.py +5 -3
  54. ultralytics/models/yolo/obb/val.py +13 -10
  55. ultralytics/models/yolo/pose/train.py +7 -5
  56. ultralytics/models/yolo/pose/val.py +11 -9
  57. ultralytics/models/yolo/segment/train.py +4 -5
  58. ultralytics/models/yolo/segment/val.py +12 -10
  59. ultralytics/models/yolo/world/train.py +9 -7
  60. ultralytics/models/yolo/yoloe/train.py +7 -6
  61. ultralytics/models/yolo/yoloe/val.py +10 -8
  62. ultralytics/nn/autobackend.py +40 -52
  63. ultralytics/nn/modules/__init__.py +3 -3
  64. ultralytics/nn/modules/block.py +12 -12
  65. ultralytics/nn/modules/conv.py +4 -3
  66. ultralytics/nn/modules/head.py +46 -38
  67. ultralytics/nn/modules/transformer.py +22 -21
  68. ultralytics/nn/tasks.py +2 -2
  69. ultralytics/nn/text_model.py +6 -5
  70. ultralytics/solutions/analytics.py +7 -5
  71. ultralytics/solutions/config.py +12 -10
  72. ultralytics/solutions/distance_calculation.py +3 -3
  73. ultralytics/solutions/heatmap.py +4 -2
  74. ultralytics/solutions/object_counter.py +5 -3
  75. ultralytics/solutions/parking_management.py +4 -2
  76. ultralytics/solutions/region_counter.py +7 -5
  77. ultralytics/solutions/similarity_search.py +5 -3
  78. ultralytics/solutions/solutions.py +38 -36
  79. ultralytics/solutions/streamlit_inference.py +8 -7
  80. ultralytics/trackers/bot_sort.py +11 -9
  81. ultralytics/trackers/byte_tracker.py +17 -15
  82. ultralytics/trackers/utils/gmc.py +4 -3
  83. ultralytics/utils/__init__.py +27 -77
  84. ultralytics/utils/autobatch.py +3 -2
  85. ultralytics/utils/autodevice.py +10 -10
  86. ultralytics/utils/benchmarks.py +11 -10
  87. ultralytics/utils/callbacks/comet.py +9 -9
  88. ultralytics/utils/callbacks/platform.py +2 -1
  89. ultralytics/utils/checks.py +20 -29
  90. ultralytics/utils/downloads.py +2 -2
  91. ultralytics/utils/export.py +12 -11
  92. ultralytics/utils/files.py +8 -7
  93. ultralytics/utils/git.py +139 -0
  94. ultralytics/utils/instance.py +8 -7
  95. ultralytics/utils/logger.py +7 -6
  96. ultralytics/utils/loss.py +15 -13
  97. ultralytics/utils/metrics.py +62 -62
  98. ultralytics/utils/nms.py +346 -0
  99. ultralytics/utils/ops.py +83 -251
  100. ultralytics/utils/patches.py +6 -4
  101. ultralytics/utils/plotting.py +18 -16
  102. ultralytics/utils/tal.py +1 -1
  103. ultralytics/utils/torch_utils.py +4 -2
  104. ultralytics/utils/tqdm.py +47 -33
  105. ultralytics/utils/triton.py +3 -2
  106. {ultralytics-8.3.189.dist-info → ultralytics-8.3.191.dist-info}/METADATA +1 -1
  107. {ultralytics-8.3.189.dist-info → ultralytics-8.3.191.dist-info}/RECORD +111 -109
  108. {ultralytics-8.3.189.dist-info → ultralytics-8.3.191.dist-info}/WHEEL +0 -0
  109. {ultralytics-8.3.189.dist-info → ultralytics-8.3.191.dist-info}/entry_points.txt +0 -0
  110. {ultralytics-8.3.189.dist-info → ultralytics-8.3.191.dist-info}/licenses/LICENSE +0 -0
  111. {ultralytics-8.3.189.dist-info → ultralytics-8.3.191.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+ from __future__ import annotations
+
  import copy
- from typing import Optional

  import torch
  from torch import nn
@@ -103,7 +104,7 @@ class MemoryAttentionLayer(nn.Module):
  self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
  self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

- def _forward_sa(self, tgt: torch.Tensor, query_pos: Optional[torch.Tensor]) -> torch.Tensor:
+ def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
  """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
  tgt2 = self.norm1(tgt)
  q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -115,8 +116,8 @@ class MemoryAttentionLayer(nn.Module):
  self,
  tgt: torch.Tensor,
  memory: torch.Tensor,
- query_pos: Optional[torch.Tensor],
- pos: Optional[torch.Tensor],
+ query_pos: torch.Tensor | None,
+ pos: torch.Tensor | None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
  """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
@@ -140,8 +141,8 @@ class MemoryAttentionLayer(nn.Module):
  self,
  tgt: torch.Tensor,
  memory: torch.Tensor,
- pos: Optional[torch.Tensor] = None,
- query_pos: Optional[torch.Tensor] = None,
+ pos: torch.Tensor | None = None,
+ query_pos: torch.Tensor | None = None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
  """
@@ -242,8 +243,8 @@ class MemoryAttention(nn.Module):
  self,
  curr: torch.Tensor, # self-attention inputs
  memory: torch.Tensor, # cross-attention inputs
- curr_pos: Optional[torch.Tensor] = None, # pos_enc for self-attention inputs
- memory_pos: Optional[torch.Tensor] = None, # pos_enc for cross-attention inputs
+ curr_pos: torch.Tensor | None = None, # pos_enc for self-attention inputs
+ memory_pos: torch.Tensor | None = None, # pos_enc for cross-attention inputs
  num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
  ) -> torch.Tensor:
  """
@@ -3,10 +3,7 @@
  # Copyright (c) Meta Platforms, Inc. and affiliates.
  # All rights reserved.

- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- from typing import List
+ from __future__ import annotations

  import torch
  import torch.nn.functional as F
@@ -61,8 +58,8 @@ class SAMModel(nn.Module):
  image_encoder: ImageEncoderViT,
  prompt_encoder: PromptEncoder,
  mask_decoder: MaskDecoder,
- pixel_mean: List[float] = (123.675, 116.28, 103.53),
- pixel_std: List[float] = (58.395, 57.12, 57.375),
+ pixel_mean: list[float] = (123.675, 116.28, 103.53),
+ pixel_std: list[float] = (58.395, 57.12, 57.375),
  ) -> None:
  """
  Initialize the SAMModel class to predict object masks from an image and input prompts.
@@ -959,7 +956,6 @@ class SAM2Model(torch.nn.Module):
  prev_sam_mask_logits=None,
  ):
  """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
- current_out = {}
  sam_outputs, _, _ = self._track_step(
  frame_idx,
  is_init_cond_frame,
@@ -975,9 +971,11 @@ class SAM2Model(torch.nn.Module):
  )
  _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

- current_out["pred_masks"] = low_res_masks
- current_out["pred_masks_high_res"] = high_res_masks
- current_out["obj_ptr"] = obj_ptr
+ current_out = {
+ "pred_masks": low_res_masks,
+ "pred_masks_high_res": high_res_masks,
+ "obj_ptr": obj_ptr,
+ }
  if not self.training:
  # Only add this in inference (to avoid unused param in activation checkpointing;
  # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
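
Alongside the typing changes, the SAM2 tracking step is tidied: the empty `current_out = {}` created before `_track_step` is dropped, and the output dictionary is now built as a single literal once the decoder outputs exist (the same refactor appears again in the `track_step` hunks further down). A small self-contained sketch of the refactor; the tensor shapes are placeholders, not the model's real sizes:

import torch

# Placeholder outputs standing in for the SAM mask-decoder results.
low_res_masks = torch.zeros(1, 1, 256, 256)
high_res_masks = torch.zeros(1, 1, 1024, 1024)
obj_ptr = torch.zeros(1, 256)

# Before (8.3.189): dict created early, then filled field by field.
current_out = {}
current_out["pred_masks"] = low_res_masks
current_out["pred_masks_high_res"] = high_res_masks
current_out["obj_ptr"] = obj_ptr

# After (8.3.191): one literal, built where the values are available.
current_out = {
    "pred_masks": low_res_masks,
    "pred_masks_high_res": high_res_masks,
    "obj_ptr": obj_ptr,
}
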
@@ -9,8 +9,9 @@
  # Build the TinyViT Model
  # --------------------------------------------------------

+ from __future__ import annotations
+
  import itertools
- from typing import List, Optional, Tuple, Union

  import torch
  import torch.nn as nn
@@ -106,7 +107,7 @@ class PatchEmbed(nn.Module):
  activation (nn.Module): Activation function to use between convolutions.
  """
  super().__init__()
- img_size: Tuple[int, int] = to_2tuple(resolution)
+ img_size: tuple[int, int] = to_2tuple(resolution)
  self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
  self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
  self.in_chans = in_chans
@@ -219,7 +220,7 @@ class PatchMerging(nn.Module):
  torch.Size([4, 3136, 128])
  """

- def __init__(self, input_resolution: Tuple[int, int], dim: int, out_dim: int, activation):
+ def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
  """
  Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.

@@ -283,13 +284,13 @@ class ConvLayer(nn.Module):
  def __init__(
  self,
  dim: int,
- input_resolution: Tuple[int, int],
+ input_resolution: tuple[int, int],
  depth: int,
  activation,
- drop_path: Union[float, List[float]] = 0.0,
- downsample: Optional[nn.Module] = None,
+ drop_path: float | list[float] = 0.0,
+ downsample: nn.Module | None = None,
  use_checkpoint: bool = False,
- out_dim: Optional[int] = None,
+ out_dim: int | None = None,
  conv_expand_ratio: float = 4.0,
  ):
  """
@@ -370,8 +371,8 @@ class MLP(nn.Module):
  def __init__(
  self,
  in_features: int,
- hidden_features: Optional[int] = None,
- out_features: Optional[int] = None,
+ hidden_features: int | None = None,
+ out_features: int | None = None,
  activation=nn.GELU,
  drop: float = 0.0,
  ):
@@ -441,7 +442,7 @@ class Attention(torch.nn.Module):
  key_dim: int,
  num_heads: int = 8,
  attn_ratio: float = 4,
- resolution: Tuple[int, int] = (14, 14),
+ resolution: tuple[int, int] = (14, 14),
  ):
  """
  Initialize the Attention module for multi-head attention with spatial awareness.
@@ -549,7 +550,7 @@ class TinyViTBlock(nn.Module):
  def __init__(
  self,
  dim: int,
- input_resolution: Tuple[int, int],
+ input_resolution: tuple[int, int],
  num_heads: int,
  window_size: int = 7,
  mlp_ratio: float = 4.0,
@@ -690,18 +691,18 @@ class BasicLayer(nn.Module):
  def __init__(
  self,
  dim: int,
- input_resolution: Tuple[int, int],
+ input_resolution: tuple[int, int],
  depth: int,
  num_heads: int,
  window_size: int,
  mlp_ratio: float = 4.0,
  drop: float = 0.0,
- drop_path: Union[float, List[float]] = 0.0,
- downsample: Optional[nn.Module] = None,
+ drop_path: float | list[float] = 0.0,
+ downsample: nn.Module | None = None,
  use_checkpoint: bool = False,
  local_conv_size: int = 3,
  activation=nn.GELU,
- out_dim: Optional[int] = None,
+ out_dim: int | None = None,
  ):
  """
  Initialize a BasicLayer in the TinyViT architecture.
@@ -800,10 +801,10 @@ class TinyViT(nn.Module):
  img_size: int = 224,
  in_chans: int = 3,
  num_classes: int = 1000,
- embed_dims: Tuple[int, int, int, int] = (96, 192, 384, 768),
- depths: Tuple[int, int, int, int] = (2, 2, 6, 2),
- num_heads: Tuple[int, int, int, int] = (3, 6, 12, 24),
- window_sizes: Tuple[int, int, int, int] = (7, 7, 14, 7),
+ embed_dims: tuple[int, int, int, int] = (96, 192, 384, 768),
+ depths: tuple[int, int, int, int] = (2, 2, 6, 2),
+ num_heads: tuple[int, int, int, int] = (3, 6, 12, 24),
+ window_sizes: tuple[int, int, int, int] = (7, 7, 14, 7),
  mlp_ratio: float = 4.0,
  drop_rate: float = 0.0,
  drop_path_rate: float = 0.1,
@@ -980,7 +981,7 @@ class TinyViT(nn.Module):
  """Perform the forward pass through the TinyViT model, extracting features from the input image."""
  return self.forward_features(x)

- def set_imgsz(self, imgsz: List[int] = [1024, 1024]):
+ def set_imgsz(self, imgsz: list[int] = [1024, 1024]):
  """Set image size to make model compatible with different image sizes."""
  imgsz = [s // 4 for s in imgsz]
  self.patches_resolution = imgsz
@@ -1,7 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+ from __future__ import annotations
+
  import math
- from typing import Tuple, Type

  import torch
  from torch import Tensor, nn
@@ -44,7 +45,7 @@ class TwoWayTransformer(nn.Module):
  embedding_dim: int,
  num_heads: int,
  mlp_dim: int,
- activation: Type[nn.Module] = nn.ReLU,
+ activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  ) -> None:
  """
@@ -85,7 +86,7 @@ class TwoWayTransformer(nn.Module):
  image_embedding: torch.Tensor,
  image_pe: torch.Tensor,
  point_embedding: torch.Tensor,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Process image and point embeddings through the Two-Way Transformer.

@@ -162,7 +163,7 @@ class TwoWayAttentionBlock(nn.Module):
  embedding_dim: int,
  num_heads: int,
  mlp_dim: int = 2048,
- activation: Type[nn.Module] = nn.ReLU,
+ activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  skip_first_layer_pe: bool = False,
  ) -> None:
@@ -198,7 +199,7 @@ class TwoWayAttentionBlock(nn.Module):

  def forward(
  self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Apply two-way attention to process query and key embeddings in a transformer block.

@@ -1,12 +1,14 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import Any, Dict, Tuple
+ from __future__ import annotations
+
+ from typing import Any

  import torch
  import torch.nn.functional as F


- def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: Dict[int, Any], max_cond_frame_num: int):
+ def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any], max_cond_frame_num: int):
  """
  Select the closest conditioning frames to a given frame index.

@@ -248,7 +250,7 @@ def window_partition(x: torch.Tensor, window_size: int):
  return windows, (Hp, Wp)


- def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]):
+ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[int, int], hw: tuple[int, int]):
  """
  Unpartition windowed sequences into original sequences and remove padding.

@@ -333,8 +335,8 @@ def add_decomposed_rel_pos(
  q: torch.Tensor,
  rel_pos_h: torch.Tensor,
  rel_pos_w: torch.Tensor,
- q_size: Tuple[int, int],
- k_size: Tuple[int, int],
+ q_size: tuple[int, int],
+ k_size: tuple[int, int],
  ) -> torch.Tensor:
  """
  Add decomposed Relative Positional Embeddings to the attention map.
@@ -8,8 +8,10 @@ using SAM. It forms an integral part of the Ultralytics framework and is designe
  segmentation tasks.
  """

+ from __future__ import annotations
+
  from collections import OrderedDict
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any

  import cv2
  import numpy as np
@@ -1717,9 +1719,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  def __init__(
  self,
  cfg: Any = DEFAULT_CFG,
- overrides: Optional[Dict[str, Any]] = None,
+ overrides: dict[str, Any] | None = None,
  max_obj_num: int = 3,
- _callbacks: Optional[Dict[str, Any]] = None,
+ _callbacks: dict[str, Any] | None = None,
  ) -> None:
  """
  Initialize the predictor with configuration and optional overrides.
@@ -1759,14 +1761,14 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  @smart_inference_mode()
  def inference(
  self,
- img: Union[torch.Tensor, np.ndarray],
- bboxes: Optional[List[List[float]]] = None,
- masks: Optional[Union[torch.Tensor, np.ndarray]] = None,
- points: Optional[List[List[float]]] = None,
- labels: Optional[List[int]] = None,
- obj_ids: Optional[List[int]] = None,
+ img: torch.Tensor | np.ndarray,
+ bboxes: list[list[float]] | None = None,
+ masks: torch.Tensor | np.ndarray | None = None,
+ points: list[list[float]] | None = None,
+ labels: list[int] | None = None,
+ obj_ids: list[int] | None = None,
  update_memory: bool = False,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Perform inference on a single image with optional bounding boxes, masks, points and object IDs.
  It has two modes: one is to run inference on a single image without updating the memory,
@@ -1824,7 +1826,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  pred_scores = torch.clamp_(pred_scores / 32, min=0)
  return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)

- def get_im_features(self, img: Union[torch.Tensor, np.ndarray]) -> None:
+ def get_im_features(self, img: torch.Tensor | np.ndarray) -> None:
  """
  Initialize the image state by processing the input image and extracting features.

@@ -1844,10 +1846,10 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  @smart_inference_mode()
  def update_memory(
  self,
- obj_ids: List[int] = None,
- points: Optional[torch.Tensor] = None,
- labels: Optional[torch.Tensor] = None,
- masks: Optional[torch.Tensor] = None,
+ obj_ids: list[int] = None,
+ points: torch.Tensor | None = None,
+ labels: torch.Tensor | None = None,
+ masks: torch.Tensor | None = None,
  ) -> None:
  """
  Append the imgState to the memory_bank and update the memory for the model.
@@ -1923,7 +1925,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
  self.memory_bank.append(consolidated_out)

- def _prepare_memory_conditioned_features(self, obj_idx: Optional[int]) -> torch.Tensor:
+ def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
  """
  Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
  prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features for all
@@ -1958,7 +1960,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  *self.feat_sizes[-1],
  )

- def get_maskmem_enc(self) -> Tuple[torch.Tensor, torch.Tensor]:
+ def get_maskmem_enc(self) -> tuple[torch.Tensor, torch.Tensor]:
  """Get the memory and positional encoding from the memory, which is used to condition the current image
  features.
  """
@@ -1973,7 +1975,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
  return memory, memory_pos_embed

- def _obj_id_to_idx(self, obj_id: int) -> Optional[int]:
+ def _obj_id_to_idx(self, obj_id: int) -> int | None:
  """
  Map client-side object id to model-side object index.

@@ -1987,11 +1989,11 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):

  def track_step(
  self,
- obj_idx: Optional[int] = None,
- point: Optional[torch.Tensor] = None,
- label: Optional[torch.Tensor] = None,
- mask: Optional[torch.Tensor] = None,
- ) -> Dict[str, Any]:
+ obj_idx: int | None = None,
+ point: torch.Tensor | None = None,
+ label: torch.Tensor | None = None,
+ mask: torch.Tensor | None = None,
+ ) -> dict[str, Any]:
  """
  Tracking step for the current image state to predict masks.

@@ -2010,7 +2012,6 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  current_out (Dict[str, Any]): A dictionary containing the current output with mask predictions and object pointers.
  Keys include 'point_inputs', 'mask_inputs', 'pred_masks', 'pred_masks_high_res', 'obj_ptr', 'object_score_logits'.
  """
- current_out = {}
  if mask is not None and self.model.use_mask_input_as_output_without_sam:
  # When use_mask_input_as_output_without_sam=True, we directly output the mask input
  # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
@@ -2021,7 +2022,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  # fused the visual feature with previous memory features in the memory bank
  pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
  # calculate the first feature if adding obj_idx exists(means adding prompts)
- pix_feat_with_mem = pix_feat_with_mem[0:1] if obj_idx is not None else pix_feat_with_mem
+ pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
  _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
  backbone_features=pix_feat_with_mem,
  point_inputs={"point_coords": point, "point_labels": label} if obj_idx is not None else None,
@@ -2029,9 +2030,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  multimask_output=False,
  high_res_features=[feat[: pix_feat_with_mem.size(0)] for feat in self.high_res_features],
  )
- current_out["pred_masks"] = low_res_masks
- current_out["pred_masks_high_res"] = high_res_masks
- current_out["obj_ptr"] = obj_ptr
- current_out["object_score_logits"] = object_score_logits
-
- return current_out
+ return {
+ "pred_masks": low_res_masks,
+ "pred_masks_high_res": high_res_masks,
+ "obj_ptr": obj_ptr,
+ "object_score_logits": object_score_logits,
+ }
@@ -1,6 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import Any, Dict, List, Optional, Tuple
+ from __future__ import annotations
+
+ from typing import Any

  import torch
  import torch.nn as nn
@@ -36,7 +38,7 @@ class DETRLoss(nn.Module):
  def __init__(
  self,
  nc: int = 80,
- loss_gain: Optional[Dict[str, float]] = None,
+ loss_gain: dict[str, float] | None = None,
  aux_loss: bool = True,
  use_fl: bool = True,
  use_vfl: bool = False,
@@ -79,7 +81,7 @@ class DETRLoss(nn.Module):

  def _get_loss_class(
  self, pred_scores: torch.Tensor, targets: torch.Tensor, gt_scores: torch.Tensor, num_gts: int, postfix: str = ""
- ) -> Dict[str, torch.Tensor]:
+ ) -> dict[str, torch.Tensor]:
  """
  Compute classification loss based on predictions, target values, and ground truth scores.

@@ -121,7 +123,7 @@ class DETRLoss(nn.Module):

  def _get_loss_bbox(
  self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, postfix: str = ""
- ) -> Dict[str, torch.Tensor]:
+ ) -> dict[str, torch.Tensor]:
  """
  Compute bounding box and GIoU losses for predicted and ground truth bounding boxes.

@@ -191,12 +193,12 @@ class DETRLoss(nn.Module):
  pred_scores: torch.Tensor,
  gt_bboxes: torch.Tensor,
  gt_cls: torch.Tensor,
- gt_groups: List[int],
- match_indices: Optional[List[Tuple]] = None,
+ gt_groups: list[int],
+ match_indices: list[tuple] | None = None,
  postfix: str = "",
- masks: Optional[torch.Tensor] = None,
- gt_mask: Optional[torch.Tensor] = None,
- ) -> Dict[str, torch.Tensor]:
+ masks: torch.Tensor | None = None,
+ gt_mask: torch.Tensor | None = None,
+ ) -> dict[str, torch.Tensor]:
  """
  Get auxiliary losses for intermediate decoder layers.

@@ -258,7 +260,7 @@ class DETRLoss(nn.Module):
  return loss

  @staticmethod
- def _get_index(match_indices: List[Tuple]) -> Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
+ def _get_index(match_indices: list[tuple]) -> tuple[tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
  """
  Extract batch indices, source indices, and destination indices from match indices.

@@ -275,8 +277,8 @@ class DETRLoss(nn.Module):
  return (batch_idx, src_idx), dst_idx

  def _get_assigned_bboxes(
- self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, match_indices: List[Tuple]
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, match_indices: list[tuple]
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Assign predicted bounding boxes to ground truth bounding boxes based on match indices.

@@ -309,12 +311,12 @@ class DETRLoss(nn.Module):
  pred_scores: torch.Tensor,
  gt_bboxes: torch.Tensor,
  gt_cls: torch.Tensor,
- gt_groups: List[int],
- masks: Optional[torch.Tensor] = None,
- gt_mask: Optional[torch.Tensor] = None,
+ gt_groups: list[int],
+ masks: torch.Tensor | None = None,
+ gt_mask: torch.Tensor | None = None,
  postfix: str = "",
- match_indices: Optional[List[Tuple]] = None,
- ) -> Dict[str, torch.Tensor]:
+ match_indices: list[tuple] | None = None,
+ ) -> dict[str, torch.Tensor]:
  """
  Calculate losses for a single prediction layer.

@@ -358,10 +360,10 @@ class DETRLoss(nn.Module):
  self,
  pred_bboxes: torch.Tensor,
  pred_scores: torch.Tensor,
- batch: Dict[str, Any],
+ batch: dict[str, Any],
  postfix: str = "",
  **kwargs: Any,
- ) -> Dict[str, torch.Tensor]:
+ ) -> dict[str, torch.Tensor]:
  """
  Calculate loss for predicted bounding boxes and scores.

@@ -407,12 +409,12 @@ class RTDETRDetectionLoss(DETRLoss):

  def forward(
  self,
- preds: Tuple[torch.Tensor, torch.Tensor],
- batch: Dict[str, Any],
- dn_bboxes: Optional[torch.Tensor] = None,
- dn_scores: Optional[torch.Tensor] = None,
- dn_meta: Optional[Dict[str, Any]] = None,
- ) -> Dict[str, torch.Tensor]:
+ preds: tuple[torch.Tensor, torch.Tensor],
+ batch: dict[str, Any],
+ dn_bboxes: torch.Tensor | None = None,
+ dn_scores: torch.Tensor | None = None,
+ dn_meta: dict[str, Any] | None = None,
+ ) -> dict[str, torch.Tensor]:
  """
  Forward pass to compute detection loss with optional denoising loss.

@@ -448,8 +450,8 @@ class RTDETRDetectionLoss(DETRLoss):

  @staticmethod
  def get_dn_match_indices(
- dn_pos_idx: List[torch.Tensor], dn_num_group: int, gt_groups: List[int]
- ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+ dn_pos_idx: list[torch.Tensor], dn_num_group: int, gt_groups: list[int]
+ ) -> list[tuple[torch.Tensor, torch.Tensor]]:
  """
  Get match indices for denoising.

@@ -1,6 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import Any, Dict, List, Optional, Tuple
+ from __future__ import annotations
+
+ from typing import Any

  import torch
  import torch.nn as nn
@@ -47,7 +49,7 @@ class HungarianMatcher(nn.Module):

  def __init__(
  self,
- cost_gain: Optional[Dict[str, float]] = None,
+ cost_gain: dict[str, float] | None = None,
  use_fl: bool = True,
  with_mask: bool = False,
  num_sample_points: int = 12544,
@@ -82,10 +84,10 @@ class HungarianMatcher(nn.Module):
  pred_scores: torch.Tensor,
  gt_bboxes: torch.Tensor,
  gt_cls: torch.Tensor,
- gt_groups: List[int],
- masks: Optional[torch.Tensor] = None,
- gt_mask: Optional[List[torch.Tensor]] = None,
- ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
+ gt_groups: list[int],
+ masks: torch.Tensor | None = None,
+ gt_mask: list[torch.Tensor] | None = None,
+ ) -> list[tuple[torch.Tensor, torch.Tensor]]:
  """
  Compute optimal assignment between predictions and ground truth using Hungarian algorithm.

@@ -187,7 +189,7 @@ class HungarianMatcher(nn.Module):


  def get_cdn_group(
- batch: Dict[str, Any],
+ batch: dict[str, Any],
  num_classes: int,
  num_queries: int,
  class_embed: torch.Tensor,
@@ -195,7 +197,7 @@ def get_cdn_group(
  cls_noise_ratio: float = 0.5,
  box_noise_scale: float = 1.0,
  training: bool = False,
- ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor], Optional[Dict[str, Any]]]:
+ ) -> tuple[torch.Tensor | None, torch.Tensor | None, torch.Tensor | None, dict[str, Any] | None]:
  """
  Generate contrastive denoising training group with positive and negative samples from ground truths.