dgenerate-ultralytics-headless 8.3.190__py3-none-any.whl → 8.3.191__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/RECORD +102 -101
  3. tests/test_cuda.py +6 -5
  4. tests/test_exports.py +1 -6
  5. tests/test_python.py +1 -4
  6. tests/test_solutions.py +1 -1
  7. ultralytics/__init__.py +1 -1
  8. ultralytics/cfg/__init__.py +16 -14
  9. ultralytics/cfg/datasets/VisDrone.yaml +4 -4
  10. ultralytics/data/annotator.py +6 -6
  11. ultralytics/data/augment.py +53 -51
  12. ultralytics/data/base.py +15 -13
  13. ultralytics/data/build.py +7 -4
  14. ultralytics/data/converter.py +9 -10
  15. ultralytics/data/dataset.py +24 -22
  16. ultralytics/data/loaders.py +13 -11
  17. ultralytics/data/split.py +4 -3
  18. ultralytics/data/split_dota.py +14 -12
  19. ultralytics/data/utils.py +29 -23
  20. ultralytics/engine/exporter.py +2 -2
  21. ultralytics/engine/model.py +16 -14
  22. ultralytics/engine/predictor.py +8 -6
  23. ultralytics/engine/results.py +54 -52
  24. ultralytics/engine/trainer.py +7 -2
  25. ultralytics/engine/tuner.py +4 -3
  26. ultralytics/hub/google/__init__.py +7 -6
  27. ultralytics/hub/session.py +8 -6
  28. ultralytics/hub/utils.py +3 -4
  29. ultralytics/models/fastsam/model.py +8 -6
  30. ultralytics/models/nas/model.py +5 -3
  31. ultralytics/models/rtdetr/train.py +4 -3
  32. ultralytics/models/rtdetr/val.py +6 -4
  33. ultralytics/models/sam/amg.py +13 -10
  34. ultralytics/models/sam/model.py +3 -2
  35. ultralytics/models/sam/modules/blocks.py +21 -21
  36. ultralytics/models/sam/modules/decoders.py +11 -11
  37. ultralytics/models/sam/modules/encoders.py +25 -25
  38. ultralytics/models/sam/modules/memory_attention.py +9 -8
  39. ultralytics/models/sam/modules/sam.py +8 -10
  40. ultralytics/models/sam/modules/tiny_encoder.py +21 -20
  41. ultralytics/models/sam/modules/transformer.py +6 -5
  42. ultralytics/models/sam/modules/utils.py +7 -5
  43. ultralytics/models/sam/predict.py +32 -31
  44. ultralytics/models/utils/loss.py +29 -27
  45. ultralytics/models/utils/ops.py +10 -8
  46. ultralytics/models/yolo/classify/train.py +7 -5
  47. ultralytics/models/yolo/classify/val.py +10 -8
  48. ultralytics/models/yolo/detect/predict.py +1 -1
  49. ultralytics/models/yolo/detect/train.py +8 -6
  50. ultralytics/models/yolo/detect/val.py +21 -19
  51. ultralytics/models/yolo/model.py +14 -14
  52. ultralytics/models/yolo/obb/train.py +5 -3
  53. ultralytics/models/yolo/obb/val.py +11 -9
  54. ultralytics/models/yolo/pose/train.py +7 -5
  55. ultralytics/models/yolo/pose/val.py +11 -9
  56. ultralytics/models/yolo/segment/train.py +4 -5
  57. ultralytics/models/yolo/segment/val.py +12 -10
  58. ultralytics/models/yolo/world/train.py +9 -7
  59. ultralytics/models/yolo/yoloe/train.py +7 -6
  60. ultralytics/models/yolo/yoloe/val.py +10 -8
  61. ultralytics/nn/autobackend.py +17 -19
  62. ultralytics/nn/modules/block.py +12 -12
  63. ultralytics/nn/modules/conv.py +4 -3
  64. ultralytics/nn/modules/head.py +41 -37
  65. ultralytics/nn/modules/transformer.py +22 -21
  66. ultralytics/nn/tasks.py +2 -2
  67. ultralytics/nn/text_model.py +6 -5
  68. ultralytics/solutions/analytics.py +7 -5
  69. ultralytics/solutions/config.py +12 -10
  70. ultralytics/solutions/distance_calculation.py +3 -3
  71. ultralytics/solutions/heatmap.py +4 -2
  72. ultralytics/solutions/object_counter.py +5 -3
  73. ultralytics/solutions/parking_management.py +4 -2
  74. ultralytics/solutions/region_counter.py +7 -5
  75. ultralytics/solutions/similarity_search.py +5 -3
  76. ultralytics/solutions/solutions.py +38 -36
  77. ultralytics/solutions/streamlit_inference.py +8 -7
  78. ultralytics/trackers/bot_sort.py +11 -9
  79. ultralytics/trackers/byte_tracker.py +17 -15
  80. ultralytics/trackers/utils/gmc.py +4 -3
  81. ultralytics/utils/__init__.py +16 -88
  82. ultralytics/utils/autobatch.py +3 -2
  83. ultralytics/utils/autodevice.py +10 -10
  84. ultralytics/utils/benchmarks.py +11 -10
  85. ultralytics/utils/callbacks/comet.py +9 -9
  86. ultralytics/utils/checks.py +17 -26
  87. ultralytics/utils/export.py +12 -11
  88. ultralytics/utils/files.py +8 -7
  89. ultralytics/utils/git.py +139 -0
  90. ultralytics/utils/instance.py +8 -7
  91. ultralytics/utils/loss.py +15 -13
  92. ultralytics/utils/metrics.py +62 -62
  93. ultralytics/utils/ops.py +3 -2
  94. ultralytics/utils/patches.py +6 -4
  95. ultralytics/utils/plotting.py +18 -16
  96. ultralytics/utils/torch_utils.py +4 -2
  97. ultralytics/utils/tqdm.py +15 -12
  98. ultralytics/utils/triton.py +3 -2
  99. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/WHEEL +0 -0
  100. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/entry_points.txt +0 -0
  101. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/licenses/LICENSE +0 -0
  102. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/top_level.txt +0 -0
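Most of the per-file churn above is a type-hint modernization: imports of typing.List, Tuple, Dict, Optional, and Union are dropped in favor of built-in generics (PEP 585) and `X | Y` unions (PEP 604), with `from __future__ import annotations` (PEP 563) added so the newer syntax remains import-safe on older interpreters. A minimal sketch of the before/after pattern (the `scale` helper is illustrative only, not an Ultralytics function):

```python
# Illustrative sketch of the annotation style this release migrates to.
from __future__ import annotations  # postpone evaluation so new syntax works on older Pythons

# Before (8.3.190 style):
#   from typing import List, Optional, Tuple
#   def scale(sizes: List[int], factor: Optional[float] = None) -> Tuple[int, ...]: ...

# After (8.3.191 style): no typing imports needed for these hints.
def scale(sizes: list[int], factor: float | None = None) -> tuple[int, ...]:
    """Scale each size by factor (1.0 when factor is None)."""
    f = 1.0 if factor is None else factor
    return tuple(int(s * f) for s in sizes)

print(scale([640, 480], 0.5))  # (320, 240)
```

The hunks below, from the SAM modules, apply exactly this substitution, plus a few small refactors such as building result dicts as literals.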
ultralytics/models/sam/modules/encoders.py
@@ -1,6 +1,6 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import List, Optional, Tuple, Type
+ from __future__ import annotations

  import torch
  import torch.nn as nn
@@ -56,13 +56,13 @@ class ImageEncoderViT(nn.Module):
  mlp_ratio: float = 4.0,
  out_chans: int = 256,
  qkv_bias: bool = True,
- norm_layer: Type[nn.Module] = nn.LayerNorm,
- act_layer: Type[nn.Module] = nn.GELU,
+ norm_layer: type[nn.Module] = nn.LayerNorm,
+ act_layer: type[nn.Module] = nn.GELU,
  use_abs_pos: bool = True,
  use_rel_pos: bool = False,
  rel_pos_zero_init: bool = True,
  window_size: int = 0,
- global_attn_indexes: Tuple[int, ...] = (),
+ global_attn_indexes: tuple[int, ...] = (),
  ) -> None:
  """
  Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
@@ -101,7 +101,7 @@ class ImageEncoderViT(nn.Module):
  embed_dim=embed_dim,
  )

- self.pos_embed: Optional[nn.Parameter] = None
+ self.pos_embed: nn.Parameter | None = None
  if use_abs_pos:
  # Initialize absolute positional embedding with pretrain image size
  self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
@@ -188,10 +188,10 @@ class PromptEncoder(nn.Module):
  def __init__(
  self,
  embed_dim: int,
- image_embedding_size: Tuple[int, int],
- input_image_size: Tuple[int, int],
+ image_embedding_size: tuple[int, int],
+ input_image_size: tuple[int, int],
  mask_in_chans: int,
- activation: Type[nn.Module] = nn.GELU,
+ activation: type[nn.Module] = nn.GELU,
  ) -> None:
  """
  Initialize the PromptEncoder module for encoding various types of prompts.
@@ -286,9 +286,9 @@ class PromptEncoder(nn.Module):

  @staticmethod
  def _get_batch_size(
- points: Optional[Tuple[torch.Tensor, torch.Tensor]],
- boxes: Optional[torch.Tensor],
- masks: Optional[torch.Tensor],
+ points: tuple[torch.Tensor, torch.Tensor] | None,
+ boxes: torch.Tensor | None,
+ masks: torch.Tensor | None,
  ) -> int:
  """Get the batch size of the output given the batch size of the input prompts."""
  if points is not None:
@@ -302,10 +302,10 @@ class PromptEncoder(nn.Module):

  def forward(
  self,
- points: Optional[Tuple[torch.Tensor, torch.Tensor]],
- boxes: Optional[torch.Tensor],
- masks: Optional[torch.Tensor],
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ points: tuple[torch.Tensor, torch.Tensor] | None,
+ boxes: torch.Tensor | None,
+ masks: torch.Tensor | None,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Embed different types of prompts, returning both sparse and dense embeddings.

@@ -542,13 +542,13 @@ class FpnNeck(nn.Module):
  def __init__(
  self,
  d_model: int,
- backbone_channel_list: List[int],
+ backbone_channel_list: list[int],
  kernel_size: int = 1,
  stride: int = 1,
  padding: int = 0,
  fpn_interp_model: str = "bilinear",
  fuse_type: str = "sum",
- fpn_top_down_levels: Optional[List[int]] = None,
+ fpn_top_down_levels: list[int] | None = None,
  ):
  """
  Initialize a modified Feature Pyramid Network (FPN) neck.
@@ -602,7 +602,7 @@ class FpnNeck(nn.Module):
  fpn_top_down_levels = range(len(self.convs))
  self.fpn_top_down_levels = list(fpn_top_down_levels)

- def forward(self, xs: List[torch.Tensor]):
+ def forward(self, xs: list[torch.Tensor]):
  """
  Perform forward pass through the Feature Pyramid Network (FPN) neck.

@@ -695,20 +695,20 @@ class Hiera(nn.Module):
  num_heads: int = 1, # initial number of heads
  drop_path_rate: float = 0.0, # stochastic depth
  q_pool: int = 3, # number of q_pool stages
- q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages
- stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
+ q_stride: tuple[int, int] = (2, 2), # downsample stride bet. stages
+ stages: tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
  dim_mul: float = 2.0, # dim_mul factor at stage shift
  head_mul: float = 2.0, # head_mul factor at stage shift
- window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
+ window_pos_embed_bkg_spatial_size: tuple[int, int] = (14, 14),
  # window size per stage, when not using global att.
- window_spec: Tuple[int, ...] = (
+ window_spec: tuple[int, ...] = (
  8,
  4,
  14,
  7,
  ),
  # global attn in these blocks
- global_att_blocks: Tuple[int, ...] = (
+ global_att_blocks: tuple[int, ...] = (
  12,
  16,
  20,
@@ -806,7 +806,7 @@ class Hiera(nn.Module):
  else [self.blocks[-1].dim_out]
  )

- def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
+ def _get_pos_embed(self, hw: tuple[int, int]) -> torch.Tensor:
  """Generate positional embeddings by interpolating and combining window and background embeddings."""
  h, w = hw
  window_embed = self.pos_embed_window
@@ -815,7 +815,7 @@ class Hiera(nn.Module):
  pos_embed = pos_embed.permute(0, 2, 3, 1)
  return pos_embed

- def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+ def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
  """
  Perform forward pass through Hiera model, extracting multiscale features from input images.

ultralytics/models/sam/modules/memory_attention.py
@@ -1,7 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+ from __future__ import annotations
+
  import copy
- from typing import Optional

  import torch
  from torch import nn
@@ -103,7 +104,7 @@ class MemoryAttentionLayer(nn.Module):
  self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
  self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

- def _forward_sa(self, tgt: torch.Tensor, query_pos: Optional[torch.Tensor]) -> torch.Tensor:
+ def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
  """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
  tgt2 = self.norm1(tgt)
  q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -115,8 +116,8 @@ class MemoryAttentionLayer(nn.Module):
  self,
  tgt: torch.Tensor,
  memory: torch.Tensor,
- query_pos: Optional[torch.Tensor],
- pos: Optional[torch.Tensor],
+ query_pos: torch.Tensor | None,
+ pos: torch.Tensor | None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
  """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
@@ -140,8 +141,8 @@ class MemoryAttentionLayer(nn.Module):
  self,
  tgt: torch.Tensor,
  memory: torch.Tensor,
- pos: Optional[torch.Tensor] = None,
- query_pos: Optional[torch.Tensor] = None,
+ pos: torch.Tensor | None = None,
+ query_pos: torch.Tensor | None = None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
  """
@@ -242,8 +243,8 @@ class MemoryAttention(nn.Module):
  self,
  curr: torch.Tensor, # self-attention inputs
  memory: torch.Tensor, # cross-attention inputs
- curr_pos: Optional[torch.Tensor] = None, # pos_enc for self-attention inputs
- memory_pos: Optional[torch.Tensor] = None, # pos_enc for cross-attention inputs
+ curr_pos: torch.Tensor | None = None, # pos_enc for self-attention inputs
+ memory_pos: torch.Tensor | None = None, # pos_enc for cross-attention inputs
  num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
  ) -> torch.Tensor:
  """
ultralytics/models/sam/modules/sam.py
@@ -3,10 +3,7 @@
  # Copyright (c) Meta Platforms, Inc. and affiliates.
  # All rights reserved.

- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- from typing import List
+ from __future__ import annotations

  import torch
  import torch.nn.functional as F
@@ -61,8 +58,8 @@ class SAMModel(nn.Module):
  image_encoder: ImageEncoderViT,
  prompt_encoder: PromptEncoder,
  mask_decoder: MaskDecoder,
- pixel_mean: List[float] = (123.675, 116.28, 103.53),
- pixel_std: List[float] = (58.395, 57.12, 57.375),
+ pixel_mean: list[float] = (123.675, 116.28, 103.53),
+ pixel_std: list[float] = (58.395, 57.12, 57.375),
  ) -> None:
  """
  Initialize the SAMModel class to predict object masks from an image and input prompts.
@@ -959,7 +956,6 @@ class SAM2Model(torch.nn.Module):
  prev_sam_mask_logits=None,
  ):
  """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
- current_out = {}
  sam_outputs, _, _ = self._track_step(
  frame_idx,
  is_init_cond_frame,
@@ -975,9 +971,11 @@ class SAM2Model(torch.nn.Module):
  )
  _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

- current_out["pred_masks"] = low_res_masks
- current_out["pred_masks_high_res"] = high_res_masks
- current_out["obj_ptr"] = obj_ptr
+ current_out = {
+ "pred_masks": low_res_masks,
+ "pred_masks_high_res": high_res_masks,
+ "obj_ptr": obj_ptr,
+ }
  if not self.training:
  # Only add this in inference (to avoid unused param in activation checkpointing;
  # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
ultralytics/models/sam/modules/tiny_encoder.py
@@ -9,8 +9,9 @@
  # Build the TinyViT Model
  # --------------------------------------------------------

+ from __future__ import annotations
+
  import itertools
- from typing import List, Optional, Tuple, Union

  import torch
  import torch.nn as nn
@@ -106,7 +107,7 @@ class PatchEmbed(nn.Module):
  activation (nn.Module): Activation function to use between convolutions.
  """
  super().__init__()
- img_size: Tuple[int, int] = to_2tuple(resolution)
+ img_size: tuple[int, int] = to_2tuple(resolution)
  self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
  self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
  self.in_chans = in_chans
@@ -219,7 +220,7 @@ class PatchMerging(nn.Module):
  torch.Size([4, 3136, 128])
  """

- def __init__(self, input_resolution: Tuple[int, int], dim: int, out_dim: int, activation):
+ def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
  """
  Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.

@@ -283,13 +284,13 @@ class ConvLayer(nn.Module):
  def __init__(
  self,
  dim: int,
- input_resolution: Tuple[int, int],
+ input_resolution: tuple[int, int],
  depth: int,
  activation,
- drop_path: Union[float, List[float]] = 0.0,
- downsample: Optional[nn.Module] = None,
+ drop_path: float | list[float] = 0.0,
+ downsample: nn.Module | None = None,
  use_checkpoint: bool = False,
- out_dim: Optional[int] = None,
+ out_dim: int | None = None,
  conv_expand_ratio: float = 4.0,
  ):
  """
@@ -370,8 +371,8 @@ class MLP(nn.Module):
  def __init__(
  self,
  in_features: int,
- hidden_features: Optional[int] = None,
- out_features: Optional[int] = None,
+ hidden_features: int | None = None,
+ out_features: int | None = None,
  activation=nn.GELU,
  drop: float = 0.0,
  ):
@@ -441,7 +442,7 @@ class Attention(torch.nn.Module):
  key_dim: int,
  num_heads: int = 8,
  attn_ratio: float = 4,
- resolution: Tuple[int, int] = (14, 14),
+ resolution: tuple[int, int] = (14, 14),
  ):
  """
  Initialize the Attention module for multi-head attention with spatial awareness.
@@ -549,7 +550,7 @@ class TinyViTBlock(nn.Module):
  def __init__(
  self,
  dim: int,
- input_resolution: Tuple[int, int],
+ input_resolution: tuple[int, int],
  num_heads: int,
  window_size: int = 7,
  mlp_ratio: float = 4.0,
@@ -690,18 +691,18 @@ class BasicLayer(nn.Module):
  def __init__(
  self,
  dim: int,
- input_resolution: Tuple[int, int],
+ input_resolution: tuple[int, int],
  depth: int,
  num_heads: int,
  window_size: int,
  mlp_ratio: float = 4.0,
  drop: float = 0.0,
- drop_path: Union[float, List[float]] = 0.0,
- downsample: Optional[nn.Module] = None,
+ drop_path: float | list[float] = 0.0,
+ downsample: nn.Module | None = None,
  use_checkpoint: bool = False,
  local_conv_size: int = 3,
  activation=nn.GELU,
- out_dim: Optional[int] = None,
+ out_dim: int | None = None,
  ):
  """
  Initialize a BasicLayer in the TinyViT architecture.
@@ -800,10 +801,10 @@ class TinyViT(nn.Module):
  img_size: int = 224,
  in_chans: int = 3,
  num_classes: int = 1000,
- embed_dims: Tuple[int, int, int, int] = (96, 192, 384, 768),
- depths: Tuple[int, int, int, int] = (2, 2, 6, 2),
- num_heads: Tuple[int, int, int, int] = (3, 6, 12, 24),
- window_sizes: Tuple[int, int, int, int] = (7, 7, 14, 7),
+ embed_dims: tuple[int, int, int, int] = (96, 192, 384, 768),
+ depths: tuple[int, int, int, int] = (2, 2, 6, 2),
+ num_heads: tuple[int, int, int, int] = (3, 6, 12, 24),
+ window_sizes: tuple[int, int, int, int] = (7, 7, 14, 7),
  mlp_ratio: float = 4.0,
  drop_rate: float = 0.0,
  drop_path_rate: float = 0.1,
@@ -980,7 +981,7 @@ class TinyViT(nn.Module):
  """Perform the forward pass through the TinyViT model, extracting features from the input image."""
  return self.forward_features(x)

- def set_imgsz(self, imgsz: List[int] = [1024, 1024]):
+ def set_imgsz(self, imgsz: list[int] = [1024, 1024]):
  """Set image size to make model compatible with different image sizes."""
  imgsz = [s // 4 for s in imgsz]
  self.patches_resolution = imgsz
ultralytics/models/sam/modules/transformer.py
@@ -1,7 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+ from __future__ import annotations
+
  import math
- from typing import Tuple, Type

  import torch
  from torch import Tensor, nn
@@ -44,7 +45,7 @@ class TwoWayTransformer(nn.Module):
  embedding_dim: int,
  num_heads: int,
  mlp_dim: int,
- activation: Type[nn.Module] = nn.ReLU,
+ activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  ) -> None:
  """
@@ -85,7 +86,7 @@ class TwoWayTransformer(nn.Module):
  image_embedding: torch.Tensor,
  image_pe: torch.Tensor,
  point_embedding: torch.Tensor,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Process image and point embeddings through the Two-Way Transformer.

@@ -162,7 +163,7 @@ class TwoWayAttentionBlock(nn.Module):
  embedding_dim: int,
  num_heads: int,
  mlp_dim: int = 2048,
- activation: Type[nn.Module] = nn.ReLU,
+ activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  skip_first_layer_pe: bool = False,
  ) -> None:
@@ -198,7 +199,7 @@ class TwoWayAttentionBlock(nn.Module):

  def forward(
  self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Apply two-way attention to process query and key embeddings in a transformer block.

ultralytics/models/sam/modules/utils.py
@@ -1,12 +1,14 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import Any, Dict, Tuple
+ from __future__ import annotations
+
+ from typing import Any

  import torch
  import torch.nn.functional as F


- def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: Dict[int, Any], max_cond_frame_num: int):
+ def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any], max_cond_frame_num: int):
  """
  Select the closest conditioning frames to a given frame index.

@@ -248,7 +250,7 @@ def window_partition(x: torch.Tensor, window_size: int):
  return windows, (Hp, Wp)


- def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]):
+ def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[int, int], hw: tuple[int, int]):
  """
  Unpartition windowed sequences into original sequences and remove padding.

@@ -333,8 +335,8 @@ def add_decomposed_rel_pos(
  q: torch.Tensor,
  rel_pos_h: torch.Tensor,
  rel_pos_w: torch.Tensor,
- q_size: Tuple[int, int],
- k_size: Tuple[int, int],
+ q_size: tuple[int, int],
+ k_size: tuple[int, int],
  ) -> torch.Tensor:
  """
  Add decomposed Relative Positional Embeddings to the attention map.
ultralytics/models/sam/predict.py
@@ -8,8 +8,10 @@ using SAM. It forms an integral part of the Ultralytics framework and is designe
  segmentation tasks.
  """

+ from __future__ import annotations
+
  from collections import OrderedDict
- from typing import Any, Dict, List, Optional, Tuple, Union
+ from typing import Any

  import cv2
  import numpy as np
@@ -1717,9 +1719,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  def __init__(
  self,
  cfg: Any = DEFAULT_CFG,
- overrides: Optional[Dict[str, Any]] = None,
+ overrides: dict[str, Any] | None = None,
  max_obj_num: int = 3,
- _callbacks: Optional[Dict[str, Any]] = None,
+ _callbacks: dict[str, Any] | None = None,
  ) -> None:
  """
  Initialize the predictor with configuration and optional overrides.
@@ -1759,14 +1761,14 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  @smart_inference_mode()
  def inference(
  self,
- img: Union[torch.Tensor, np.ndarray],
- bboxes: Optional[List[List[float]]] = None,
- masks: Optional[Union[torch.Tensor, np.ndarray]] = None,
- points: Optional[List[List[float]]] = None,
- labels: Optional[List[int]] = None,
- obj_ids: Optional[List[int]] = None,
+ img: torch.Tensor | np.ndarray,
+ bboxes: list[list[float]] | None = None,
+ masks: torch.Tensor | np.ndarray | None = None,
+ points: list[list[float]] | None = None,
+ labels: list[int] | None = None,
+ obj_ids: list[int] | None = None,
  update_memory: bool = False,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Perform inference on a single image with optional bounding boxes, masks, points and object IDs.
  It has two modes: one is to run inference on a single image without updating the memory,
@@ -1824,7 +1826,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  pred_scores = torch.clamp_(pred_scores / 32, min=0)
  return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)

- def get_im_features(self, img: Union[torch.Tensor, np.ndarray]) -> None:
+ def get_im_features(self, img: torch.Tensor | np.ndarray) -> None:
  """
  Initialize the image state by processing the input image and extracting features.

@@ -1844,10 +1846,10 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  @smart_inference_mode()
  def update_memory(
  self,
- obj_ids: List[int] = None,
- points: Optional[torch.Tensor] = None,
- labels: Optional[torch.Tensor] = None,
- masks: Optional[torch.Tensor] = None,
+ obj_ids: list[int] = None,
+ points: torch.Tensor | None = None,
+ labels: torch.Tensor | None = None,
+ masks: torch.Tensor | None = None,
  ) -> None:
  """
  Append the imgState to the memory_bank and update the memory for the model.
@@ -1923,7 +1925,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
  self.memory_bank.append(consolidated_out)

- def _prepare_memory_conditioned_features(self, obj_idx: Optional[int]) -> torch.Tensor:
+ def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
  """
  Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
  prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features for all
@@ -1958,7 +1960,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  *self.feat_sizes[-1],
  )

- def get_maskmem_enc(self) -> Tuple[torch.Tensor, torch.Tensor]:
+ def get_maskmem_enc(self) -> tuple[torch.Tensor, torch.Tensor]:
  """Get the memory and positional encoding from the memory, which is used to condition the current image
  features.
  """
@@ -1973,7 +1975,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
  return memory, memory_pos_embed

- def _obj_id_to_idx(self, obj_id: int) -> Optional[int]:
+ def _obj_id_to_idx(self, obj_id: int) -> int | None:
  """
  Map client-side object id to model-side object index.

@@ -1987,11 +1989,11 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):

  def track_step(
  self,
- obj_idx: Optional[int] = None,
- point: Optional[torch.Tensor] = None,
- label: Optional[torch.Tensor] = None,
- mask: Optional[torch.Tensor] = None,
- ) -> Dict[str, Any]:
+ obj_idx: int | None = None,
+ point: torch.Tensor | None = None,
+ label: torch.Tensor | None = None,
+ mask: torch.Tensor | None = None,
+ ) -> dict[str, Any]:
  """
  Tracking step for the current image state to predict masks.

@@ -2010,7 +2012,6 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  current_out (Dict[str, Any]): A dictionary containing the current output with mask predictions and object pointers.
  Keys include 'point_inputs', 'mask_inputs', 'pred_masks', 'pred_masks_high_res', 'obj_ptr', 'object_score_logits'.
  """
- current_out = {}
  if mask is not None and self.model.use_mask_input_as_output_without_sam:
  # When use_mask_input_as_output_without_sam=True, we directly output the mask input
  # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
@@ -2021,7 +2022,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  # fused the visual feature with previous memory features in the memory bank
  pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
  # calculate the first feature if adding obj_idx exists(means adding prompts)
- pix_feat_with_mem = pix_feat_with_mem[0:1] if obj_idx is not None else pix_feat_with_mem
+ pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
  _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
  backbone_features=pix_feat_with_mem,
  point_inputs={"point_coords": point, "point_labels": label} if obj_idx is not None else None,
@@ -2029,9 +2030,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
  multimask_output=False,
  high_res_features=[feat[: pix_feat_with_mem.size(0)] for feat in self.high_res_features],
  )
- current_out["pred_masks"] = low_res_masks
- current_out["pred_masks_high_res"] = high_res_masks
- current_out["obj_ptr"] = obj_ptr
- current_out["object_score_logits"] = object_score_logits
-
- return current_out
+ return {
+ "pred_masks": low_res_masks,
+ "pred_masks_high_res": high_res_masks,
+ "obj_ptr": obj_ptr,
+ "object_score_logits": object_score_logits,
+ }
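A note on why `from __future__ import annotations` accompanies these hunks: under PEP 563, annotations are stored as strings and never evaluated at definition time, so hints like `dict[str, Any]` or `torch.Tensor | None` cannot raise at import time on interpreters that predate PEP 585/604. A standalone sketch (the `track` function is hypothetical, used only to inspect annotation handling):

```python
# Standalone sketch: with postponed evaluation, annotations are kept as strings,
# so PEP 585/604 syntax parses fine even where it would fail if evaluated eagerly.
from __future__ import annotations

import typing


def track(obj_ids: list[int] | None = None) -> dict[str, int]:
    # "track" is a made-up function used only to inspect annotation handling.
    return {"count": 0 if obj_ids is None else len(obj_ids)}


print(track.__annotations__)
# {'obj_ids': 'list[int] | None', 'return': 'dict[str, int]'}

# Resolving the strings back to real types only needs a new-enough interpreter
# (Python 3.10+ for the "|" union) at the point of resolution, not at import time.
print(typing.get_type_hints(track))
```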