dgenerate-ultralytics-headless 8.3.190__py3-none-any.whl → 8.3.192__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/METADATA +1 -1
  2. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/RECORD +103 -102
  3. tests/test_cuda.py +6 -5
  4. tests/test_exports.py +1 -6
  5. tests/test_python.py +1 -4
  6. tests/test_solutions.py +1 -1
  7. ultralytics/__init__.py +1 -1
  8. ultralytics/cfg/__init__.py +16 -14
  9. ultralytics/cfg/datasets/SKU-110K.yaml +1 -1
  10. ultralytics/cfg/datasets/VisDrone.yaml +4 -4
  11. ultralytics/data/annotator.py +6 -6
  12. ultralytics/data/augment.py +53 -51
  13. ultralytics/data/base.py +15 -13
  14. ultralytics/data/build.py +7 -4
  15. ultralytics/data/converter.py +9 -10
  16. ultralytics/data/dataset.py +24 -22
  17. ultralytics/data/loaders.py +13 -11
  18. ultralytics/data/split.py +4 -3
  19. ultralytics/data/split_dota.py +14 -12
  20. ultralytics/data/utils.py +29 -23
  21. ultralytics/engine/exporter.py +2 -2
  22. ultralytics/engine/model.py +16 -14
  23. ultralytics/engine/predictor.py +8 -6
  24. ultralytics/engine/results.py +54 -52
  25. ultralytics/engine/trainer.py +8 -3
  26. ultralytics/engine/tuner.py +230 -42
  27. ultralytics/hub/google/__init__.py +7 -6
  28. ultralytics/hub/session.py +8 -6
  29. ultralytics/hub/utils.py +3 -4
  30. ultralytics/models/fastsam/model.py +8 -6
  31. ultralytics/models/nas/model.py +5 -3
  32. ultralytics/models/rtdetr/train.py +4 -3
  33. ultralytics/models/rtdetr/val.py +6 -4
  34. ultralytics/models/sam/amg.py +13 -10
  35. ultralytics/models/sam/model.py +3 -2
  36. ultralytics/models/sam/modules/blocks.py +21 -21
  37. ultralytics/models/sam/modules/decoders.py +11 -11
  38. ultralytics/models/sam/modules/encoders.py +25 -25
  39. ultralytics/models/sam/modules/memory_attention.py +9 -8
  40. ultralytics/models/sam/modules/sam.py +8 -10
  41. ultralytics/models/sam/modules/tiny_encoder.py +21 -20
  42. ultralytics/models/sam/modules/transformer.py +6 -5
  43. ultralytics/models/sam/modules/utils.py +7 -5
  44. ultralytics/models/sam/predict.py +32 -31
  45. ultralytics/models/utils/loss.py +29 -27
  46. ultralytics/models/utils/ops.py +10 -8
  47. ultralytics/models/yolo/classify/train.py +9 -7
  48. ultralytics/models/yolo/classify/val.py +11 -9
  49. ultralytics/models/yolo/detect/predict.py +1 -1
  50. ultralytics/models/yolo/detect/train.py +8 -6
  51. ultralytics/models/yolo/detect/val.py +22 -20
  52. ultralytics/models/yolo/model.py +14 -14
  53. ultralytics/models/yolo/obb/train.py +5 -3
  54. ultralytics/models/yolo/obb/val.py +11 -9
  55. ultralytics/models/yolo/pose/train.py +7 -5
  56. ultralytics/models/yolo/pose/val.py +12 -10
  57. ultralytics/models/yolo/segment/train.py +4 -5
  58. ultralytics/models/yolo/segment/val.py +13 -11
  59. ultralytics/models/yolo/world/train.py +10 -8
  60. ultralytics/models/yolo/yoloe/train.py +10 -10
  61. ultralytics/models/yolo/yoloe/val.py +11 -9
  62. ultralytics/nn/autobackend.py +17 -19
  63. ultralytics/nn/modules/block.py +12 -12
  64. ultralytics/nn/modules/conv.py +4 -3
  65. ultralytics/nn/modules/head.py +41 -37
  66. ultralytics/nn/modules/transformer.py +22 -21
  67. ultralytics/nn/tasks.py +2 -2
  68. ultralytics/nn/text_model.py +6 -5
  69. ultralytics/solutions/analytics.py +7 -5
  70. ultralytics/solutions/config.py +12 -10
  71. ultralytics/solutions/distance_calculation.py +3 -3
  72. ultralytics/solutions/heatmap.py +4 -2
  73. ultralytics/solutions/object_counter.py +5 -3
  74. ultralytics/solutions/parking_management.py +4 -2
  75. ultralytics/solutions/region_counter.py +7 -5
  76. ultralytics/solutions/similarity_search.py +5 -3
  77. ultralytics/solutions/solutions.py +38 -36
  78. ultralytics/solutions/streamlit_inference.py +8 -7
  79. ultralytics/trackers/bot_sort.py +11 -9
  80. ultralytics/trackers/byte_tracker.py +17 -15
  81. ultralytics/trackers/utils/gmc.py +4 -3
  82. ultralytics/utils/__init__.py +16 -88
  83. ultralytics/utils/autobatch.py +3 -2
  84. ultralytics/utils/autodevice.py +10 -10
  85. ultralytics/utils/benchmarks.py +11 -10
  86. ultralytics/utils/callbacks/comet.py +9 -9
  87. ultralytics/utils/checks.py +17 -26
  88. ultralytics/utils/export.py +12 -11
  89. ultralytics/utils/files.py +8 -7
  90. ultralytics/utils/git.py +139 -0
  91. ultralytics/utils/instance.py +8 -7
  92. ultralytics/utils/loss.py +15 -13
  93. ultralytics/utils/metrics.py +62 -62
  94. ultralytics/utils/ops.py +3 -2
  95. ultralytics/utils/patches.py +6 -4
  96. ultralytics/utils/plotting.py +20 -18
  97. ultralytics/utils/torch_utils.py +4 -2
  98. ultralytics/utils/tqdm.py +18 -14
  99. ultralytics/utils/triton.py +3 -2
  100. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/WHEEL +0 -0
  101. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/entry_points.txt +0 -0
  102. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/licenses/LICENSE +0 -0
  103. {dgenerate_ultralytics_headless-8.3.190.dist-info → dgenerate_ultralytics_headless-8.3.192.dist-info}/top_level.txt +0 -0
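
Most of the source changes between 8.3.190 and 8.3.192 follow one pattern: the typing aliases List, Tuple, Dict, Optional, Union and Type are replaced with PEP 585 builtin generics (list, tuple, dict, type) and PEP 604 union syntax (X | None), and each touched module gains "from __future__ import annotations" so the new spellings are treated as postponed (string) annotations on interpreters where they are not natively subscriptable. A minimal sketch of the target style; the helper below is illustrative only and not part of the package:

    from __future__ import annotations


    def clip_boxes(boxes: list[tuple[int, int, int, int]], limit: int | None = None) -> list[tuple[int, int, int, int]]:
        """Clip xyxy boxes to a square image of side 'limit' (illustrative helper, not from ultralytics)."""
        if limit is None:
            return boxes
        return [(max(0, x0), max(0, y0), min(limit, x1), min(limit, y1)) for x0, y0, x1, y1 in boxes]

Because the __future__ import defers evaluation, such annotations never execute at runtime, which is what keeps the modernized syntax importable on older Python versions.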

ultralytics/models/sam/amg.py
@@ -1,15 +1,18 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+ from __future__ import annotations
+
  import math
+ from collections.abc import Generator
  from itertools import product
- from typing import Any, Generator, List, Tuple
+ from typing import Any

  import numpy as np
  import torch


  def is_box_near_crop_edge(
- boxes: torch.Tensor, crop_box: List[int], orig_box: List[int], atol: float = 20.0
+ boxes: torch.Tensor, crop_box: list[int], orig_box: list[int], atol: float = 20.0
  ) -> torch.Tensor:
  """
  Determine if bounding boxes are near the edge of a cropped image region using a specified tolerance.
@@ -38,7 +41,7 @@ def is_box_near_crop_edge(
  return torch.any(near_crop_edge, dim=1)


- def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]:
+ def batch_iterator(batch_size: int, *args) -> Generator[list[Any]]:
  """
  Yield batches of data from input arguments with specified batch size for efficient processing.

@@ -106,14 +109,14 @@ def build_point_grid(n_per_side: int) -> np.ndarray:
  return np.stack([points_x, points_y], axis=-1).reshape(-1, 2)


- def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer: int) -> List[np.ndarray]:
+ def build_all_layer_point_grids(n_per_side: int, n_layers: int, scale_per_layer: int) -> list[np.ndarray]:
  """Generate point grids for multiple crop layers with varying scales and densities."""
  return [build_point_grid(int(n_per_side / (scale_per_layer**i))) for i in range(n_layers + 1)]


  def generate_crop_boxes(
- im_size: Tuple[int, ...], n_layers: int, overlap_ratio: float
- ) -> Tuple[List[List[int]], List[int]]:
+ im_size: tuple[int, ...], n_layers: int, overlap_ratio: float
+ ) -> tuple[list[list[int]], list[int]]:
  """
  Generate crop boxes of varying sizes for multiscale image processing, with layered overlapping regions.

@@ -163,7 +166,7 @@ def generate_crop_boxes(
  return crop_boxes, layer_idxs


- def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: list[int]) -> torch.Tensor:
  """Uncrop bounding boxes by adding the crop box offset to their coordinates."""
  x0, y0, _, _ = crop_box
  offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device)
@@ -173,7 +176,7 @@ def uncrop_boxes_xyxy(boxes: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
  return boxes + offset


- def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
+ def uncrop_points(points: torch.Tensor, crop_box: list[int]) -> torch.Tensor:
  """Uncrop points by adding the crop box offset to their coordinates."""
  x0, y0, _, _ = crop_box
  offset = torch.tensor([[x0, y0]], device=points.device)
@@ -183,7 +186,7 @@ def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor:
  return points + offset


- def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w: int) -> torch.Tensor:
+ def uncrop_masks(masks: torch.Tensor, crop_box: list[int], orig_h: int, orig_w: int) -> torch.Tensor:
  """Uncrop masks by padding them to the original image size, handling coordinate transformations."""
  x0, y0, x1, y1 = crop_box
  if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h:
@@ -194,7 +197,7 @@ def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, orig_w:
  return torch.nn.functional.pad(masks, pad, value=0)


- def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tuple[np.ndarray, bool]:
+ def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> tuple[np.ndarray, bool]:
  """
  Remove small disconnected regions or holes in a mask based on area threshold and mode.

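
The batch_iterator change above also shortens the return annotation from Generator[List[Any], None, None] to Generator[list[Any]]. Evaluated eagerly, the single-parameter form would require Python 3.13+ (where collections.abc.Generator gained default send/return type parameters), but with the postponed annotations enabled by the new __future__ import it is never evaluated at runtime. A rough usage sketch, assuming batch_iterator keeps its documented behavior of yielding per-argument slices of equal length:

    import numpy as np

    from ultralytics.models.sam.amg import batch_iterator

    points = np.random.rand(10, 2)  # ten prompt points
    labels = np.ones(10)            # one label per point
    for batch_points, batch_labels in batch_iterator(4, points, labels):
        # Each yielded item is a list of slices, one per argument: batches of 4, 4, then 2.
        print(batch_points.shape, batch_labels.shape)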

ultralytics/models/sam/model.py
@@ -14,8 +14,9 @@ Key Features:
  - Trained on SA-1B dataset
  """

+ from __future__ import annotations
+
  from pathlib import Path
- from typing import Dict, Type

  from ultralytics.engine.model import Model
  from ultralytics.utils.torch_utils import model_info
@@ -154,7 +155,7 @@ class SAM(Model):
  return model_info(self.model, detailed=detailed, verbose=verbose)

  @property
- def task_map(self) -> Dict[str, Dict[str, Type[Predictor]]]:
+ def task_map(self) -> dict[str, dict[str, type[Predictor]]]:
  """
  Provide a mapping from the 'segment' task to its corresponding 'Predictor'.


ultralytics/models/sam/modules/blocks.py
@@ -1,9 +1,9 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+ from __future__ import annotations

  import copy
  import math
  from functools import partial
- from typing import Optional, Tuple, Type, Union

  import numpy as np
  import torch
@@ -81,7 +81,7 @@ class MaskDownSampler(nn.Module):
  stride: int = 4,
  padding: int = 0,
  total_stride: int = 16,
- activation: Type[nn.Module] = nn.GELU,
+ activation: type[nn.Module] = nn.GELU,
  ):
  """Initialize a mask downsampler module for progressive downsampling and channel expansion."""
  super().__init__()
@@ -227,7 +227,7 @@ class Fuser(nn.Module):
  torch.Size([1, 256, 32, 32])
  """

- def __init__(self, layer: nn.Module, num_layers: int, dim: Optional[int] = None, input_projection: bool = False):
+ def __init__(self, layer: nn.Module, num_layers: int, dim: int | None = None, input_projection: bool = False):
  """
  Initialize the Fuser module for feature fusion through multiple layers.

@@ -295,7 +295,7 @@ class SAM2TwoWayAttentionBlock(TwoWayAttentionBlock):
  embedding_dim: int,
  num_heads: int,
  mlp_dim: int = 2048,
- activation: Type[nn.Module] = nn.ReLU,
+ activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  skip_first_layer_pe: bool = False,
  ) -> None:
@@ -359,7 +359,7 @@ class SAM2TwoWayTransformer(TwoWayTransformer):
  embedding_dim: int,
  num_heads: int,
  mlp_dim: int,
- activation: Type[nn.Module] = nn.ReLU,
+ activation: type[nn.Module] = nn.ReLU,
  attention_downsample_rate: int = 2,
  ) -> None:
  """
@@ -432,7 +432,7 @@ class RoPEAttention(Attention):
  *args,
  rope_theta: float = 10000.0,
  rope_k_repeat: bool = False,
- feat_sizes: Tuple[int, int] = (32, 32), # [w, h] for stride 16 feats at 512 resolution
+ feat_sizes: tuple[int, int] = (32, 32), # [w, h] for stride 16 feats at 512 resolution
  **kwargs,
  ):
  """Initialize RoPEAttention with rotary position encoding for enhanced positional awareness."""
@@ -618,9 +618,9 @@ class MultiScaleBlock(nn.Module):
  num_heads: int,
  mlp_ratio: float = 4.0,
  drop_path: float = 0.0,
- norm_layer: Union[nn.Module, str] = "LayerNorm",
- q_stride: Tuple[int, int] = None,
- act_layer: Type[nn.Module] = nn.GELU,
+ norm_layer: nn.Module | str = "LayerNorm",
+ q_stride: tuple[int, int] = None,
+ act_layer: type[nn.Module] = nn.GELU,
  window_size: int = 0,
  ):
  """Initialize a multiscale attention block with window partitioning and optional query pooling."""
@@ -728,7 +728,7 @@ class PositionEmbeddingSine(nn.Module):
  num_pos_feats: int,
  temperature: int = 10000,
  normalize: bool = True,
- scale: Optional[float] = None,
+ scale: float | None = None,
  ):
  """Initialize sinusoidal position embeddings for 2D image inputs."""
  super().__init__()
@@ -744,7 +744,7 @@ class PositionEmbeddingSine(nn.Module):

  self.cache = {}

- def _encode_xy(self, x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ def _encode_xy(self, x: torch.Tensor, y: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
  """Encode 2D positions using sine/cosine functions for transformer positional embeddings."""
  assert len(x) == len(y) and x.ndim == y.ndim == 1
  x_embed = x * self.scale
@@ -833,7 +833,7 @@ class PositionEmbeddingRandom(nn.Module):
  torch.Size([128, 32, 32])
  """

- def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None:
+ def __init__(self, num_pos_feats: int = 64, scale: float | None = None) -> None:
  """Initialize random spatial frequency position embedding for transformers."""
  super().__init__()
  if scale is None or scale <= 0.0:
@@ -853,7 +853,7 @@ class PositionEmbeddingRandom(nn.Module):
  # Outputs d_1 x ... x d_n x C shape
  return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)

- def forward(self, size: Tuple[int, int]) -> torch.Tensor:
+ def forward(self, size: tuple[int, int]) -> torch.Tensor:
  """Generate positional encoding for a grid using random spatial frequencies."""
  h, w = size
  grid = torch.ones(
@@ -869,7 +869,7 @@ class PositionEmbeddingRandom(nn.Module):
  pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
  return pe.permute(2, 0, 1) # C x H x W

- def forward_with_coords(self, coords_input: torch.Tensor, image_size: Tuple[int, int]) -> torch.Tensor:
+ def forward_with_coords(self, coords_input: torch.Tensor, image_size: tuple[int, int]) -> torch.Tensor:
  """Positionally encode input coordinates, normalizing them to [0,1] based on the given image size."""
  coords = coords_input.clone()
  coords[:, :, 0] = coords[:, :, 0] / image_size[1]
@@ -910,12 +910,12 @@ class Block(nn.Module):
  num_heads: int,
  mlp_ratio: float = 4.0,
  qkv_bias: bool = True,
- norm_layer: Type[nn.Module] = nn.LayerNorm,
- act_layer: Type[nn.Module] = nn.GELU,
+ norm_layer: type[nn.Module] = nn.LayerNorm,
+ act_layer: type[nn.Module] = nn.GELU,
  use_rel_pos: bool = False,
  rel_pos_zero_init: bool = True,
  window_size: int = 0,
- input_size: Optional[Tuple[int, int]] = None,
+ input_size: tuple[int, int] | None = None,
  ) -> None:
  """
  Initialize a transformer block with optional window attention and relative positional embeddings.
@@ -1012,7 +1012,7 @@ class REAttention(nn.Module):
  qkv_bias: bool = True,
  use_rel_pos: bool = False,
  rel_pos_zero_init: bool = True,
- input_size: Optional[Tuple[int, int]] = None,
+ input_size: tuple[int, int] | None = None,
  ) -> None:
  """
  Initialize a Relative Position Attention module for transformer-based architectures.
@@ -1093,9 +1093,9 @@ class PatchEmbed(nn.Module):

  def __init__(
  self,
- kernel_size: Tuple[int, int] = (16, 16),
- stride: Tuple[int, int] = (16, 16),
- padding: Tuple[int, int] = (0, 0),
+ kernel_size: tuple[int, int] = (16, 16),
+ stride: tuple[int, int] = (16, 16),
+ padding: tuple[int, int] = (0, 0),
  in_chans: int = 3,
  embed_dim: int = 768,
  ) -> None:

ultralytics/models/sam/modules/decoders.py
@@ -1,6 +1,6 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import List, Optional, Tuple, Type
+ from __future__ import annotations

  import torch
  from torch import nn
@@ -43,7 +43,7 @@ class MaskDecoder(nn.Module):
  transformer_dim: int,
  transformer: nn.Module,
  num_multimask_outputs: int = 3,
- activation: Type[nn.Module] = nn.GELU,
+ activation: type[nn.Module] = nn.GELU,
  iou_head_depth: int = 3,
  iou_head_hidden_dim: int = 256,
  ) -> None:
@@ -93,7 +93,7 @@ class MaskDecoder(nn.Module):
  sparse_prompt_embeddings: torch.Tensor,
  dense_prompt_embeddings: torch.Tensor,
  multimask_output: bool,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Predict masks given image and prompt embeddings.

@@ -137,7 +137,7 @@ class MaskDecoder(nn.Module):
  image_pe: torch.Tensor,
  sparse_prompt_embeddings: torch.Tensor,
  dense_prompt_embeddings: torch.Tensor,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
  # Concatenate output tokens
  output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
@@ -158,7 +158,7 @@ class MaskDecoder(nn.Module):
  # Upscale mask embeddings and predict masks using the mask tokens
  src = src.transpose(1, 2).view(b, c, h, w)
  upscaled_embedding = self.output_upscaling(src)
- hyper_in_list: List[torch.Tensor] = [
+ hyper_in_list: list[torch.Tensor] = [
  self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
  ]
  hyper_in = torch.stack(hyper_in_list, dim=1)
@@ -221,7 +221,7 @@ class SAM2MaskDecoder(nn.Module):
  transformer_dim: int,
  transformer: nn.Module,
  num_multimask_outputs: int = 3,
- activation: Type[nn.Module] = nn.GELU,
+ activation: type[nn.Module] = nn.GELU,
  iou_head_depth: int = 3,
  iou_head_hidden_dim: int = 256,
  use_high_res_features: bool = False,
@@ -317,8 +317,8 @@ class SAM2MaskDecoder(nn.Module):
  dense_prompt_embeddings: torch.Tensor,
  multimask_output: bool,
  repeat_image: bool,
- high_res_features: Optional[List[torch.Tensor]] = None,
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ high_res_features: list[torch.Tensor] | None = None,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  """
  Predict masks given image and prompt embeddings.

@@ -385,8 +385,8 @@ class SAM2MaskDecoder(nn.Module):
  sparse_prompt_embeddings: torch.Tensor,
  dense_prompt_embeddings: torch.Tensor,
  repeat_image: bool,
- high_res_features: Optional[List[torch.Tensor]] = None,
- ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+ high_res_features: list[torch.Tensor] | None = None,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
  """Predict instance segmentation masks from image and prompt embeddings using a transformer."""
  # Concatenate output tokens
  s = 0
@@ -431,7 +431,7 @@ class SAM2MaskDecoder(nn.Module):
  upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
  upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)

- hyper_in_list: List[torch.Tensor] = [
+ hyper_in_list: list[torch.Tensor] = [
  self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
  ]
  hyper_in = torch.stack(hyper_in_list, dim=1)

ultralytics/models/sam/modules/encoders.py
@@ -1,6 +1,6 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

- from typing import List, Optional, Tuple, Type
+ from __future__ import annotations

  import torch
  import torch.nn as nn
@@ -56,13 +56,13 @@ class ImageEncoderViT(nn.Module):
  mlp_ratio: float = 4.0,
  out_chans: int = 256,
  qkv_bias: bool = True,
- norm_layer: Type[nn.Module] = nn.LayerNorm,
- act_layer: Type[nn.Module] = nn.GELU,
+ norm_layer: type[nn.Module] = nn.LayerNorm,
+ act_layer: type[nn.Module] = nn.GELU,
  use_abs_pos: bool = True,
  use_rel_pos: bool = False,
  rel_pos_zero_init: bool = True,
  window_size: int = 0,
- global_attn_indexes: Tuple[int, ...] = (),
+ global_attn_indexes: tuple[int, ...] = (),
  ) -> None:
  """
  Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
@@ -101,7 +101,7 @@ class ImageEncoderViT(nn.Module):
  embed_dim=embed_dim,
  )

- self.pos_embed: Optional[nn.Parameter] = None
+ self.pos_embed: nn.Parameter | None = None
  if use_abs_pos:
  # Initialize absolute positional embedding with pretrain image size
  self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
@@ -188,10 +188,10 @@ class PromptEncoder(nn.Module):
  def __init__(
  self,
  embed_dim: int,
- image_embedding_size: Tuple[int, int],
- input_image_size: Tuple[int, int],
+ image_embedding_size: tuple[int, int],
+ input_image_size: tuple[int, int],
  mask_in_chans: int,
- activation: Type[nn.Module] = nn.GELU,
+ activation: type[nn.Module] = nn.GELU,
  ) -> None:
  """
  Initialize the PromptEncoder module for encoding various types of prompts.
@@ -286,9 +286,9 @@ class PromptEncoder(nn.Module):

  @staticmethod
  def _get_batch_size(
- points: Optional[Tuple[torch.Tensor, torch.Tensor]],
- boxes: Optional[torch.Tensor],
- masks: Optional[torch.Tensor],
+ points: tuple[torch.Tensor, torch.Tensor] | None,
+ boxes: torch.Tensor | None,
+ masks: torch.Tensor | None,
  ) -> int:
  """Get the batch size of the output given the batch size of the input prompts."""
  if points is not None:
@@ -302,10 +302,10 @@ class PromptEncoder(nn.Module):

  def forward(
  self,
- points: Optional[Tuple[torch.Tensor, torch.Tensor]],
- boxes: Optional[torch.Tensor],
- masks: Optional[torch.Tensor],
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ points: tuple[torch.Tensor, torch.Tensor] | None,
+ boxes: torch.Tensor | None,
+ masks: torch.Tensor | None,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
  """
  Embed different types of prompts, returning both sparse and dense embeddings.

@@ -542,13 +542,13 @@ class FpnNeck(nn.Module):
  def __init__(
  self,
  d_model: int,
- backbone_channel_list: List[int],
+ backbone_channel_list: list[int],
  kernel_size: int = 1,
  stride: int = 1,
  padding: int = 0,
  fpn_interp_model: str = "bilinear",
  fuse_type: str = "sum",
- fpn_top_down_levels: Optional[List[int]] = None,
+ fpn_top_down_levels: list[int] | None = None,
  ):
  """
  Initialize a modified Feature Pyramid Network (FPN) neck.
@@ -602,7 +602,7 @@ class FpnNeck(nn.Module):
  fpn_top_down_levels = range(len(self.convs))
  self.fpn_top_down_levels = list(fpn_top_down_levels)

- def forward(self, xs: List[torch.Tensor]):
+ def forward(self, xs: list[torch.Tensor]):
  """
  Perform forward pass through the Feature Pyramid Network (FPN) neck.

@@ -695,20 +695,20 @@ class Hiera(nn.Module):
  num_heads: int = 1, # initial number of heads
  drop_path_rate: float = 0.0, # stochastic depth
  q_pool: int = 3, # number of q_pool stages
- q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages
- stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
+ q_stride: tuple[int, int] = (2, 2), # downsample stride bet. stages
+ stages: tuple[int, ...] = (2, 3, 16, 3), # blocks per stage
  dim_mul: float = 2.0, # dim_mul factor at stage shift
  head_mul: float = 2.0, # head_mul factor at stage shift
- window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14),
+ window_pos_embed_bkg_spatial_size: tuple[int, int] = (14, 14),
  # window size per stage, when not using global att.
- window_spec: Tuple[int, ...] = (
+ window_spec: tuple[int, ...] = (
  8,
  4,
  14,
  7,
  ),
  # global attn in these blocks
- global_att_blocks: Tuple[int, ...] = (
+ global_att_blocks: tuple[int, ...] = (
  12,
  16,
  20,
@@ -806,7 +806,7 @@ class Hiera(nn.Module):
  else [self.blocks[-1].dim_out]
  )

- def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor:
+ def _get_pos_embed(self, hw: tuple[int, int]) -> torch.Tensor:
  """Generate positional embeddings by interpolating and combining window and background embeddings."""
  h, w = hw
  window_embed = self.pos_embed_window
@@ -815,7 +815,7 @@ class Hiera(nn.Module):
  pos_embed = pos_embed.permute(0, 2, 3, 1)
  return pos_embed

- def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
+ def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
  """
  Perform forward pass through Hiera model, extracting multiscale features from input images.


ultralytics/models/sam/modules/memory_attention.py
@@ -1,7 +1,8 @@
  # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+ from __future__ import annotations
+
  import copy
- from typing import Optional

  import torch
  from torch import nn
@@ -103,7 +104,7 @@ class MemoryAttentionLayer(nn.Module):
  self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
  self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys

- def _forward_sa(self, tgt: torch.Tensor, query_pos: Optional[torch.Tensor]) -> torch.Tensor:
+ def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
  """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
  tgt2 = self.norm1(tgt)
  q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -115,8 +116,8 @@ class MemoryAttentionLayer(nn.Module):
  self,
  tgt: torch.Tensor,
  memory: torch.Tensor,
- query_pos: Optional[torch.Tensor],
- pos: Optional[torch.Tensor],
+ query_pos: torch.Tensor | None,
+ pos: torch.Tensor | None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
  """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
@@ -140,8 +141,8 @@ class MemoryAttentionLayer(nn.Module):
  self,
  tgt: torch.Tensor,
  memory: torch.Tensor,
- pos: Optional[torch.Tensor] = None,
- query_pos: Optional[torch.Tensor] = None,
+ pos: torch.Tensor | None = None,
+ query_pos: torch.Tensor | None = None,
  num_k_exclude_rope: int = 0,
  ) -> torch.Tensor:
  """
@@ -242,8 +243,8 @@ class MemoryAttention(nn.Module):
  self,
  curr: torch.Tensor, # self-attention inputs
  memory: torch.Tensor, # cross-attention inputs
- curr_pos: Optional[torch.Tensor] = None, # pos_enc for self-attention inputs
- memory_pos: Optional[torch.Tensor] = None, # pos_enc for cross-attention inputs
+ curr_pos: torch.Tensor | None = None, # pos_enc for self-attention inputs
+ memory_pos: torch.Tensor | None = None, # pos_enc for cross-attention inputs
  num_obj_ptr_tokens: int = 0, # number of object pointer *tokens*
  ) -> torch.Tensor:
  """

ultralytics/models/sam/modules/sam.py
@@ -3,10 +3,7 @@
  # Copyright (c) Meta Platforms, Inc. and affiliates.
  # All rights reserved.

- # This source code is licensed under the license found in the
- # LICENSE file in the root directory of this source tree.
-
- from typing import List
+ from __future__ import annotations

  import torch
  import torch.nn.functional as F
@@ -61,8 +58,8 @@ class SAMModel(nn.Module):
  image_encoder: ImageEncoderViT,
  prompt_encoder: PromptEncoder,
  mask_decoder: MaskDecoder,
- pixel_mean: List[float] = (123.675, 116.28, 103.53),
- pixel_std: List[float] = (58.395, 57.12, 57.375),
+ pixel_mean: list[float] = (123.675, 116.28, 103.53),
+ pixel_std: list[float] = (58.395, 57.12, 57.375),
  ) -> None:
  """
  Initialize the SAMModel class to predict object masks from an image and input prompts.
@@ -959,7 +956,6 @@ class SAM2Model(torch.nn.Module):
  prev_sam_mask_logits=None,
  ):
  """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
- current_out = {}
  sam_outputs, _, _ = self._track_step(
  frame_idx,
  is_init_cond_frame,
@@ -975,9 +971,11 @@ class SAM2Model(torch.nn.Module):
  )
  _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs

- current_out["pred_masks"] = low_res_masks
- current_out["pred_masks_high_res"] = high_res_masks
- current_out["obj_ptr"] = obj_ptr
+ current_out = {
+ "pred_masks": low_res_masks,
+ "pred_masks_high_res": high_res_masks,
+ "obj_ptr": obj_ptr,
+ }
  if not self.training:
  # Only add this in inference (to avoid unused param in activation checkpointing;
  # it's mainly used in the demo to encode spatial memories w/ consolidated masks)
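
Aside from the typing migration, the last two hunks refactor SAM2Model's tracking step so that current_out is built as a single dict literal after _track_step returns, rather than being pre-declared empty and filled field by field. A generic sketch of the pattern, with placeholder tensors standing in for the real SAM2 outputs:

    import torch

    low_res_masks = torch.zeros(1, 1, 256, 256)      # placeholder for the low-resolution masks
    high_res_masks = torch.zeros(1, 1, 1024, 1024)   # placeholder for the upsampled masks
    obj_ptr = torch.zeros(1, 256)                    # placeholder for the object pointer

    # Old style: mutate an empty dict across several statements.
    current_out = {}
    current_out["pred_masks"] = low_res_masks
    current_out["pred_masks_high_res"] = high_res_masks
    current_out["obj_ptr"] = obj_ptr

    # New style, as in the hunk above: build the mapping in one literal once the values exist.
    current_out = {
        "pred_masks": low_res_masks,
        "pred_masks_high_res": high_res_masks,
        "obj_ptr": obj_ptr,
    }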