dgenerate-ultralytics-headless 8.3.189__py3-none-any.whl → 8.3.191__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/METADATA +1 -1
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/RECORD +111 -109
- tests/test_cuda.py +6 -5
- tests/test_exports.py +1 -6
- tests/test_python.py +1 -4
- tests/test_solutions.py +1 -1
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -14
- ultralytics/cfg/datasets/VisDrone.yaml +4 -4
- ultralytics/data/annotator.py +6 -6
- ultralytics/data/augment.py +53 -51
- ultralytics/data/base.py +15 -13
- ultralytics/data/build.py +7 -4
- ultralytics/data/converter.py +9 -10
- ultralytics/data/dataset.py +24 -22
- ultralytics/data/loaders.py +13 -11
- ultralytics/data/split.py +4 -3
- ultralytics/data/split_dota.py +14 -12
- ultralytics/data/utils.py +31 -25
- ultralytics/engine/exporter.py +7 -4
- ultralytics/engine/model.py +16 -14
- ultralytics/engine/predictor.py +9 -7
- ultralytics/engine/results.py +59 -57
- ultralytics/engine/trainer.py +7 -0
- ultralytics/engine/tuner.py +4 -3
- ultralytics/engine/validator.py +3 -1
- ultralytics/hub/__init__.py +6 -2
- ultralytics/hub/auth.py +2 -2
- ultralytics/hub/google/__init__.py +9 -8
- ultralytics/hub/session.py +11 -11
- ultralytics/hub/utils.py +8 -9
- ultralytics/models/fastsam/model.py +8 -6
- ultralytics/models/nas/model.py +5 -3
- ultralytics/models/rtdetr/train.py +4 -3
- ultralytics/models/rtdetr/val.py +6 -4
- ultralytics/models/sam/amg.py +13 -10
- ultralytics/models/sam/model.py +3 -2
- ultralytics/models/sam/modules/blocks.py +21 -21
- ultralytics/models/sam/modules/decoders.py +11 -11
- ultralytics/models/sam/modules/encoders.py +25 -25
- ultralytics/models/sam/modules/memory_attention.py +9 -8
- ultralytics/models/sam/modules/sam.py +8 -10
- ultralytics/models/sam/modules/tiny_encoder.py +21 -20
- ultralytics/models/sam/modules/transformer.py +6 -5
- ultralytics/models/sam/modules/utils.py +7 -5
- ultralytics/models/sam/predict.py +32 -31
- ultralytics/models/utils/loss.py +29 -27
- ultralytics/models/utils/ops.py +10 -8
- ultralytics/models/yolo/classify/train.py +7 -5
- ultralytics/models/yolo/classify/val.py +10 -8
- ultralytics/models/yolo/detect/predict.py +3 -3
- ultralytics/models/yolo/detect/train.py +8 -6
- ultralytics/models/yolo/detect/val.py +23 -21
- ultralytics/models/yolo/model.py +14 -14
- ultralytics/models/yolo/obb/train.py +5 -3
- ultralytics/models/yolo/obb/val.py +13 -10
- ultralytics/models/yolo/pose/train.py +7 -5
- ultralytics/models/yolo/pose/val.py +11 -9
- ultralytics/models/yolo/segment/train.py +4 -5
- ultralytics/models/yolo/segment/val.py +12 -10
- ultralytics/models/yolo/world/train.py +9 -7
- ultralytics/models/yolo/yoloe/train.py +7 -6
- ultralytics/models/yolo/yoloe/val.py +10 -8
- ultralytics/nn/autobackend.py +40 -52
- ultralytics/nn/modules/__init__.py +3 -3
- ultralytics/nn/modules/block.py +12 -12
- ultralytics/nn/modules/conv.py +4 -3
- ultralytics/nn/modules/head.py +46 -38
- ultralytics/nn/modules/transformer.py +22 -21
- ultralytics/nn/tasks.py +2 -2
- ultralytics/nn/text_model.py +6 -5
- ultralytics/solutions/analytics.py +7 -5
- ultralytics/solutions/config.py +12 -10
- ultralytics/solutions/distance_calculation.py +3 -3
- ultralytics/solutions/heatmap.py +4 -2
- ultralytics/solutions/object_counter.py +5 -3
- ultralytics/solutions/parking_management.py +4 -2
- ultralytics/solutions/region_counter.py +7 -5
- ultralytics/solutions/similarity_search.py +5 -3
- ultralytics/solutions/solutions.py +38 -36
- ultralytics/solutions/streamlit_inference.py +8 -7
- ultralytics/trackers/bot_sort.py +11 -9
- ultralytics/trackers/byte_tracker.py +17 -15
- ultralytics/trackers/utils/gmc.py +4 -3
- ultralytics/utils/__init__.py +27 -77
- ultralytics/utils/autobatch.py +3 -2
- ultralytics/utils/autodevice.py +10 -10
- ultralytics/utils/benchmarks.py +11 -10
- ultralytics/utils/callbacks/comet.py +9 -9
- ultralytics/utils/callbacks/platform.py +2 -1
- ultralytics/utils/checks.py +20 -29
- ultralytics/utils/downloads.py +2 -2
- ultralytics/utils/export.py +12 -11
- ultralytics/utils/files.py +8 -7
- ultralytics/utils/git.py +139 -0
- ultralytics/utils/instance.py +8 -7
- ultralytics/utils/logger.py +7 -6
- ultralytics/utils/loss.py +15 -13
- ultralytics/utils/metrics.py +62 -62
- ultralytics/utils/nms.py +346 -0
- ultralytics/utils/ops.py +83 -251
- ultralytics/utils/patches.py +6 -4
- ultralytics/utils/plotting.py +18 -16
- ultralytics/utils/tal.py +1 -1
- ultralytics/utils/torch_utils.py +4 -2
- ultralytics/utils/tqdm.py +47 -33
- ultralytics/utils/triton.py +3 -2
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.189.dist-info → dgenerate_ultralytics_headless-8.3.191.dist-info}/top_level.txt +0 -0
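
Most of the changes shown in the hunks below (all from `ultralytics/models/sam/`) follow one pattern: each module adds `from __future__ import annotations`, drops its `typing` imports (`List`, `Optional`, `Tuple`, `Type`, ...), and rewrites hints in PEP 585/604 style (`list[torch.Tensor]`, `torch.Tensor | None`, `type[nn.Module]`). A minimal sketch of that style, using a hypothetical helper that is not part of the package:

```python
# Illustrative only: a hypothetical function showing the annotation style this release adopts.
from __future__ import annotations  # hints are no longer evaluated at runtime, so the new syntax also works on older Pythons

from torch import nn


def build_head(
    channels: list[int],                    # previously List[int]
    activation: type[nn.Module] = nn.GELU,  # previously Type[nn.Module]
    dropout: float | None = None,           # previously Optional[float]
) -> tuple[nn.Module, int]:                 # previously Tuple[nn.Module, int]
    """Build a small MLP head and return it together with its output width."""
    layers: list[nn.Module] = []
    for c_in, c_out in zip(channels[:-1], channels[1:]):
        layers += [nn.Linear(c_in, c_out), activation()]
        if dropout is not None:
            layers.append(nn.Dropout(dropout))
    return nn.Sequential(*layers), channels[-1]
```

Because the future import keeps annotations unevaluated, this kind of change should be behavior-neutral for callers; only the source text of the hints differs.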

ultralytics/models/sam/modules/decoders.py

@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from
+from __future__ import annotations
 
 import torch
 from torch import nn
@@ -43,7 +43,7 @@ class MaskDecoder(nn.Module):
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
-        activation:
+        activation: type[nn.Module] = nn.GELU,
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
     ) -> None:
@@ -93,7 +93,7 @@ class MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Predict masks given image and prompt embeddings.
 
@@ -137,7 +137,7 @@ class MaskDecoder(nn.Module):
         image_pe: torch.Tensor,
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """Predict masks and quality scores using image and prompt embeddings via transformer architecture."""
         # Concatenate output tokens
         output_tokens = torch.cat([self.iou_token.weight, self.mask_tokens.weight], dim=0)
@@ -158,7 +158,7 @@ class MaskDecoder(nn.Module):
         # Upscale mask embeddings and predict masks using the mask tokens
         src = src.transpose(1, 2).view(b, c, h, w)
         upscaled_embedding = self.output_upscaling(src)
-        hyper_in_list:
+        hyper_in_list: list[torch.Tensor] = [
             self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
         ]
         hyper_in = torch.stack(hyper_in_list, dim=1)
@@ -221,7 +221,7 @@ class SAM2MaskDecoder(nn.Module):
         transformer_dim: int,
         transformer: nn.Module,
         num_multimask_outputs: int = 3,
-        activation:
+        activation: type[nn.Module] = nn.GELU,
         iou_head_depth: int = 3,
         iou_head_hidden_dim: int = 256,
         use_high_res_features: bool = False,
@@ -317,8 +317,8 @@ class SAM2MaskDecoder(nn.Module):
         dense_prompt_embeddings: torch.Tensor,
         multimask_output: bool,
         repeat_image: bool,
-        high_res_features:
-    ) ->
+        high_res_features: list[torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """
         Predict masks given image and prompt embeddings.
 
@@ -385,8 +385,8 @@ class SAM2MaskDecoder(nn.Module):
         sparse_prompt_embeddings: torch.Tensor,
         dense_prompt_embeddings: torch.Tensor,
         repeat_image: bool,
-        high_res_features:
-    ) ->
+        high_res_features: list[torch.Tensor] | None = None,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
         """Predict instance segmentation masks from image and prompt embeddings using a transformer."""
         # Concatenate output tokens
         s = 0
@@ -431,7 +431,7 @@ class SAM2MaskDecoder(nn.Module):
         upscaled_embedding = act1(ln1(dc1(src) + feat_s1))
         upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0)
 
-        hyper_in_list:
+        hyper_in_list: list[torch.Tensor] = [
             self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) for i in range(self.num_mask_tokens)
         ]
         hyper_in = torch.stack(hyper_in_list, dim=1)

ultralytics/models/sam/modules/encoders.py

@@ -1,6 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from
+from __future__ import annotations
 
 import torch
 import torch.nn as nn
@@ -56,13 +56,13 @@ class ImageEncoderViT(nn.Module):
         mlp_ratio: float = 4.0,
         out_chans: int = 256,
         qkv_bias: bool = True,
-        norm_layer:
-        act_layer:
+        norm_layer: type[nn.Module] = nn.LayerNorm,
+        act_layer: type[nn.Module] = nn.GELU,
         use_abs_pos: bool = True,
         use_rel_pos: bool = False,
         rel_pos_zero_init: bool = True,
         window_size: int = 0,
-        global_attn_indexes:
+        global_attn_indexes: tuple[int, ...] = (),
     ) -> None:
         """
         Initialize an ImageEncoderViT instance for encoding images using Vision Transformer architecture.
@@ -101,7 +101,7 @@ class ImageEncoderViT(nn.Module):
             embed_dim=embed_dim,
         )
 
-        self.pos_embed:
+        self.pos_embed: nn.Parameter | None = None
         if use_abs_pos:
             # Initialize absolute positional embedding with pretrain image size
             self.pos_embed = nn.Parameter(torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim))
@@ -188,10 +188,10 @@ class PromptEncoder(nn.Module):
     def __init__(
         self,
         embed_dim: int,
-        image_embedding_size:
-        input_image_size:
+        image_embedding_size: tuple[int, int],
+        input_image_size: tuple[int, int],
         mask_in_chans: int,
-        activation:
+        activation: type[nn.Module] = nn.GELU,
     ) -> None:
         """
         Initialize the PromptEncoder module for encoding various types of prompts.
@@ -286,9 +286,9 @@ class PromptEncoder(nn.Module):
 
     @staticmethod
     def _get_batch_size(
-        points:
-        boxes:
-        masks:
+        points: tuple[torch.Tensor, torch.Tensor] | None,
+        boxes: torch.Tensor | None,
+        masks: torch.Tensor | None,
     ) -> int:
         """Get the batch size of the output given the batch size of the input prompts."""
         if points is not None:
@@ -302,10 +302,10 @@ class PromptEncoder(nn.Module):
 
     def forward(
         self,
-        points:
-        boxes:
-        masks:
-    ) ->
+        points: tuple[torch.Tensor, torch.Tensor] | None,
+        boxes: torch.Tensor | None,
+        masks: torch.Tensor | None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Embed different types of prompts, returning both sparse and dense embeddings.
 
@@ -542,13 +542,13 @@ class FpnNeck(nn.Module):
     def __init__(
         self,
         d_model: int,
-        backbone_channel_list:
+        backbone_channel_list: list[int],
         kernel_size: int = 1,
         stride: int = 1,
         padding: int = 0,
         fpn_interp_model: str = "bilinear",
         fuse_type: str = "sum",
-        fpn_top_down_levels:
+        fpn_top_down_levels: list[int] | None = None,
     ):
         """
         Initialize a modified Feature Pyramid Network (FPN) neck.
@@ -602,7 +602,7 @@ class FpnNeck(nn.Module):
             fpn_top_down_levels = range(len(self.convs))
         self.fpn_top_down_levels = list(fpn_top_down_levels)
 
-    def forward(self, xs:
+    def forward(self, xs: list[torch.Tensor]):
         """
         Perform forward pass through the Feature Pyramid Network (FPN) neck.
 
@@ -695,20 +695,20 @@ class Hiera(nn.Module):
         num_heads: int = 1,  # initial number of heads
         drop_path_rate: float = 0.0,  # stochastic depth
         q_pool: int = 3,  # number of q_pool stages
-        q_stride:
-        stages:
+        q_stride: tuple[int, int] = (2, 2),  # downsample stride bet. stages
+        stages: tuple[int, ...] = (2, 3, 16, 3),  # blocks per stage
         dim_mul: float = 2.0,  # dim_mul factor at stage shift
         head_mul: float = 2.0,  # head_mul factor at stage shift
-        window_pos_embed_bkg_spatial_size:
+        window_pos_embed_bkg_spatial_size: tuple[int, int] = (14, 14),
         # window size per stage, when not using global att.
-        window_spec:
+        window_spec: tuple[int, ...] = (
             8,
             4,
             14,
             7,
         ),
         # global attn in these blocks
-        global_att_blocks:
+        global_att_blocks: tuple[int, ...] = (
            12,
            16,
            20,
@@ -806,7 +806,7 @@ class Hiera(nn.Module):
             else [self.blocks[-1].dim_out]
         )
 
-    def _get_pos_embed(self, hw:
+    def _get_pos_embed(self, hw: tuple[int, int]) -> torch.Tensor:
         """Generate positional embeddings by interpolating and combining window and background embeddings."""
         h, w = hw
         window_embed = self.pos_embed_window
@@ -815,7 +815,7 @@ class Hiera(nn.Module):
         pos_embed = pos_embed.permute(0, 2, 3, 1)
         return pos_embed
 
-    def forward(self, x: torch.Tensor) ->
+    def forward(self, x: torch.Tensor) -> list[torch.Tensor]:
         """
         Perform forward pass through Hiera model, extracting multiscale features from input images.
 

ultralytics/models/sam/modules/memory_attention.py

@@ -1,7 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 import copy
-from typing import Optional
 
 import torch
 from torch import nn
@@ -103,7 +104,7 @@ class MemoryAttentionLayer(nn.Module):
         self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries
         self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys
 
-    def _forward_sa(self, tgt: torch.Tensor, query_pos:
+    def _forward_sa(self, tgt: torch.Tensor, query_pos: torch.Tensor | None) -> torch.Tensor:
         """Perform self-attention on input tensor using positional encoding and RoPE attention mechanism."""
         tgt2 = self.norm1(tgt)
         q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2
@@ -115,8 +116,8 @@ class MemoryAttentionLayer(nn.Module):
         self,
         tgt: torch.Tensor,
         memory: torch.Tensor,
-        query_pos:
-        pos:
+        query_pos: torch.Tensor | None,
+        pos: torch.Tensor | None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """Perform cross-attention between target and memory tensors using RoPEAttention mechanism."""
@@ -140,8 +141,8 @@ class MemoryAttentionLayer(nn.Module):
         self,
         tgt: torch.Tensor,
         memory: torch.Tensor,
-        pos:
-        query_pos:
+        pos: torch.Tensor | None = None,
+        query_pos: torch.Tensor | None = None,
         num_k_exclude_rope: int = 0,
     ) -> torch.Tensor:
         """
@@ -242,8 +243,8 @@ class MemoryAttention(nn.Module):
         self,
         curr: torch.Tensor,  # self-attention inputs
         memory: torch.Tensor,  # cross-attention inputs
-        curr_pos:
-        memory_pos:
+        curr_pos: torch.Tensor | None = None,  # pos_enc for self-attention inputs
+        memory_pos: torch.Tensor | None = None,  # pos_enc for cross-attention inputs
         num_obj_ptr_tokens: int = 0,  # number of object pointer *tokens*
     ) -> torch.Tensor:
         """

ultralytics/models/sam/modules/sam.py

@@ -3,10 +3,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 
-
-# LICENSE file in the root directory of this source tree.
-
-from typing import List
+from __future__ import annotations
 
 import torch
 import torch.nn.functional as F
@@ -61,8 +58,8 @@ class SAMModel(nn.Module):
         image_encoder: ImageEncoderViT,
         prompt_encoder: PromptEncoder,
         mask_decoder: MaskDecoder,
-        pixel_mean:
-        pixel_std:
+        pixel_mean: list[float] = (123.675, 116.28, 103.53),
+        pixel_std: list[float] = (58.395, 57.12, 57.375),
     ) -> None:
         """
         Initialize the SAMModel class to predict object masks from an image and input prompts.
@@ -959,7 +956,6 @@ class SAM2Model(torch.nn.Module):
         prev_sam_mask_logits=None,
     ):
         """Perform a single tracking step, updating object masks and memory features based on current frame inputs."""
-        current_out = {}
         sam_outputs, _, _ = self._track_step(
             frame_idx,
             is_init_cond_frame,
@@ -975,9 +971,11 @@ class SAM2Model(torch.nn.Module):
         )
         _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = sam_outputs
 
-        current_out
-
-
+        current_out = {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+        }
         if not self.training:
             # Only add this in inference (to avoid unused param in activation checkpointing;
             # it's mainly used in the demo to encode spatial memories w/ consolidated masks)

ultralytics/models/sam/modules/tiny_encoder.py

@@ -9,8 +9,9 @@
 # Build the TinyViT Model
 # --------------------------------------------------------
 
+from __future__ import annotations
+
 import itertools
-from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
@@ -106,7 +107,7 @@ class PatchEmbed(nn.Module):
             activation (nn.Module): Activation function to use between convolutions.
         """
         super().__init__()
-        img_size:
+        img_size: tuple[int, int] = to_2tuple(resolution)
         self.patches_resolution = (img_size[0] // 4, img_size[1] // 4)
         self.num_patches = self.patches_resolution[0] * self.patches_resolution[1]
         self.in_chans = in_chans
@@ -219,7 +220,7 @@ class PatchMerging(nn.Module):
         torch.Size([4, 3136, 128])
     """
 
-    def __init__(self, input_resolution:
+    def __init__(self, input_resolution: tuple[int, int], dim: int, out_dim: int, activation):
         """
         Initialize the PatchMerging module for merging and projecting neighboring patches in feature maps.
 
@@ -283,13 +284,13 @@ class ConvLayer(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution:
+        input_resolution: tuple[int, int],
         depth: int,
         activation,
-        drop_path:
-        downsample:
+        drop_path: float | list[float] = 0.0,
+        downsample: nn.Module | None = None,
         use_checkpoint: bool = False,
-        out_dim:
+        out_dim: int | None = None,
         conv_expand_ratio: float = 4.0,
     ):
         """
@@ -370,8 +371,8 @@ class MLP(nn.Module):
     def __init__(
         self,
         in_features: int,
-        hidden_features:
-        out_features:
+        hidden_features: int | None = None,
+        out_features: int | None = None,
         activation=nn.GELU,
         drop: float = 0.0,
     ):
@@ -441,7 +442,7 @@ class Attention(torch.nn.Module):
         key_dim: int,
         num_heads: int = 8,
         attn_ratio: float = 4,
-        resolution:
+        resolution: tuple[int, int] = (14, 14),
     ):
         """
         Initialize the Attention module for multi-head attention with spatial awareness.
@@ -549,7 +550,7 @@ class TinyViTBlock(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution:
+        input_resolution: tuple[int, int],
         num_heads: int,
         window_size: int = 7,
         mlp_ratio: float = 4.0,
@@ -690,18 +691,18 @@ class BasicLayer(nn.Module):
     def __init__(
         self,
         dim: int,
-        input_resolution:
+        input_resolution: tuple[int, int],
         depth: int,
         num_heads: int,
         window_size: int,
         mlp_ratio: float = 4.0,
         drop: float = 0.0,
-        drop_path:
-        downsample:
+        drop_path: float | list[float] = 0.0,
+        downsample: nn.Module | None = None,
         use_checkpoint: bool = False,
         local_conv_size: int = 3,
         activation=nn.GELU,
-        out_dim:
+        out_dim: int | None = None,
     ):
         """
         Initialize a BasicLayer in the TinyViT architecture.
@@ -800,10 +801,10 @@ class TinyViT(nn.Module):
         img_size: int = 224,
         in_chans: int = 3,
         num_classes: int = 1000,
-        embed_dims:
-        depths:
-        num_heads:
-        window_sizes:
+        embed_dims: tuple[int, int, int, int] = (96, 192, 384, 768),
+        depths: tuple[int, int, int, int] = (2, 2, 6, 2),
+        num_heads: tuple[int, int, int, int] = (3, 6, 12, 24),
+        window_sizes: tuple[int, int, int, int] = (7, 7, 14, 7),
         mlp_ratio: float = 4.0,
         drop_rate: float = 0.0,
         drop_path_rate: float = 0.1,
@@ -980,7 +981,7 @@ class TinyViT(nn.Module):
         """Perform the forward pass through the TinyViT model, extracting features from the input image."""
         return self.forward_features(x)
 
-    def set_imgsz(self, imgsz:
+    def set_imgsz(self, imgsz: list[int] = [1024, 1024]):
         """Set image size to make model compatible with different image sizes."""
         imgsz = [s // 4 for s in imgsz]
         self.patches_resolution = imgsz

ultralytics/models/sam/modules/transformer.py

@@ -1,7 +1,8 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
+from __future__ import annotations
+
 import math
-from typing import Tuple, Type
 
 import torch
 from torch import Tensor, nn
@@ -44,7 +45,7 @@ class TwoWayTransformer(nn.Module):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int,
-        activation:
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
     ) -> None:
         """
@@ -85,7 +86,7 @@ class TwoWayTransformer(nn.Module):
         image_embedding: torch.Tensor,
         image_pe: torch.Tensor,
         point_embedding: torch.Tensor,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Process image and point embeddings through the Two-Way Transformer.
 
@@ -162,7 +163,7 @@ class TwoWayAttentionBlock(nn.Module):
         embedding_dim: int,
         num_heads: int,
         mlp_dim: int = 2048,
-        activation:
+        activation: type[nn.Module] = nn.ReLU,
         attention_downsample_rate: int = 2,
         skip_first_layer_pe: bool = False,
     ) -> None:
@@ -198,7 +199,7 @@ class TwoWayAttentionBlock(nn.Module):
 
     def forward(
         self, queries: torch.Tensor, keys: torch.Tensor, query_pe: torch.Tensor, key_pe: torch.Tensor
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Apply two-way attention to process query and key embeddings in a transformer block.
 

ultralytics/models/sam/modules/utils.py

@@ -1,12 +1,14 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
 
-from
+from __future__ import annotations
+
+from typing import Any
 
 import torch
 import torch.nn.functional as F
 
 
-def select_closest_cond_frames(frame_idx: int, cond_frame_outputs:
+def select_closest_cond_frames(frame_idx: int, cond_frame_outputs: dict[int, Any], max_cond_frame_num: int):
     """
     Select the closest conditioning frames to a given frame index.
 
@@ -248,7 +250,7 @@ def window_partition(x: torch.Tensor, window_size: int):
     return windows, (Hp, Wp)
 
 
-def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw:
+def window_unpartition(windows: torch.Tensor, window_size: int, pad_hw: tuple[int, int], hw: tuple[int, int]):
     """
     Unpartition windowed sequences into original sequences and remove padding.
 
@@ -333,8 +335,8 @@ def add_decomposed_rel_pos(
     q: torch.Tensor,
     rel_pos_h: torch.Tensor,
     rel_pos_w: torch.Tensor,
-    q_size:
-    k_size:
+    q_size: tuple[int, int],
+    k_size: tuple[int, int],
 ) -> torch.Tensor:
     """
     Add decomposed Relative Positional Embeddings to the attention map.

ultralytics/models/sam/predict.py

@@ -8,8 +8,10 @@ using SAM. It forms an integral part of the Ultralytics framework and is designe
 segmentation tasks.
 """
 
+from __future__ import annotations
+
 from collections import OrderedDict
-from typing import Any
+from typing import Any
 
 import cv2
 import numpy as np
@@ -1717,9 +1719,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     def __init__(
         self,
         cfg: Any = DEFAULT_CFG,
-        overrides:
+        overrides: dict[str, Any] | None = None,
         max_obj_num: int = 3,
-        _callbacks:
+        _callbacks: dict[str, Any] | None = None,
     ) -> None:
         """
         Initialize the predictor with configuration and optional overrides.
@@ -1759,14 +1761,14 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     @smart_inference_mode()
     def inference(
         self,
-        img:
-        bboxes:
-        masks:
-        points:
-        labels:
-        obj_ids:
+        img: torch.Tensor | np.ndarray,
+        bboxes: list[list[float]] | None = None,
+        masks: torch.Tensor | np.ndarray | None = None,
+        points: list[list[float]] | None = None,
+        labels: list[int] | None = None,
+        obj_ids: list[int] | None = None,
         update_memory: bool = False,
-    ) ->
+    ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Perform inference on a single image with optional bounding boxes, masks, points and object IDs.
         It has two modes: one is to run inference on a single image without updating the memory,
@@ -1824,7 +1826,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         pred_scores = torch.clamp_(pred_scores / 32, min=0)
         return pred_masks.flatten(0, 1), pred_scores.flatten(0, 1)
 
-    def get_im_features(self, img:
+    def get_im_features(self, img: torch.Tensor | np.ndarray) -> None:
         """
         Initialize the image state by processing the input image and extracting features.
 
@@ -1844,10 +1846,10 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
     @smart_inference_mode()
     def update_memory(
         self,
-        obj_ids:
-        points:
-        labels:
-        masks:
+        obj_ids: list[int] = None,
+        points: torch.Tensor | None = None,
+        labels: torch.Tensor | None = None,
+        masks: torch.Tensor | None = None,
     ) -> None:
         """
         Append the imgState to the memory_bank and update the memory for the model.
@@ -1923,7 +1925,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         consolidated_out["maskmem_pos_enc"] = maskmem_pos_enc
         self.memory_bank.append(consolidated_out)
 
-    def _prepare_memory_conditioned_features(self, obj_idx:
+    def _prepare_memory_conditioned_features(self, obj_idx: int | None) -> torch.Tensor:
         """
         Prepare the memory-conditioned features for the current image state. If obj_idx is provided, it supposes to
         prepare features for a specific prompted object in the image. If obj_idx is None, it prepares features for all
@@ -1958,7 +1960,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
             *self.feat_sizes[-1],
         )
 
-    def get_maskmem_enc(self) ->
+    def get_maskmem_enc(self) -> tuple[torch.Tensor, torch.Tensor]:
         """Get the memory and positional encoding from the memory, which is used to condition the current image
         features.
         """
@@ -1973,7 +1975,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
         memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0)
         return memory, memory_pos_embed
 
-    def _obj_id_to_idx(self, obj_id: int) ->
+    def _obj_id_to_idx(self, obj_id: int) -> int | None:
         """
         Map client-side object id to model-side object index.
 
@@ -1987,11 +1989,11 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
 
     def track_step(
         self,
-        obj_idx:
-        point:
-        label:
-        mask:
-    ) ->
+        obj_idx: int | None = None,
+        point: torch.Tensor | None = None,
+        label: torch.Tensor | None = None,
+        mask: torch.Tensor | None = None,
+    ) -> dict[str, Any]:
         """
         Tracking step for the current image state to predict masks.
 
@@ -2010,7 +2012,6 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
            current_out (Dict[str, Any]): A dictionary containing the current output with mask predictions and object pointers.
                Keys include 'point_inputs', 'mask_inputs', 'pred_masks', 'pred_masks_high_res', 'obj_ptr', 'object_score_logits'.
        """
-        current_out = {}
        if mask is not None and self.model.use_mask_input_as_output_without_sam:
            # When use_mask_input_as_output_without_sam=True, we directly output the mask input
            # (see it as a GT mask) without using a SAM prompt encoder + mask decoder.
@@ -2021,7 +2022,7 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
            # fused the visual feature with previous memory features in the memory bank
            pix_feat_with_mem = self._prepare_memory_conditioned_features(obj_idx)
            # calculate the first feature if adding obj_idx exists(means adding prompts)
-           pix_feat_with_mem = pix_feat_with_mem[
+           pix_feat_with_mem = pix_feat_with_mem[:1] if obj_idx is not None else pix_feat_with_mem
            _, _, _, low_res_masks, high_res_masks, obj_ptr, object_score_logits = self.model._forward_sam_heads(
                backbone_features=pix_feat_with_mem,
                point_inputs={"point_coords": point, "point_labels": label} if obj_idx is not None else None,
@@ -2029,9 +2030,9 @@ class SAM2DynamicInteractivePredictor(SAM2Predictor):
                multimask_output=False,
                high_res_features=[feat[: pix_feat_with_mem.size(0)] for feat in self.high_res_features],
            )
-
-
-
-
-
-
+        return {
+            "pred_masks": low_res_masks,
+            "pred_masks_high_res": high_res_masks,
+            "obj_ptr": obj_ptr,
+            "object_score_logits": object_score_logits,
+        }