ultralytics 8.0.196__py3-none-any.whl → 8.0.198__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this version of ultralytics has been flagged as a potentially problematic release.

Files changed (49)
  1. ultralytics/__init__.py +1 -1
  2. ultralytics/cfg/__init__.py +4 -5
  3. ultralytics/data/augment.py +2 -2
  4. ultralytics/data/converter.py +12 -13
  5. ultralytics/data/dataset.py +1 -1
  6. ultralytics/engine/__init__.py +1 -0
  7. ultralytics/engine/exporter.py +1 -1
  8. ultralytics/engine/trainer.py +2 -1
  9. ultralytics/hub/session.py +1 -1
  10. ultralytics/models/fastsam/predict.py +33 -2
  11. ultralytics/models/fastsam/prompt.py +38 -1
  12. ultralytics/models/fastsam/utils.py +5 -5
  13. ultralytics/models/fastsam/val.py +27 -1
  14. ultralytics/models/nas/model.py +20 -0
  15. ultralytics/models/nas/predict.py +23 -0
  16. ultralytics/models/nas/val.py +24 -0
  17. ultralytics/models/rtdetr/val.py +17 -5
  18. ultralytics/models/sam/modules/decoders.py +26 -1
  19. ultralytics/models/sam/modules/encoders.py +31 -3
  20. ultralytics/models/sam/modules/sam.py +22 -7
  21. ultralytics/models/sam/modules/tiny_encoder.py +147 -45
  22. ultralytics/models/sam/modules/transformer.py +47 -2
  23. ultralytics/models/sam/predict.py +19 -2
  24. ultralytics/models/utils/loss.py +20 -2
  25. ultralytics/models/utils/ops.py +5 -5
  26. ultralytics/nn/modules/block.py +33 -10
  27. ultralytics/nn/modules/conv.py +16 -4
  28. ultralytics/nn/modules/head.py +48 -17
  29. ultralytics/nn/modules/transformer.py +2 -2
  30. ultralytics/nn/tasks.py +7 -7
  31. ultralytics/utils/__init__.py +2 -1
  32. ultralytics/utils/benchmarks.py +13 -0
  33. ultralytics/utils/callbacks/mlflow.py +76 -36
  34. ultralytics/utils/callbacks/wb.py +92 -1
  35. ultralytics/utils/checks.py +4 -4
  36. ultralytics/utils/errors.py +12 -0
  37. ultralytics/utils/files.py +1 -1
  38. ultralytics/utils/instance.py +41 -3
  39. ultralytics/utils/loss.py +22 -19
  40. ultralytics/utils/metrics.py +106 -24
  41. ultralytics/utils/tal.py +1 -1
  42. ultralytics/utils/torch_utils.py +4 -2
  43. ultralytics/utils/tuner.py +10 -4
  44. {ultralytics-8.0.196.dist-info → ultralytics-8.0.198.dist-info}/METADATA +1 -1
  45. {ultralytics-8.0.196.dist-info → ultralytics-8.0.198.dist-info}/RECORD +49 -49
  46. {ultralytics-8.0.196.dist-info → ultralytics-8.0.198.dist-info}/LICENSE +0 -0
  47. {ultralytics-8.0.196.dist-info → ultralytics-8.0.198.dist-info}/WHEEL +0 -0
  48. {ultralytics-8.0.196.dist-info → ultralytics-8.0.198.dist-info}/entry_points.txt +0 -0
  49. {ultralytics-8.0.196.dist-info → ultralytics-8.0.198.dist-info}/top_level.txt +0 -0
ultralytics/models/sam/modules/sam.py
@@ -16,6 +16,20 @@ from .encoders import ImageEncoderViT, PromptEncoder
 
 
 class Sam(nn.Module):
+    """
+    Sam (Segment Anything Model) is designed for object segmentation tasks. It uses image encoders to generate image
+    embeddings, and prompt encoders to encode various types of input prompts. These embeddings are then used by the mask
+    decoder to predict object masks.
+
+    Attributes:
+        mask_threshold (float): Threshold value for mask prediction.
+        image_format (str): Format of the input image, default is 'RGB'.
+        image_encoder (ImageEncoderViT): The backbone used to encode the image into embeddings.
+        prompt_encoder (PromptEncoder): Encodes various types of input prompts.
+        mask_decoder (MaskDecoder): Predicts object masks from the image and prompt embeddings.
+        pixel_mean (List[float]): Mean pixel values for image normalization.
+        pixel_std (List[float]): Standard deviation values for image normalization.
+    """
     mask_threshold: float = 0.0
     image_format: str = 'RGB'
 
@@ -28,18 +42,19 @@ class Sam(nn.Module):
         pixel_std: List[float] = (58.395, 57.12, 57.375)
     ) -> None:
         """
-        SAM predicts object masks from an image and input prompts.
+        Initialize the Sam class to predict object masks from an image and input prompts.
 
         Note:
             All forward() operations moved to SAMPredictor.
 
         Args:
-            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings that allow for
-                efficient mask prediction.
-            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
-            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
-            pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
-            pixel_std (list(float)): Std values for normalizing pixels in the input image.
+            image_encoder (ImageEncoderViT): The backbone used to encode the image into image embeddings.
+            prompt_encoder (PromptEncoder): Encodes various types of input prompts.
+            mask_decoder (MaskDecoder): Predicts masks from the image embeddings and encoded prompts.
+            pixel_mean (List[float], optional): Mean values for normalizing pixels in the input image. Defaults to
+                (123.675, 116.28, 103.53).
+            pixel_std (List[float], optional): Std values for normalizing pixels in the input image. Defaults to
+                (58.395, 57.12, 57.375).
         """
         super().__init__()
         self.image_encoder = image_encoder
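
The pixel_mean and pixel_std defaults documented in the new docstring are applied per channel before the image is encoded. A minimal sketch of that normalization, not the library's own preprocessing code; the preprocess() helper is hypothetical:

```python
# Per-channel normalization using the Sam docstring's documented defaults.
import torch

pixel_mean = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)
pixel_std = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)

def preprocess(image: torch.Tensor) -> torch.Tensor:
    """Normalize a (3, H, W) image tensor with the documented pixel_mean / pixel_std."""
    return (image.float() - pixel_mean) / pixel_std

x = torch.randint(0, 256, (3, 1024, 1024), dtype=torch.uint8)
print(preprocess(x).shape)  # torch.Size([3, 1024, 1024])
```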
ultralytics/models/sam/modules/tiny_encoder.py
@@ -21,6 +21,7 @@ from ultralytics.utils.instance import to_2tuple
 
 
 class Conv2d_BN(torch.nn.Sequential):
+    """A sequential container that performs 2D convolution followed by batch normalization."""
 
     def __init__(self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1):
         """Initializes the MBConv model with given input channels, output channels, expansion ratio, activation, and
@@ -35,6 +36,7 @@ class Conv2d_BN(torch.nn.Sequential):
 
 
 class PatchEmbed(nn.Module):
+    """Embeds images into patches and projects them into a specified embedding dimension."""
 
     def __init__(self, in_chans, embed_dim, resolution, activation):
         """Initialize the PatchMerging class with specified input, output dimensions, resolution and activation
@@ -59,6 +61,7 @@ class PatchEmbed(nn.Module):
 
 
 class MBConv(nn.Module):
+    """Mobile Inverted Bottleneck Conv (MBConv) layer, part of the EfficientNet architecture."""
 
     def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path):
         """Initializes a convolutional layer with specified dimensions, input resolution, depth, and activation
@@ -96,6 +99,7 @@ class MBConv(nn.Module):
 
 
 class PatchMerging(nn.Module):
+    """Merges neighboring patches in the feature map and projects to a new dimension."""
 
     def __init__(self, input_resolution, dim, out_dim, activation):
         """Initializes the ConvLayer with specific dimension, input resolution, depth, activation, drop path, and other
@@ -130,6 +134,11 @@ class PatchMerging(nn.Module):
 
 
 class ConvLayer(nn.Module):
+    """
+    Convolutional Layer featuring multiple MobileNetV3-style inverted bottleneck convolutions (MBConv).
+
+    Optionally applies downsample operations to the output, and provides support for gradient checkpointing.
+    """
 
     def __init__(
         self,
@@ -143,13 +152,27 @@ class ConvLayer(nn.Module):
         out_dim=None,
         conv_expand_ratio=4.,
     ):
+        """
+        Initializes the ConvLayer with the given dimensions and settings.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): The resolution of the input image.
+            depth (int): The number of MBConv layers in the block.
+            activation (Callable): Activation function applied after each convolution.
+            drop_path (Union[float, List[float]]): Drop path rate. Single float or a list of floats for each MBConv.
+            downsample (Optional[Callable]): Function for downsampling the output. None to skip downsampling.
+            use_checkpoint (bool): Whether to use gradient checkpointing to save memory.
+            out_dim (Optional[int]): The dimensionality of the output. None means it will be the same as `dim`.
+            conv_expand_ratio (float): Expansion ratio for the MBConv layers.
+        """
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
         self.depth = depth
         self.use_checkpoint = use_checkpoint
 
-        # build blocks
+        # Build blocks
         self.blocks = nn.ModuleList([
             MBConv(
                 dim,
@@ -159,7 +182,7 @@ class ConvLayer(nn.Module):
                 drop_path[i] if isinstance(drop_path, list) else drop_path,
             ) for i in range(depth)])
 
-        # patch merging layer
+        # Patch merging layer
         self.downsample = None if downsample is None else downsample(
             input_resolution, dim=dim, out_dim=out_dim, activation=activation)
 
@@ -171,6 +194,11 @@ class ConvLayer(nn.Module):
 
 
 class Mlp(nn.Module):
+    """
+    Multi-layer Perceptron (MLP) for transformer architectures.
+
+    This layer takes an input with in_features, applies layer normalization and two fully-connected layers.
+    """
 
     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
         """Initializes Attention module with the given parameters including dimension, key_dim, number of heads, etc."""
@@ -194,6 +222,14 @@ class Mlp(nn.Module):
 
 
 class Attention(torch.nn.Module):
+    """
+    Multi-head attention module with support for spatial awareness, applying attention biases based on spatial
+    resolution. Implements trainable attention biases for each unique offset between spatial positions in the resolution
+    grid.
+
+    Attributes:
+        ab (Tensor, optional): Cached attention biases for inference, deleted during training.
+    """
 
     def __init__(
         self,
@@ -203,8 +239,21 @@ class Attention(torch.nn.Module):
         attn_ratio=4,
         resolution=(14, 14),
     ):
+        """
+        Initializes the Attention module.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            key_dim (int): The dimensionality of the keys and queries.
+            num_heads (int, optional): Number of attention heads. Default is 8.
+            attn_ratio (float, optional): Attention ratio, affecting the dimensions of the value vectors. Default is 4.
+            resolution (Tuple[int, int], optional): Spatial resolution of the input feature map. Default is (14, 14).
+
+        Raises:
+            AssertionError: If `resolution` is not a tuple of length 2.
+        """
         super().__init__()
-        # (h, w)
+
         assert isinstance(resolution, tuple) and len(resolution) == 2
         self.num_heads = num_heads
         self.scale = key_dim ** -0.5
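
The new Attention docstring describes trainable biases indexed by the unique offset between positions of the resolution grid. The following standalone sketch shows how such an offset-index table can be built; it is an illustration of the idea (LeViT-style), not the module's exact code, and the variable names are only assumed to mirror it:

```python
# Build a per-offset attention-bias index table for a spatial resolution grid.
import itertools
import torch

resolution = (14, 14)
num_heads = 2

points = list(itertools.product(range(resolution[0]), range(resolution[1])))
offsets, idxs = {}, []
for p1 in points:
    for p2 in points:
        offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
        idxs.append(offsets.setdefault(offset, len(offsets)))  # one bias slot per unique offset

attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(offsets)))
attention_bias_idxs = torch.LongTensor(idxs).view(len(points), len(points))

# At attention time, the bias added to each head's (N, N) logits is:
bias = attention_biases[:, attention_bias_idxs]
print(bias.shape)  # torch.Size([2, 196, 196])
```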
@@ -241,8 +290,9 @@ class Attention(torch.nn.Module):
         else:
             self.ab = self.attention_biases[:, self.attention_bias_idxs]
 
-    def forward(self, x):  # x (B,N,C)
-        B, N, _ = x.shape
+    def forward(self, x):  # x
+        """Performs forward pass over the input tensor 'x' by applying normalization and querying keys/values."""
+        B, N, _ = x.shape  # B, N, C
 
         # Normalization
         x = self.norm(x)
@@ -264,20 +314,7 @@ class Attention(torch.nn.Module):
 
 
 class TinyViTBlock(nn.Module):
-    """
-    TinyViT Block.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int, int]): Input resolution.
-        num_heads (int): Number of attention heads.
-        window_size (int): Window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        drop (float, optional): Dropout rate. Default: 0.0
-        drop_path (float, optional): Stochastic depth rate. Default: 0.0
-        local_conv_size (int): the kernel size of the convolution between Attention and MLP. Default: 3
-        activation (torch.nn): the activation function. Default: nn.GELU
-    """
+    """TinyViT Block that applies self-attention and a local convolution to the input."""
 
     def __init__(
         self,
@@ -291,6 +328,24 @@ class TinyViTBlock(nn.Module):
         local_conv_size=3,
         activation=nn.GELU,
     ):
+        """
+        Initializes the TinyViTBlock.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+            num_heads (int): Number of attention heads.
+            window_size (int, optional): Window size for attention. Default is 7.
+            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
+            drop (float, optional): Dropout rate. Default is 0.
+            drop_path (float, optional): Stochastic depth rate. Default is 0.
+            local_conv_size (int, optional): The kernel size of the local convolution. Default is 3.
+            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
+
+        Raises:
+            AssertionError: If `window_size` is not greater than 0.
+            AssertionError: If `dim` is not divisible by `num_heads`.
+        """
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
@@ -338,11 +393,11 @@ class TinyViTBlock(nn.Module):
             pH, pW = H + pad_b, W + pad_r
             nH = pH // self.window_size
             nW = pW // self.window_size
-            # window partition
+            # Window partition
            x = x.view(B, nH, self.window_size, nW, self.window_size,
                       C).transpose(2, 3).reshape(B * nH * nW, self.window_size * self.window_size, C)
            x = self.attn(x)
-            # window reverse
+            # Window reverse
            x = x.view(B, nH, nW, self.window_size, self.window_size, C).transpose(2, 3).reshape(B, pH, pW, C)
 
            if padding:
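
The partition/reverse reshapes in the hunk above are easy to sanity-check in isolation. A small round-trip demo on a toy tensor, using the same reshape pattern with hypothetical sizes:

```python
# Round-trip of the window partition / reverse reshapes shown above.
import torch

B, H, W, C, ws = 1, 6, 6, 2, 3  # toy sizes; ws = window size
x = torch.arange(B * H * W * C, dtype=torch.float32).view(B, H, W, C)
nH, nW = H // ws, W // ws

windows = x.view(B, nH, ws, nW, ws, C).transpose(2, 3).reshape(B * nH * nW, ws * ws, C)  # partition
restored = windows.view(B, nH, nW, ws, ws, C).transpose(2, 3).reshape(B, H, W, C)        # reverse
assert torch.equal(x, restored)
```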
@@ -367,24 +422,7 @@ class TinyViTBlock(nn.Module):
 
 
 class BasicLayer(nn.Module):
-    """
-    A basic TinyViT layer for one stage.
-
-    Args:
-        dim (int): Number of input channels.
-        input_resolution (tuple[int]): Input resolution.
-        depth (int): Number of blocks.
-        num_heads (int): Number of attention heads.
-        window_size (int): Local window size.
-        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
-        drop (float, optional): Dropout rate. Default: 0.0
-        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
-        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
-        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
-        local_conv_size (int): the kernel size of the depthwise convolution between attention and MLP. Default: 3
-        activation (torch.nn): the activation function. Default: nn.GELU
-        out_dim (int | optional): the output dimension of the layer. Default: None
-    """
+    """A basic TinyViT layer for one stage in a TinyViT architecture."""
 
     def __init__(
         self,
@@ -402,13 +440,34 @@ class BasicLayer(nn.Module):
         activation=nn.GELU,
         out_dim=None,
     ):
+        """
+        Initializes the BasicLayer.
+
+        Args:
+            dim (int): The dimensionality of the input and output.
+            input_resolution (Tuple[int, int]): Spatial resolution of the input feature map.
+            depth (int): Number of TinyViT blocks.
+            num_heads (int): Number of attention heads.
+            window_size (int): Local window size.
+            mlp_ratio (float, optional): Ratio of mlp hidden dim to embedding dim. Default is 4.
+            drop (float, optional): Dropout rate. Default is 0.
+            drop_path (float | tuple[float], optional): Stochastic depth rate. Default is 0.
+            downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default is None.
+            use_checkpoint (bool, optional): Whether to use checkpointing to save memory. Default is False.
+            local_conv_size (int, optional): Kernel size of the local convolution. Default is 3.
+            activation (torch.nn, optional): Activation function for MLP. Default is nn.GELU.
+            out_dim (int | None, optional): The output dimension of the layer. Default is None.
+
+        Raises:
+            ValueError: If `drop_path` is a list of float but its length doesn't match `depth`.
+        """
         super().__init__()
         self.dim = dim
         self.input_resolution = input_resolution
         self.depth = depth
         self.use_checkpoint = use_checkpoint
 
-        # build blocks
+        # Build blocks
         self.blocks = nn.ModuleList([
             TinyViTBlock(
                 dim=dim,
@@ -422,7 +481,7 @@ class BasicLayer(nn.Module):
                 activation=activation,
             ) for i in range(depth)])
 
-        # patch merging layer
+        # Patch merging layer
         self.downsample = None if downsample is None else downsample(
             input_resolution, dim=dim, out_dim=out_dim, activation=activation)
 
@@ -456,6 +515,30 @@ class LayerNorm2d(nn.Module):
 
 
 class TinyViT(nn.Module):
+    """
+    The TinyViT architecture for vision tasks.
+
+    Attributes:
+        img_size (int): Input image size.
+        in_chans (int): Number of input channels.
+        num_classes (int): Number of classification classes.
+        embed_dims (List[int]): List of embedding dimensions for each layer.
+        depths (List[int]): List of depths for each layer.
+        num_heads (List[int]): List of number of attention heads for each layer.
+        window_sizes (List[int]): List of window sizes for each layer.
+        mlp_ratio (float): Ratio of MLP hidden dimension to embedding dimension.
+        drop_rate (float): Dropout rate for drop layers.
+        drop_path_rate (float): Drop path rate for stochastic depth.
+        use_checkpoint (bool): Use checkpointing for efficient memory usage.
+        mbconv_expand_ratio (float): Expansion ratio for MBConv layer.
+        local_conv_size (int): Local convolution kernel size.
+        layer_lr_decay (float): Layer-wise learning rate decay.
+
+    Note:
+        This implementation is generalized to accept a list of depths, attention heads,
+        embedding dimensions and window sizes, which allows you to create a
+        "stack" of TinyViT models of varying configurations.
+    """
 
     def __init__(
         self,
@@ -474,6 +557,25 @@ class TinyViT(nn.Module):
         local_conv_size=3,
         layer_lr_decay=1.0,
     ):
+        """
+        Initializes the TinyViT model.
+
+        Args:
+            img_size (int, optional): The input image size. Defaults to 224.
+            in_chans (int, optional): Number of input channels. Defaults to 3.
+            num_classes (int, optional): Number of classification classes. Defaults to 1000.
+            embed_dims (List[int], optional): List of embedding dimensions for each layer. Defaults to [96, 192, 384, 768].
+            depths (List[int], optional): List of depths for each layer. Defaults to [2, 2, 6, 2].
+            num_heads (List[int], optional): List of number of attention heads for each layer. Defaults to [3, 6, 12, 24].
+            window_sizes (List[int], optional): List of window sizes for each layer. Defaults to [7, 7, 14, 7].
+            mlp_ratio (float, optional): Ratio of MLP hidden dimension to embedding dimension. Defaults to 4.
+            drop_rate (float, optional): Dropout rate. Defaults to 0.
+            drop_path_rate (float, optional): Drop path rate for stochastic depth. Defaults to 0.1.
+            use_checkpoint (bool, optional): Whether to use checkpointing for efficient memory usage. Defaults to False.
+            mbconv_expand_ratio (float, optional): Expansion ratio for MBConv layer. Defaults to 4.0.
+            local_conv_size (int, optional): Local convolution kernel size. Defaults to 3.
+            layer_lr_decay (float, optional): Layer-wise learning rate decay. Defaults to 1.0.
+        """
         super().__init__()
         self.img_size = img_size
         self.num_classes = num_classes
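
Because the new __init__ docstring lists every constructor default, the backbone can be instantiated directly from this module. A hedged construction sketch using those documented defaults (import path taken from this package's layout; no forward pass is attempted here):

```python
# Construct a TinyViT backbone with the defaults documented above and count its parameters.
from ultralytics.models.sam.modules.tiny_encoder import TinyViT

model = TinyViT(
    img_size=224,
    in_chans=3,
    embed_dims=[96, 192, 384, 768],
    depths=[2, 2, 6, 2],
    num_heads=[3, 6, 12, 24],
    window_sizes=[7, 7, 14, 7],
)
print(sum(p.numel() for p in model.parameters()))
```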
@@ -491,10 +593,10 @@ class TinyViT(nn.Module):
         patches_resolution = self.patch_embed.patches_resolution
         self.patches_resolution = patches_resolution
 
-        # stochastic depth
+        # Stochastic depth
         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
 
-        # build layers
+        # Build layers
         self.layers = nn.ModuleList()
         for i_layer in range(self.num_layers):
             kwargs = dict(
@@ -526,7 +628,7 @@ class TinyViT(nn.Module):
         self.norm_head = nn.LayerNorm(embed_dims[-1])
         self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else torch.nn.Identity()
 
-        # init weights
+        # Init weights
         self.apply(self._init_weights)
         self.set_layer_lr_decay(layer_lr_decay)
         self.neck = nn.Sequential(
@@ -551,7 +653,7 @@ class TinyViT(nn.Module):
         """Sets the learning rate decay for each layer in the TinyViT model."""
         decay_rate = layer_lr_decay
 
-        # layers -> blocks (depth)
+        # Layers -> blocks (depth)
         depth = sum(self.depths)
         lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)]
 
ultralytics/models/sam/modules/transformer.py
@@ -10,6 +10,21 @@ from ultralytics.nn.modules import MLPBlock
 
 
 class TwoWayTransformer(nn.Module):
+    """
+    A Two-Way Transformer module that enables the simultaneous attention to both image and query points. This class
+    serves as a specialized transformer decoder that attends to an input image using queries whose positional embedding
+    is supplied. This is particularly useful for tasks like object detection, image segmentation, and point cloud
+    processing.
+
+    Attributes:
+        depth (int): The number of layers in the transformer.
+        embedding_dim (int): The channel dimension for the input embeddings.
+        num_heads (int): The number of heads for multihead attention.
+        mlp_dim (int): The internal channel dimension for the MLP block.
+        layers (nn.ModuleList): The list of TwoWayAttentionBlock layers that make up the transformer.
+        final_attn_token_to_image (Attention): The final attention layer applied from the queries to the image.
+        norm_final_attn (nn.LayerNorm): The layer normalization applied to the final queries.
+    """
 
     def __init__(
         self,
@@ -98,6 +113,23 @@ class TwoWayTransformer(nn.Module):
 
 
 class TwoWayAttentionBlock(nn.Module):
+    """
+    An attention block that performs both self-attention and cross-attention in two directions: queries to keys and
+    keys to queries. This block consists of four main layers: (1) self-attention on sparse inputs, (2) cross-attention
+    of sparse inputs to dense inputs, (3) an MLP block on sparse inputs, and (4) cross-attention of dense inputs to
+    sparse inputs.
+
+    Attributes:
+        self_attn (Attention): The self-attention layer for the queries.
+        norm1 (nn.LayerNorm): Layer normalization following the first attention block.
+        cross_attn_token_to_image (Attention): Cross-attention layer from queries to keys.
+        norm2 (nn.LayerNorm): Layer normalization following the second attention block.
+        mlp (MLPBlock): MLP block that transforms the query embeddings.
+        norm3 (nn.LayerNorm): Layer normalization following the MLP block.
+        norm4 (nn.LayerNorm): Layer normalization following the third attention block.
+        cross_attn_image_to_token (Attention): Cross-attention layer from keys to queries.
+        skip_first_layer_pe (bool): Whether to skip the positional encoding in the first layer.
+    """
 
     def __init__(
         self,
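
The TwoWayAttentionBlock docstring spells out a four-step update of sparse queries and dense keys. The following runnable sketch illustrates that pattern with stock torch.nn.MultiheadAttention layers standing in for the block's own Attention/MLPBlock modules; it is an illustration of the flow, not the block's actual implementation:

```python
# Four-step two-way attention flow, sketched with stock PyTorch layers.
import torch
import torch.nn as nn

dim, heads = 256, 8
self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
cross_token_to_image = nn.MultiheadAttention(dim, heads, batch_first=True)
cross_image_to_token = nn.MultiheadAttention(dim, heads, batch_first=True)
mlp = nn.Sequential(nn.Linear(dim, 2048), nn.ReLU(), nn.Linear(2048, dim))
norm1, norm2, norm3, norm4 = (nn.LayerNorm(dim) for _ in range(4))

queries = torch.randn(1, 5, dim)     # sparse prompt tokens
keys = torch.randn(1, 64 * 64, dim)  # flattened dense image embedding

queries = norm1(queries + self_attn(queries, queries, queries)[0])       # 1) self-attention on sparse inputs
queries = norm2(queries + cross_token_to_image(queries, keys, keys)[0])  # 2) cross-attention, sparse -> dense
queries = norm3(queries + mlp(queries))                                  # 3) MLP on sparse inputs
keys = norm4(keys + cross_image_to_token(keys, queries, queries)[0])     # 4) cross-attention, dense -> sparse
print(queries.shape, keys.shape)
```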
@@ -180,6 +212,17 @@ class Attention(nn.Module):
         num_heads: int,
         downsample_rate: int = 1,
     ) -> None:
+        """
+        Initializes the Attention model with the given dimensions and settings.
+
+        Args:
+            embedding_dim (int): The dimensionality of the input embeddings.
+            num_heads (int): The number of attention heads.
+            downsample_rate (int, optional): The factor by which the internal dimensions are downsampled. Defaults to 1.
+
+        Raises:
+            AssertionError: If 'num_heads' does not evenly divide the internal dimension (embedding_dim / downsample_rate).
+        """
         super().__init__()
         self.embedding_dim = embedding_dim
         self.internal_dim = embedding_dim // downsample_rate
@@ -191,13 +234,15 @@ class Attention(nn.Module):
         self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
         self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
 
-    def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
+    @staticmethod
+    def _separate_heads(x: Tensor, num_heads: int) -> Tensor:
         """Separate the input tensor into the specified number of attention heads."""
         b, n, c = x.shape
         x = x.reshape(b, n, num_heads, c // num_heads)
         return x.transpose(1, 2)  # B x N_heads x N_tokens x C_per_head
 
-    def _recombine_heads(self, x: Tensor) -> Tensor:
+    @staticmethod
+    def _recombine_heads(x: Tensor) -> Tensor:
         """Recombine the separated attention heads into a single tensor."""
         b, n_heads, n_tokens, c_per_head = x.shape
         x = x.transpose(1, 2)
ultralytics/models/sam/predict.py
@@ -17,6 +17,24 @@ from .build import build_sam
 
 
 class Predictor(BasePredictor):
+    """
+    A prediction class for segmentation tasks, extending the BasePredictor.
+
+    This class serves as an interface for model inference for segmentation tasks.
+    It can preprocess input images, perform inference, and postprocess the output.
+    It also supports handling various types of input prompts including bounding boxes,
+    points, and low-resolution masks for better prediction results.
+
+    Attributes:
+        cfg (dict): Configuration dictionary.
+        overrides (dict): Dictionary of overriding values.
+        _callbacks (dict): Dictionary of callback functions.
+        args (namespace): Argument namespace.
+        im (torch.Tensor): Preprocessed image for current prediction.
+        features (torch.Tensor): Image features.
+        prompts (dict): Dictionary of prompts like bboxes, points, masks.
+        segment_all (bool): Whether to perform segmentation on all objects or not.
+    """
 
     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
         """Initializes the Predictor class with default or provided configuration, overrides, and callbacks."""
@@ -396,8 +414,7 @@ class Predictor(BasePredictor):
             unchanged = unchanged and not changed
 
             new_masks.append(torch.as_tensor(mask).unsqueeze(0))
-            # Give score=0 to changed masks and score=1 to unchanged masks
-            # so NMS will prefer ones that didn't need postprocessing
+            # Give score=0 to changed masks and 1 to unchanged masks so NMS prefers masks not needing postprocessing
             scores.append(float(unchanged))
 
         # Recalculate boxes and remove any new duplicates
ultralytics/models/utils/loss.py
@@ -11,6 +11,24 @@ from .ops import HungarianMatcher
 
 
 class DETRLoss(nn.Module):
+    """
+    DETR (DEtection TRansformer) Loss class. This class calculates and returns the different loss components for the
+    DETR object detection model. It computes classification loss, bounding box loss, GIoU loss, and optionally auxiliary
+    losses.
+
+    Attributes:
+        nc (int): The number of classes.
+        loss_gain (dict): Coefficients for different loss components.
+        aux_loss (bool): Whether to compute auxiliary losses.
+        use_fl (bool): Use FocalLoss or not.
+        use_vfl (bool): Use VarifocalLoss or not.
+        use_uni_match (bool): Whether to use a fixed layer to assign labels for the auxiliary branch.
+        uni_match_ind (int): The fixed indices of a layer to use if `use_uni_match` is True.
+        matcher (HungarianMatcher): Object to compute matching cost and indices.
+        fl (FocalLoss or None): Focal Loss object if `use_fl` is True, otherwise None.
+        vfl (VarifocalLoss or None): Varifocal Loss object if `use_vfl` is True, otherwise None.
+        device (torch.device): Device on which tensors are stored.
+    """
 
     def __init__(self,
                  nc=80,
@@ -48,7 +66,7 @@ class DETRLoss(nn.Module):
 
     def _get_loss_class(self, pred_scores, targets, gt_scores, num_gts, postfix=''):
         """Computes the classification loss based on predictions, target values, and ground truth scores."""
-        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
+        # Logits: [b, query, num_classes], gt_class: list[[n, 1]]
         name_class = f'loss_class{postfix}'
         bs, nq = pred_scores.shape[:2]
         # one_hot = F.one_hot(targets, self.nc + 1)[..., :-1]  # (bs, num_queries, num_classes)
@@ -72,7 +90,7 @@ class DETRLoss(nn.Module):
         """Calculates and returns the bounding box loss and GIoU loss for the predicted and ground truth bounding
         boxes.
         """
-        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
+        # Boxes: [b, query, 4], gt_bbox: list[[n, 4]]
         name_bbox = f'loss_bbox{postfix}'
         name_giou = f'loss_giou{postfix}'
 
ultralytics/models/utils/ops.py
@@ -188,7 +188,7 @@ def get_cdn_group(batch,
 
     num_group = num_dn // max_nums
     num_group = 1 if num_group == 0 else num_group
-    # pad gt to max_num of a batch
+    # Pad gt to max_num of a batch
     bs = len(gt_groups)
     gt_cls = batch['cls']  # (bs*num, )
     gt_bbox = batch['bboxes']  # bs*num, 4
@@ -204,10 +204,10 @@ def get_cdn_group(batch,
     neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num
 
     if cls_noise_ratio > 0:
-        # half of bbox prob
+        # Half of bbox prob
         mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5)
         idx = torch.nonzero(mask).squeeze(-1)
-        # randomly put a new one here
+        # Randomly put a new one here
         new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device)
         dn_cls[idx] = new_label
 
@@ -240,9 +240,9 @@ def get_cdn_group(batch,
 
     tgt_size = num_dn + num_queries
     attn_mask = torch.zeros([tgt_size, tgt_size], dtype=torch.bool)
-    # match query cannot see the reconstruct
+    # Match query cannot see the reconstruct
     attn_mask[num_dn:, :num_dn] = True
-    # reconstruct cannot see each other
+    # Reconstruct cannot see each other
     for i in range(num_group):
         if i == 0:
             attn_mask[max_nums * 2 * i:max_nums * 2 * (i + 1), max_nums * 2 * (i + 1):num_dn] = True
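
The comments in this last hunk describe the denoising attention mask: matching queries must not attend to the denoising slots, and denoising groups must not attend to each other. Below is a small self-contained sketch of such a mask with hypothetical sizes, written with a generalized per-group block instead of the i == 0 special case shown above:

```python
# Build a contrastive-denoising attention mask of the kind the comments above describe.
import torch

max_nums, num_group, num_queries = 3, 2, 5      # hypothetical sizes
num_dn = max_nums * 2 * num_group               # positive + negative denoising slots
tgt_size = num_dn + num_queries
attn_mask = torch.zeros(tgt_size, tgt_size, dtype=torch.bool)  # True = attention blocked

attn_mask[num_dn:, :num_dn] = True              # match queries cannot see the reconstruct part
for i in range(num_group):                      # each denoising group only sees itself
    start, end = max_nums * 2 * i, max_nums * 2 * (i + 1)
    attn_mask[start:end, :start] = True
    attn_mask[start:end, end:num_dn] = True
print(attn_mask.int())
```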