ultralytics 8.3.143__py3-none-any.whl → 8.3.145__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/conftest.py +7 -24
- tests/test_cli.py +1 -1
- tests/test_cuda.py +7 -2
- tests/test_engine.py +7 -8
- tests/test_exports.py +16 -16
- tests/test_integrations.py +1 -1
- tests/test_solutions.py +11 -11
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +16 -13
- ultralytics/data/annotator.py +6 -5
- ultralytics/data/augment.py +127 -126
- ultralytics/data/base.py +54 -51
- ultralytics/data/build.py +47 -23
- ultralytics/data/converter.py +47 -43
- ultralytics/data/dataset.py +51 -50
- ultralytics/data/loaders.py +77 -44
- ultralytics/data/split.py +22 -9
- ultralytics/data/split_dota.py +63 -39
- ultralytics/data/utils.py +59 -39
- ultralytics/engine/exporter.py +79 -27
- ultralytics/engine/model.py +52 -51
- ultralytics/engine/predictor.py +37 -28
- ultralytics/engine/results.py +191 -161
- ultralytics/engine/trainer.py +36 -19
- ultralytics/engine/tuner.py +12 -9
- ultralytics/engine/validator.py +7 -9
- ultralytics/hub/__init__.py +11 -13
- ultralytics/hub/auth.py +22 -2
- ultralytics/hub/google/__init__.py +19 -19
- ultralytics/hub/session.py +37 -51
- ultralytics/hub/utils.py +19 -5
- ultralytics/models/fastsam/model.py +30 -12
- ultralytics/models/fastsam/predict.py +5 -6
- ultralytics/models/fastsam/utils.py +3 -3
- ultralytics/models/fastsam/val.py +10 -6
- ultralytics/models/nas/model.py +9 -5
- ultralytics/models/nas/predict.py +6 -6
- ultralytics/models/nas/val.py +3 -3
- ultralytics/models/rtdetr/model.py +7 -6
- ultralytics/models/rtdetr/predict.py +14 -7
- ultralytics/models/rtdetr/train.py +10 -4
- ultralytics/models/rtdetr/val.py +36 -9
- ultralytics/models/sam/amg.py +30 -12
- ultralytics/models/sam/build.py +22 -22
- ultralytics/models/sam/model.py +10 -9
- ultralytics/models/sam/modules/blocks.py +76 -80
- ultralytics/models/sam/modules/decoders.py +6 -8
- ultralytics/models/sam/modules/encoders.py +23 -26
- ultralytics/models/sam/modules/memory_attention.py +13 -1
- ultralytics/models/sam/modules/sam.py +57 -26
- ultralytics/models/sam/modules/tiny_encoder.py +232 -237
- ultralytics/models/sam/modules/transformer.py +13 -13
- ultralytics/models/sam/modules/utils.py +11 -19
- ultralytics/models/sam/predict.py +114 -101
- ultralytics/models/utils/loss.py +98 -77
- ultralytics/models/utils/ops.py +116 -67
- ultralytics/models/yolo/classify/predict.py +5 -5
- ultralytics/models/yolo/classify/train.py +32 -28
- ultralytics/models/yolo/classify/val.py +7 -8
- ultralytics/models/yolo/detect/predict.py +1 -0
- ultralytics/models/yolo/detect/train.py +15 -14
- ultralytics/models/yolo/detect/val.py +37 -36
- ultralytics/models/yolo/model.py +106 -23
- ultralytics/models/yolo/obb/predict.py +3 -4
- ultralytics/models/yolo/obb/train.py +14 -6
- ultralytics/models/yolo/obb/val.py +29 -23
- ultralytics/models/yolo/pose/predict.py +9 -8
- ultralytics/models/yolo/pose/train.py +24 -16
- ultralytics/models/yolo/pose/val.py +44 -26
- ultralytics/models/yolo/segment/predict.py +5 -5
- ultralytics/models/yolo/segment/train.py +11 -7
- ultralytics/models/yolo/segment/val.py +2 -2
- ultralytics/models/yolo/world/train.py +33 -23
- ultralytics/models/yolo/world/train_world.py +11 -3
- ultralytics/models/yolo/yoloe/predict.py +11 -11
- ultralytics/models/yolo/yoloe/train.py +73 -21
- ultralytics/models/yolo/yoloe/train_seg.py +10 -7
- ultralytics/models/yolo/yoloe/val.py +42 -18
- ultralytics/nn/autobackend.py +59 -15
- ultralytics/nn/modules/__init__.py +4 -4
- ultralytics/nn/modules/activation.py +4 -1
- ultralytics/nn/modules/block.py +178 -111
- ultralytics/nn/modules/conv.py +6 -5
- ultralytics/nn/modules/head.py +469 -121
- ultralytics/nn/modules/transformer.py +147 -58
- ultralytics/nn/tasks.py +227 -20
- ultralytics/nn/text_model.py +30 -33
- ultralytics/solutions/ai_gym.py +4 -6
- ultralytics/solutions/analytics.py +7 -4
- ultralytics/solutions/config.py +10 -10
- ultralytics/solutions/distance_calculation.py +11 -10
- ultralytics/solutions/heatmap.py +2 -2
- ultralytics/solutions/instance_segmentation.py +7 -4
- ultralytics/solutions/object_blurrer.py +3 -3
- ultralytics/solutions/object_counter.py +15 -11
- ultralytics/solutions/object_cropper.py +3 -2
- ultralytics/solutions/parking_management.py +29 -28
- ultralytics/solutions/queue_management.py +6 -6
- ultralytics/solutions/region_counter.py +10 -3
- ultralytics/solutions/security_alarm.py +3 -3
- ultralytics/solutions/similarity_search.py +85 -24
- ultralytics/solutions/solutions.py +189 -79
- ultralytics/solutions/speed_estimation.py +28 -22
- ultralytics/solutions/streamlit_inference.py +17 -12
- ultralytics/solutions/trackzone.py +4 -4
- ultralytics/trackers/basetrack.py +16 -23
- ultralytics/trackers/bot_sort.py +30 -20
- ultralytics/trackers/byte_tracker.py +70 -64
- ultralytics/trackers/track.py +4 -8
- ultralytics/trackers/utils/gmc.py +31 -58
- ultralytics/trackers/utils/kalman_filter.py +37 -37
- ultralytics/trackers/utils/matching.py +1 -1
- ultralytics/utils/__init__.py +105 -89
- ultralytics/utils/autobatch.py +16 -3
- ultralytics/utils/autodevice.py +54 -24
- ultralytics/utils/benchmarks.py +45 -29
- ultralytics/utils/callbacks/base.py +3 -3
- ultralytics/utils/callbacks/clearml.py +9 -9
- ultralytics/utils/callbacks/comet.py +67 -25
- ultralytics/utils/callbacks/dvc.py +7 -10
- ultralytics/utils/callbacks/mlflow.py +2 -5
- ultralytics/utils/callbacks/neptune.py +7 -13
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/callbacks/tensorboard.py +5 -6
- ultralytics/utils/callbacks/wb.py +14 -14
- ultralytics/utils/checks.py +14 -13
- ultralytics/utils/dist.py +5 -5
- ultralytics/utils/downloads.py +94 -67
- ultralytics/utils/errors.py +5 -5
- ultralytics/utils/export.py +61 -47
- ultralytics/utils/files.py +23 -22
- ultralytics/utils/instance.py +48 -52
- ultralytics/utils/loss.py +78 -40
- ultralytics/utils/metrics.py +186 -130
- ultralytics/utils/ops.py +186 -190
- ultralytics/utils/patches.py +15 -17
- ultralytics/utils/plotting.py +71 -27
- ultralytics/utils/tal.py +21 -15
- ultralytics/utils/torch_utils.py +53 -50
- ultralytics/utils/triton.py +5 -4
- ultralytics/utils/tuner.py +5 -5
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.145.dist-info}/METADATA +2 -2
- ultralytics-8.3.145.dist-info/RECORD +272 -0
- ultralytics-8.3.143.dist-info/RECORD +0 -272
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.145.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.145.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.145.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.143.dist-info → ultralytics-8.3.145.dist-info}/top_level.txt +0 -0
ultralytics/models/utils/loss.py
CHANGED
@@ -1,5 +1,7 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+from typing import Any, Dict, List, Optional, Tuple
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -19,7 +21,7 @@ class DETRLoss(nn.Module):

     Attributes:
         nc (int): Number of classes.
-        loss_gain (
+        loss_gain (Dict[str, float]): Coefficients for different loss components.
         aux_loss (bool): Whether to compute auxiliary losses.
         use_fl (bool): Whether to use FocalLoss.
         use_vfl (bool): Whether to use VarifocalLoss.
@@ -33,15 +35,15 @@ class DETRLoss(nn.Module):

     def __init__(
         self,
-        nc=80,
-        loss_gain=None,
-        aux_loss=True,
-        use_fl=True,
-        use_vfl=False,
-        use_uni_match=False,
-        uni_match_ind=0,
-        gamma=1.5,
-        alpha=0.25,
+        nc: int = 80,
+        loss_gain: Optional[Dict[str, float]] = None,
+        aux_loss: bool = True,
+        use_fl: bool = True,
+        use_vfl: bool = False,
+        use_uni_match: bool = False,
+        uni_match_ind: int = 0,
+        gamma: float = 1.5,
+        alpha: float = 0.25,
     ):
         """
         Initialize DETR loss function with customizable components and gains.
@@ -51,14 +53,14 @@ class DETRLoss(nn.Module):

         Args:
             nc (int): Number of classes.
-            loss_gain (
+            loss_gain (Dict[str, float], optional): Coefficients for different loss components.
             aux_loss (bool): Whether to use auxiliary losses from each decoder layer.
             use_fl (bool): Whether to use FocalLoss.
             use_vfl (bool): Whether to use VarifocalLoss.
             use_uni_match (bool): Whether to use fixed layer for auxiliary branch label assignment.
             uni_match_ind (int): Index of fixed layer for uni_match.
             gamma (float): The focusing parameter that controls how much the loss focuses on hard-to-classify examples.
-            alpha (float
+            alpha (float): The balancing factor used to address class imbalance.
         """
         super().__init__()

@@ -75,19 +77,21 @@ class DETRLoss(nn.Module):
         self.uni_match_ind = uni_match_ind
         self.device = None

-    def _get_loss_class(
+    def _get_loss_class(
+        self, pred_scores: torch.Tensor, targets: torch.Tensor, gt_scores: torch.Tensor, num_gts: int, postfix: str = ""
+    ) -> Dict[str, torch.Tensor]:
         """
         Compute classification loss based on predictions, target values, and ground truth scores.

         Args:
-            pred_scores (torch.Tensor): Predicted class scores with shape (
-            targets (torch.Tensor): Target class indices with shape (
-            gt_scores (torch.Tensor): Ground truth confidence scores with shape (
+            pred_scores (torch.Tensor): Predicted class scores with shape (B, N, C).
+            targets (torch.Tensor): Target class indices with shape (B, N).
+            gt_scores (torch.Tensor): Ground truth confidence scores with shape (B, N).
             num_gts (int): Number of ground truth objects.
             postfix (str, optional): String to append to the loss name for identification in multi-loss scenarios.

         Returns:
-
+            (Dict[str, torch.Tensor]): Dictionary containing classification loss value.

         Notes:
             The function supports different classification loss types:
@@ -115,22 +119,21 @@ class DETRLoss(nn.Module):

         return {name_class: loss_cls.squeeze() * self.loss_gain["class"]}

-    def _get_loss_bbox(
+    def _get_loss_bbox(
+        self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, postfix: str = ""
+    ) -> Dict[str, torch.Tensor]:
         """
         Compute bounding box and GIoU losses for predicted and ground truth bounding boxes.

         Args:
-            pred_bboxes (torch.Tensor): Predicted bounding boxes with shape (
-            gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape (N, 4)
-
-            postfix (str): String to append to the loss names for identification in multi-loss scenarios.
+            pred_bboxes (torch.Tensor): Predicted bounding boxes with shape (N, 4).
+            gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape (N, 4).
+            postfix (str, optional): String to append to the loss names for identification in multi-loss scenarios.

         Returns:
-
-                - loss_bbox{postfix}
-
-                - loss_giou{postfix} (torch.Tensor): GIoU loss between predicted and ground truth boxes,
-                    scaled by the giou loss gain.
+            (Dict[str, torch.Tensor]): Dictionary containing:
+                - loss_bbox{postfix}: L1 loss between predicted and ground truth boxes, scaled by the bbox loss gain.
+                - loss_giou{postfix}: GIoU loss between predicted and ground truth boxes, scaled by the giou loss gain.

         Notes:
             If no ground truth boxes are provided (empty list), zero-valued tensors are returned for both losses.
@@ -184,16 +187,16 @@ class DETRLoss(nn.Module):

     def _get_loss_aux(
         self,
-        pred_bboxes,
-        pred_scores,
-        gt_bboxes,
-        gt_cls,
-        gt_groups,
-        match_indices=None,
-        postfix="",
-        masks=None,
-        gt_mask=None,
-    ):
+        pred_bboxes: torch.Tensor,
+        pred_scores: torch.Tensor,
+        gt_bboxes: torch.Tensor,
+        gt_cls: torch.Tensor,
+        gt_groups: List[int],
+        match_indices: Optional[List[Tuple]] = None,
+        postfix: str = "",
+        masks: Optional[torch.Tensor] = None,
+        gt_mask: Optional[torch.Tensor] = None,
+    ) -> Dict[str, torch.Tensor]:
         """
         Get auxiliary losses for intermediate decoder layers.

@@ -203,13 +206,13 @@ class DETRLoss(nn.Module):
             gt_bboxes (torch.Tensor): Ground truth bounding boxes.
             gt_cls (torch.Tensor): Ground truth classes.
             gt_groups (List[int]): Number of ground truths per image.
-            match_indices (List[
-            postfix (str): String to append to loss names.
+            match_indices (List[Tuple], optional): Pre-computed matching indices.
+            postfix (str, optional): String to append to loss names.
             masks (torch.Tensor, optional): Predicted masks if using segmentation.
             gt_mask (torch.Tensor, optional): Ground truth masks if using segmentation.

         Returns:
-            (
+            (Dict[str, torch.Tensor]): Dictionary of auxiliary losses.
         """
         # NOTE: loss class, bbox, giou, mask, dice
         loss = torch.zeros(5 if masks is not None else 3, device=pred_bboxes.device)
@@ -255,32 +258,36 @@ class DETRLoss(nn.Module):
         return loss

     @staticmethod
-    def _get_index(match_indices):
+    def _get_index(match_indices: List[Tuple]) -> Tuple[Tuple[torch.Tensor, torch.Tensor], torch.Tensor]:
         """
         Extract batch indices, source indices, and destination indices from match indices.

         Args:
-            match_indices (List[
+            match_indices (List[Tuple]): List of tuples containing matched indices.

         Returns:
-            (
+            batch_idx (Tuple[torch.Tensor, torch.Tensor]): Tuple containing (batch_idx, src_idx).
+            dst_idx (torch.Tensor): Destination indices.
         """
         batch_idx = torch.cat([torch.full_like(src, i) for i, (src, _) in enumerate(match_indices)])
         src_idx = torch.cat([src for (src, _) in match_indices])
         dst_idx = torch.cat([dst for (_, dst) in match_indices])
         return (batch_idx, src_idx), dst_idx

-    def _get_assigned_bboxes(
+    def _get_assigned_bboxes(
+        self, pred_bboxes: torch.Tensor, gt_bboxes: torch.Tensor, match_indices: List[Tuple]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
         """
         Assign predicted bounding boxes to ground truth bounding boxes based on match indices.

         Args:
             pred_bboxes (torch.Tensor): Predicted bounding boxes.
             gt_bboxes (torch.Tensor): Ground truth bounding boxes.
-            match_indices (List[
+            match_indices (List[Tuple]): List of tuples containing matched indices.

         Returns:
-            (
+            pred_assigned (torch.Tensor): Assigned predicted bounding boxes.
+            gt_assigned (torch.Tensor): Assigned ground truth bounding boxes.
         """
         pred_assigned = torch.cat(
             [
@@ -298,16 +305,16 @@ class DETRLoss(nn.Module):

     def _get_loss(
         self,
-        pred_bboxes,
-        pred_scores,
-        gt_bboxes,
-        gt_cls,
-        gt_groups,
-        masks=None,
-        gt_mask=None,
-        postfix="",
-        match_indices=None,
-    ):
+        pred_bboxes: torch.Tensor,
+        pred_scores: torch.Tensor,
+        gt_bboxes: torch.Tensor,
+        gt_cls: torch.Tensor,
+        gt_groups: List[int],
+        masks: Optional[torch.Tensor] = None,
+        gt_mask: Optional[torch.Tensor] = None,
+        postfix: str = "",
+        match_indices: Optional[List[Tuple]] = None,
+    ) -> Dict[str, torch.Tensor]:
         """
         Calculate losses for a single prediction layer.

@@ -319,11 +326,11 @@ class DETRLoss(nn.Module):
             gt_groups (List[int]): Number of ground truths per image.
             masks (torch.Tensor, optional): Predicted masks if using segmentation.
             gt_mask (torch.Tensor, optional): Ground truth masks if using segmentation.
-            postfix (str): String to append to loss names.
-            match_indices (List[
+            postfix (str, optional): String to append to loss names.
+            match_indices (List[Tuple], optional): Pre-computed matching indices.

         Returns:
-            (
+            (Dict[str, torch.Tensor]): Dictionary of losses.
         """
         if match_indices is None:
             match_indices = self.matcher(
@@ -347,22 +354,26 @@ class DETRLoss(nn.Module):
             # **(self._get_loss_mask(masks, gt_mask, match_indices, postfix) if masks is not None and gt_mask is not None else {})
         }

-    def forward(
+    def forward(
+        self,
+        pred_bboxes: torch.Tensor,
+        pred_scores: torch.Tensor,
+        batch: Dict[str, Any],
+        postfix: str = "",
+        **kwargs: Any,
+    ) -> Dict[str, torch.Tensor]:
         """
         Calculate loss for predicted bounding boxes and scores.

         Args:
-            pred_bboxes (torch.Tensor): Predicted bounding boxes, shape
-            pred_scores (torch.Tensor): Predicted class scores, shape
-            batch (
-
-                bboxes (torch.Tensor): Ground truth bounding boxes, shape [num_gts, 4].
-                gt_groups (List[int]): Number of ground truths for each image in the batch.
-            postfix (str): Postfix for loss names.
+            pred_bboxes (torch.Tensor): Predicted bounding boxes, shape (L, B, N, 4).
+            pred_scores (torch.Tensor): Predicted class scores, shape (L, B, N, C).
+            batch (Dict[str, Any]): Batch information containing cls, bboxes, and gt_groups.
+            postfix (str, optional): Postfix for loss names.
             **kwargs (Any): Additional arguments, may include 'match_indices'.

         Returns:
-            (
+            (Dict[str, torch.Tensor]): Computed losses, including main and auxiliary (if enabled).

         Notes:
             Uses last elements of pred_bboxes and pred_scores for main loss, and the rest for auxiliary losses if
@@ -394,19 +405,26 @@ class RTDETRDetectionLoss(DETRLoss):
    an additional denoising training loss when provided with denoising metadata.
    """

-    def forward(
+    def forward(
+        self,
+        preds: Tuple[torch.Tensor, torch.Tensor],
+        batch: Dict[str, Any],
+        dn_bboxes: Optional[torch.Tensor] = None,
+        dn_scores: Optional[torch.Tensor] = None,
+        dn_meta: Optional[Dict[str, Any]] = None,
+    ) -> Dict[str, torch.Tensor]:
         """
         Forward pass to compute detection loss with optional denoising loss.

         Args:
-            preds (
-            batch (
+            preds (Tuple[torch.Tensor, torch.Tensor]): Tuple containing predicted bounding boxes and scores.
+            batch (Dict[str, Any]): Batch data containing ground truth information.
             dn_bboxes (torch.Tensor, optional): Denoising bounding boxes.
             dn_scores (torch.Tensor, optional): Denoising scores.
-            dn_meta (
+            dn_meta (Dict[str, Any], optional): Metadata for denoising.

         Returns:
-            (
+            (Dict[str, torch.Tensor]): Dictionary containing total loss and denoising loss if applicable.
         """
         pred_bboxes, pred_scores = preds
         total_loss = super().forward(pred_bboxes, pred_scores, batch)
@@ -429,7 +447,9 @@ class RTDETRDetectionLoss(DETRLoss):
         return total_loss

     @staticmethod
-    def get_dn_match_indices(
+    def get_dn_match_indices(
+        dn_pos_idx: List[torch.Tensor], dn_num_group: int, gt_groups: List[int]
+    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
         """
         Get match indices for denoising.

@@ -439,7 +459,7 @@ class RTDETRDetectionLoss(DETRLoss):
             gt_groups (List[int]): List of integers representing number of ground truths per image.

         Returns:
-            (List[
+            (List[Tuple[torch.Tensor, torch.Tensor]]): List of tuples containing matched indices for denoising.
         """
         dn_match_indices = []
         idx_groups = torch.as_tensor([0, *gt_groups[:-1]]).cumsum_(0)
@@ -447,8 +467,9 @@ class RTDETRDetectionLoss(DETRLoss):
             if num_gt > 0:
                 gt_idx = torch.arange(end=num_gt, dtype=torch.long) + idx_groups[i]
                 gt_idx = gt_idx.repeat(dn_num_group)
-                assert len(dn_pos_idx[i]) == len(gt_idx),
-
+                assert len(dn_pos_idx[i]) == len(gt_idx), (
+                    f"Expected the same length, but got {len(dn_pos_idx[i])} and {len(gt_idx)} respectively."
+                )
                 dn_match_indices.append((dn_pos_idx[i], gt_idx))
             else:
                 dn_match_indices.append((torch.zeros([0], dtype=torch.long), torch.zeros([0], dtype=torch.long)))
ultralytics/models/utils/ops.py
CHANGED
@@ -1,5 +1,7 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+from typing import Any, Dict, List, Optional, Tuple
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -11,40 +13,58 @@ from ultralytics.utils.ops import xywh2xyxy, xyxy2xywh

 class HungarianMatcher(nn.Module):
     """
-    A module implementing the HungarianMatcher
-    end-to-end fashion.
+    A module implementing the HungarianMatcher for optimal assignment between predictions and ground truth.

-    HungarianMatcher performs optimal assignment over
-    function that considers classification scores, bounding box coordinates, and optionally
+    HungarianMatcher performs optimal bipartite assignment over predicted and ground truth bounding boxes using a cost
+    function that considers classification scores, bounding box coordinates, and optionally mask predictions. This is
+    used in end-to-end object detection models like DETR.

     Attributes:
-        cost_gain (
-
-
-
-
-
+        cost_gain (Dict[str, float]): Dictionary of cost coefficients for 'class', 'bbox', 'giou', 'mask', and 'dice'
+            components.
+        use_fl (bool): Whether to use Focal Loss for classification cost calculation.
+        with_mask (bool): Whether the model makes mask predictions.
+        num_sample_points (int): Number of sample points used in mask cost calculation.
+        alpha (float): Alpha factor in Focal Loss calculation.
+        gamma (float): Gamma factor in Focal Loss calculation.

     Methods:
-        forward:
-        _cost_mask:
+        forward: Compute optimal assignment between predictions and ground truths for a batch.
+        _cost_mask: Compute mask cost and dice cost if masks are predicted.
+
+    Examples:
+        Initialize a HungarianMatcher with custom cost gains
+        >>> matcher = HungarianMatcher(cost_gain={"class": 2, "bbox": 5, "giou": 2})
+
+        Perform matching between predictions and ground truth
+        >>> pred_boxes = torch.rand(2, 100, 4)  # batch_size=2, num_queries=100
+        >>> pred_scores = torch.rand(2, 100, 80)  # 80 classes
+        >>> gt_boxes = torch.rand(10, 4)  # 10 ground truth boxes
+        >>> gt_classes = torch.randint(0, 80, (10,))
+        >>> gt_groups = [5, 5]  # 5 GT boxes per image
+        >>> indices = matcher(pred_boxes, pred_scores, gt_boxes, gt_classes, gt_groups)
     """

-    def __init__(
+    def __init__(
+        self,
+        cost_gain: Optional[Dict[str, float]] = None,
+        use_fl: bool = True,
+        with_mask: bool = False,
+        num_sample_points: int = 12544,
+        alpha: float = 0.25,
+        gamma: float = 2.0,
+    ):
         """
-        Initialize
-
-        The HungarianMatcher uses a cost function that considers classification scores, bounding box coordinates,
-        and optionally mask predictions to perform optimal bipartite matching between predictions and ground truths.
+        Initialize HungarianMatcher for optimal assignment of predicted and ground truth bounding boxes.

         Args:
-            cost_gain (
-                Should contain keys 'class', 'bbox', 'giou', 'mask', and 'dice'.
-            use_fl (bool
-            with_mask (bool
-            num_sample_points (int
-            alpha (float
-            gamma (float
+            cost_gain (Dict[str, float], optional): Dictionary of cost coefficients for different matching cost
+                components. Should contain keys 'class', 'bbox', 'giou', 'mask', and 'dice'.
+            use_fl (bool): Whether to use Focal Loss for classification cost calculation.
+            with_mask (bool): Whether the model makes mask predictions.
+            num_sample_points (int): Number of sample points used in mask cost calculation.
+            alpha (float): Alpha factor in Focal Loss calculation.
+            gamma (float): Gamma factor in Focal Loss calculation.
         """
         super().__init__()
         if cost_gain is None:
@@ -56,41 +76,49 @@ class HungarianMatcher(nn.Module):
         self.alpha = alpha
         self.gamma = gamma

-    def forward(
+    def forward(
+        self,
+        pred_bboxes: torch.Tensor,
+        pred_scores: torch.Tensor,
+        gt_bboxes: torch.Tensor,
+        gt_cls: torch.Tensor,
+        gt_groups: List[int],
+        masks: Optional[torch.Tensor] = None,
+        gt_mask: Optional[List[torch.Tensor]] = None,
+    ) -> List[Tuple[torch.Tensor, torch.Tensor]]:
         """
-
-
+        Compute optimal assignment between predictions and ground truth using Hungarian algorithm.
+
+        This method calculates matching costs based on classification scores, bounding box coordinates, and optionally
+        mask predictions, then finds the optimal bipartite assignment between predictions and ground truth.

         Args:
             pred_bboxes (torch.Tensor): Predicted bounding boxes with shape (batch_size, num_queries, 4).
-            pred_scores (torch.Tensor): Predicted scores with shape (batch_size, num_queries,
-
+            pred_scores (torch.Tensor): Predicted classification scores with shape (batch_size, num_queries,
+                num_classes).
             gt_bboxes (torch.Tensor): Ground truth bounding boxes with shape (num_gts, 4).
-
-
+            gt_cls (torch.Tensor): Ground truth class labels with shape (num_gts,).
+            gt_groups (List[int]): Number of ground truth boxes for each image in the batch.
             masks (torch.Tensor, optional): Predicted masks with shape (batch_size, num_queries, height, width).
-            gt_mask (List[torch.Tensor], optional):
+            gt_mask (List[torch.Tensor], optional): Ground truth masks, each with shape (num_masks, Height, Width).

         Returns:
-            (List[Tuple[torch.Tensor, torch.Tensor]]): A list of size batch_size, each element is a tuple
-
-
-                For each batch element, it holds:
-                    len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+            (List[Tuple[torch.Tensor, torch.Tensor]]): A list of size batch_size, each element is a tuple
+                (index_i, index_j), where index_i is the tensor of indices of the selected predictions (in order)
+                and index_j is the tensor of indices of the corresponding selected ground truth targets (in order).
+                For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes).
         """
         bs, nq, nc = pred_scores.shape

         if sum(gt_groups) == 0:
             return [(torch.tensor([], dtype=torch.long), torch.tensor([], dtype=torch.long)) for _ in range(bs)]

-        #
-        # (batch_size * num_queries, num_classes)
+        # Flatten to compute cost matrices in batch format
         pred_scores = pred_scores.detach().view(-1, nc)
         pred_scores = F.sigmoid(pred_scores) if self.use_fl else F.softmax(pred_scores, dim=-1)
-        # (batch_size * num_queries, 4)
         pred_bboxes = pred_bboxes.detach().view(-1, 4)

-        # Compute
+        # Compute classification cost
         pred_scores = pred_scores[:, gt_cls]
         if self.use_fl:
             neg_cost_class = (1 - self.alpha) * (pred_scores**self.gamma) * (-(1 - pred_scores + 1e-8).log())
@@ -99,23 +127,24 @@ class HungarianMatcher(nn.Module):
         else:
             cost_class = -pred_scores

-        # Compute
+        # Compute L1 cost between boxes
         cost_bbox = (pred_bboxes.unsqueeze(1) - gt_bboxes.unsqueeze(0)).abs().sum(-1)  # (bs*num_queries, num_gt)

-        # Compute
+        # Compute GIoU cost between boxes, (bs*num_queries, num_gt)
         cost_giou = 1.0 - bbox_iou(pred_bboxes.unsqueeze(1), gt_bboxes.unsqueeze(0), xywh=True, GIoU=True).squeeze(-1)

-        #
+        # Combine costs into final cost matrix
         C = (
             self.cost_gain["class"] * cost_class
             + self.cost_gain["bbox"] * cost_bbox
             + self.cost_gain["giou"] * cost_giou
         )
-
+
+        # Add mask costs if available
         if self.with_mask:
             C += self._cost_mask(bs, gt_groups, masks, gt_mask)

-        # Set invalid values (NaNs and infinities) to 0
+        # Set invalid values (NaNs and infinities) to 0
         C[C.isnan() | C.isinf()] = 0.0

         C = C.view(bs, nq, -1).cpu()
@@ -158,28 +187,49 @@


 def get_cdn_group(
-    batch
-
+    batch: Dict[str, Any],
+    num_classes: int,
+    num_queries: int,
+    class_embed: torch.Tensor,
+    num_dn: int = 100,
+    cls_noise_ratio: float = 0.5,
+    box_noise_scale: float = 1.0,
+    training: bool = False,
+) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor], Optional[Dict[str, Any]]]:
     """
-
+    Generate contrastive denoising training group with positive and negative samples from ground truths.
+
+    This function creates denoising queries for contrastive denoising training by adding noise to ground truth
+    bounding boxes and class labels. It generates both positive and negative samples to improve model robustness.

     Args:
-        batch (
-            (torch.Tensor with shape (num_gts, 4)), 'gt_groups' (List[int])
-
-        num_classes (int):
-        num_queries (int): Number of queries.
-        class_embed (torch.Tensor):
-        num_dn (int
-        cls_noise_ratio (float
-        box_noise_scale (float
-        training (bool
+        batch (Dict[str, Any]): Batch dictionary containing 'gt_cls' (torch.Tensor with shape (num_gts,)),
+            'gt_bboxes' (torch.Tensor with shape (num_gts, 4)), and 'gt_groups' (List[int]) indicating number of
+            ground truths per image.
+        num_classes (int): Total number of object classes.
+        num_queries (int): Number of object queries.
+        class_embed (torch.Tensor): Class embedding weights to map labels to embedding space.
+        num_dn (int): Number of denoising queries to generate.
+        cls_noise_ratio (float): Noise ratio for class labels.
+        box_noise_scale (float): Noise scale for bounding box coordinates.
+        training (bool): Whether model is in training mode.

     Returns:
-        padding_cls (
-        padding_bbox (
-        attn_mask (
-        dn_meta (
+        padding_cls (torch.Tensor | None): Modified class embeddings for denoising with shape (bs, num_dn, embed_dim).
+        padding_bbox (torch.Tensor | None): Modified bounding boxes for denoising with shape (bs, num_dn, 4).
+        attn_mask (torch.Tensor | None): Attention mask for denoising with shape (tgt_size, tgt_size).
+        dn_meta (Dict[str, Any] | None): Meta information dictionary containing denoising parameters.
+
+    Examples:
+        Generate denoising group for training
+        >>> batch = {
+        ...     "cls": torch.tensor([0, 1, 2]),
+        ...     "bboxes": torch.rand(3, 4),
+        ...     "batch_idx": torch.tensor([0, 0, 1]),
+        ...     "gt_groups": [2, 1],
+        ... }
+        >>> class_embed = torch.rand(80, 256)  # 80 classes, 256 embedding dim
+        >>> cdn_outputs = get_cdn_group(batch, 80, 100, class_embed, training=True)
     """
     if (not training) or num_dn <= 0 or batch is None:
         return None, None, None, None
@@ -197,7 +247,7 @@ def get_cdn_group(
     gt_bbox = batch["bboxes"]  # bs*num, 4
     b_idx = batch["batch_idx"]

-    # Each group has positive and negative queries
+    # Each group has positive and negative queries
     dn_cls = gt_cls.repeat(2 * num_group)  # (2*num_group*bs*num, )
     dn_bbox = gt_bbox.repeat(2 * num_group, 1)  # 2*num_group*bs*num, 4
     dn_b_idx = b_idx.repeat(2 * num_group).view(-1)  # (2*num_group*bs*num, )
@@ -207,10 +257,10 @@ def get_cdn_group(
     neg_idx = torch.arange(total_num * num_group, dtype=torch.long, device=gt_bbox.device) + num_group * total_num

     if cls_noise_ratio > 0:
-        #
+        # Apply class label noise to half of the samples
         mask = torch.rand(dn_cls.shape) < (cls_noise_ratio * 0.5)
         idx = torch.nonzero(mask).squeeze(-1)
-        # Randomly
+        # Randomly assign new class labels
         new_label = torch.randint_like(idx, 0, num_classes, dtype=dn_cls.dtype, device=dn_cls.device)
         dn_cls[idx] = new_label

@@ -229,7 +279,6 @@ def get_cdn_group(
     dn_bbox = torch.logit(dn_bbox, eps=1e-6)  # inverse sigmoid

     num_dn = int(max_nums * 2 * num_group)  # total denoising queries
-    # class_embed = torch.cat([class_embed, torch.zeros([1, class_embed.shape[-1]], device=class_embed.device)])
     dn_cls_embed = class_embed[dn_cls]  # bs*num * 2 * num_group, 256
     padding_cls = torch.zeros(bs, num_dn, dn_cls_embed.shape[-1], device=gt_cls.device)
     padding_bbox = torch.zeros(bs, num_dn, 4, device=gt_bbox.device)