ultralytics-opencv-headless 8.3.253__py3-none-any.whl → 8.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. tests/__init__.py +2 -2
  2. tests/conftest.py +1 -1
  3. tests/test_cuda.py +8 -2
  4. tests/test_engine.py +6 -6
  5. tests/test_exports.py +10 -3
  6. tests/test_integrations.py +9 -9
  7. tests/test_python.py +14 -14
  8. tests/test_solutions.py +3 -3
  9. ultralytics/__init__.py +1 -1
  10. ultralytics/cfg/__init__.py +6 -6
  11. ultralytics/cfg/default.yaml +3 -1
  12. ultralytics/cfg/models/26/yolo26-cls.yaml +33 -0
  13. ultralytics/cfg/models/26/yolo26-obb.yaml +52 -0
  14. ultralytics/cfg/models/26/yolo26-p2.yaml +60 -0
  15. ultralytics/cfg/models/26/yolo26-p6.yaml +60 -0
  16. ultralytics/cfg/models/26/yolo26-pose.yaml +53 -0
  17. ultralytics/cfg/models/26/yolo26-seg.yaml +52 -0
  18. ultralytics/cfg/models/26/yolo26.yaml +52 -0
  19. ultralytics/cfg/models/26/yoloe-26-seg.yaml +53 -0
  20. ultralytics/cfg/models/26/yoloe-26.yaml +53 -0
  21. ultralytics/data/augment.py +7 -0
  22. ultralytics/data/dataset.py +1 -1
  23. ultralytics/engine/exporter.py +10 -3
  24. ultralytics/engine/model.py +1 -1
  25. ultralytics/engine/trainer.py +40 -15
  26. ultralytics/engine/tuner.py +15 -7
  27. ultralytics/models/fastsam/predict.py +1 -1
  28. ultralytics/models/yolo/detect/train.py +3 -2
  29. ultralytics/models/yolo/detect/val.py +6 -0
  30. ultralytics/models/yolo/model.py +1 -1
  31. ultralytics/models/yolo/obb/predict.py +1 -1
  32. ultralytics/models/yolo/obb/train.py +1 -1
  33. ultralytics/models/yolo/pose/train.py +1 -1
  34. ultralytics/models/yolo/segment/predict.py +1 -1
  35. ultralytics/models/yolo/segment/train.py +1 -1
  36. ultralytics/models/yolo/segment/val.py +3 -1
  37. ultralytics/models/yolo/yoloe/train.py +6 -1
  38. ultralytics/models/yolo/yoloe/train_seg.py +6 -1
  39. ultralytics/nn/autobackend.py +7 -3
  40. ultralytics/nn/modules/__init__.py +8 -0
  41. ultralytics/nn/modules/block.py +127 -8
  42. ultralytics/nn/modules/head.py +818 -205
  43. ultralytics/nn/tasks.py +74 -29
  44. ultralytics/nn/text_model.py +5 -2
  45. ultralytics/optim/__init__.py +5 -0
  46. ultralytics/optim/muon.py +338 -0
  47. ultralytics/utils/benchmarks.py +1 -0
  48. ultralytics/utils/callbacks/platform.py +9 -7
  49. ultralytics/utils/downloads.py +3 -1
  50. ultralytics/utils/export/engine.py +19 -10
  51. ultralytics/utils/export/imx.py +22 -11
  52. ultralytics/utils/export/tensorflow.py +1 -41
  53. ultralytics/utils/loss.py +584 -203
  54. ultralytics/utils/metrics.py +1 -0
  55. ultralytics/utils/ops.py +11 -2
  56. ultralytics/utils/tal.py +98 -19
  57. {ultralytics_opencv_headless-8.3.253.dist-info → ultralytics_opencv_headless-8.4.0.dist-info}/METADATA +31 -39
  58. {ultralytics_opencv_headless-8.3.253.dist-info → ultralytics_opencv_headless-8.4.0.dist-info}/RECORD +62 -51
  59. {ultralytics_opencv_headless-8.3.253.dist-info → ultralytics_opencv_headless-8.4.0.dist-info}/WHEEL +0 -0
  60. {ultralytics_opencv_headless-8.3.253.dist-info → ultralytics_opencv_headless-8.4.0.dist-info}/entry_points.txt +0 -0
  61. {ultralytics_opencv_headless-8.3.253.dist-info → ultralytics_opencv_headless-8.4.0.dist-info}/licenses/LICENSE +0 -0
  62. {ultralytics_opencv_headless-8.3.253.dist-info → ultralytics_opencv_headless-8.4.0.dist-info}/top_level.txt +0 -0
ultralytics/nn/tasks.py CHANGED
@@ -20,6 +20,7 @@ from ultralytics.nn.modules import (
20
20
  C3TR,
21
21
  ELAN1,
22
22
  OBB,
23
+ OBB26,
23
24
  PSA,
24
25
  SPP,
25
26
  SPPELAN,
@@ -55,6 +56,7 @@ from ultralytics.nn.modules import (
55
56
  Index,
56
57
  LRPCHead,
57
58
  Pose,
59
+ Pose26,
58
60
  RepC3,
59
61
  RepConv,
60
62
  RepNCSPELAN4,
@@ -63,16 +65,19 @@ from ultralytics.nn.modules import (
63
65
  RTDETRDecoder,
64
66
  SCDown,
65
67
  Segment,
68
+ Segment26,
66
69
  TorchVision,
67
70
  WorldDetect,
68
71
  YOLOEDetect,
69
72
  YOLOESegment,
73
+ YOLOESegment26,
70
74
  v10Detect,
71
75
  )
72
76
  from ultralytics.utils import DEFAULT_CFG_DICT, LOGGER, YAML, colorstr, emojis
73
77
  from ultralytics.utils.checks import check_requirements, check_suffix, check_yaml
74
78
  from ultralytics.utils.loss import (
75
- E2EDetectLoss,
79
+ E2ELoss,
80
+ PoseLoss26,
76
81
  v8ClassificationLoss,
77
82
  v8DetectionLoss,
78
83
  v8OBBLoss,
@@ -223,7 +228,7 @@ class BaseModel(torch.nn.Module):
223
228
  Returns:
224
229
  (torch.nn.Module): The fused model is returned.
225
230
  """
226
- if not self.is_fused():
231
+ if True:
227
232
  for m in self.model.modules():
228
233
  if isinstance(m, (Conv, Conv2, DWConv)) and hasattr(m, "bn"):
229
234
  if isinstance(m, Conv2):
@@ -241,7 +246,7 @@ class BaseModel(torch.nn.Module):
241
246
  if isinstance(m, RepVGGDW):
242
247
  m.fuse()
243
248
  m.forward = m.forward_fuse
244
- if isinstance(m, v10Detect):
249
+ if isinstance(m, Detect) and getattr(m, "end2end", False):
245
250
  m.fuse() # remove one2many head
246
251
  self.info(verbose=verbose)
247
252
 
@@ -386,7 +391,6 @@ class DetectionModel(BaseModel):
386
391
  self.model, self.save = parse_model(deepcopy(self.yaml), ch=ch, verbose=verbose) # model, savelist
387
392
  self.names = {i: f"{i}" for i in range(self.yaml["nc"])} # default names dict
388
393
  self.inplace = self.yaml.get("inplace", True)
389
- self.end2end = getattr(self.model[-1], "end2end", False)
390
394
 
391
395
  # Build strides
392
396
  m = self.model[-1] # Detect()
@@ -396,9 +400,10 @@ class DetectionModel(BaseModel):
396
400
 
397
401
  def _forward(x):
398
402
  """Perform a forward pass through the model, handling different Detect subclass types accordingly."""
403
+ output = self.forward(x)
399
404
  if self.end2end:
400
- return self.forward(x)["one2many"]
401
- return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)
405
+ output = output["one2many"]
406
+ return output["feats"]
402
407
 
403
408
  self.model.eval() # Avoid changing batch statistics until training begins
404
409
  m.training = True # Setting it to True to properly return strides
@@ -415,6 +420,11 @@ class DetectionModel(BaseModel):
415
420
  self.info()
416
421
  LOGGER.info("")
417
422
 
423
+ @property
424
+ def end2end(self):
425
+ """Return whether the model uses end-to-end NMS-free detection."""
426
+ return getattr(self.model[-1], "end2end", False)
427
+
418
428
  def _predict_augment(self, x):
419
429
  """Perform augmentations on input image x and return augmented inference and train outputs.
420
430
 
@@ -481,7 +491,7 @@ class DetectionModel(BaseModel):
481
491
 
482
492
  def init_criterion(self):
483
493
  """Initialize the loss criterion for the DetectionModel."""
484
- return E2EDetectLoss(self) if getattr(self, "end2end", False) else v8DetectionLoss(self)
494
+ return E2ELoss(self) if getattr(self, "end2end", False) else v8DetectionLoss(self)
485
495
 
486
496
 
487
497
  class OBBModel(DetectionModel):
@@ -513,7 +523,7 @@ class OBBModel(DetectionModel):
513
523
 
514
524
  def init_criterion(self):
515
525
  """Initialize the loss criterion for the model."""
516
- return v8OBBLoss(self)
526
+ return E2ELoss(self, v8OBBLoss) if getattr(self, "end2end", False) else v8OBBLoss(self)
517
527
 
518
528
 
519
529
  class SegmentationModel(DetectionModel):
@@ -545,7 +555,7 @@ class SegmentationModel(DetectionModel):
545
555
 
546
556
  def init_criterion(self):
547
557
  """Initialize the loss criterion for the SegmentationModel."""
548
- return v8SegmentationLoss(self)
558
+ return E2ELoss(self, v8SegmentationLoss) if getattr(self, "end2end", False) else v8SegmentationLoss(self)
549
559
 
550
560
 
551
561
  class PoseModel(DetectionModel):
@@ -586,7 +596,7 @@ class PoseModel(DetectionModel):
586
596
 
587
597
  def init_criterion(self):
588
598
  """Initialize the loss criterion for the PoseModel."""
589
- return v8PoseLoss(self)
599
+ return E2ELoss(self, PoseLoss26) if getattr(self, "end2end", False) else v8PoseLoss(self)
590
600
 
591
601
 
592
602
  class ClassificationModel(BaseModel):
@@ -984,6 +994,7 @@ class YOLOEModel(DetectionModel):
984
994
  verbose (bool): Whether to display model information.
985
995
  """
986
996
  super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
997
+ self.text_model = self.yaml.get("text_model", "mobileclip:blt")
987
998
 
988
999
  @smart_inference_mode()
989
1000
  def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
@@ -1003,9 +1014,13 @@ class YOLOEModel(DetectionModel):
1003
1014
  device = next(self.model.parameters()).device
1004
1015
  if not getattr(self, "clip_model", None) and cache_clip_model:
1005
1016
  # For backwards compatibility of models lacking clip_model attribute
1006
- self.clip_model = build_text_model("mobileclip:blt", device=device)
1017
+ self.clip_model = build_text_model(getattr(self, "text_model", "mobileclip:blt"), device=device)
1007
1018
 
1008
- model = self.clip_model if cache_clip_model else build_text_model("mobileclip:blt", device=device)
1019
+ model = (
1020
+ self.clip_model
1021
+ if cache_clip_model
1022
+ else build_text_model(getattr(self, "text_model", "mobileclip:blt"), device=device)
1023
+ )
1009
1024
  text_token = model.tokenize(text)
1010
1025
  txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
1011
1026
  txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
@@ -1045,10 +1060,12 @@ class YOLOEModel(DetectionModel):
1045
1060
  device = next(self.parameters()).device
1046
1061
  self(torch.empty(1, 3, self.args["imgsz"], self.args["imgsz"]).to(device)) # warmup
1047
1062
 
1063
+ cv3 = getattr(head, "one2one_cv3", head.cv3)
1064
+ cv2 = getattr(head, "one2one_cv2", head.cv2)
1065
+
1048
1066
  # re-parameterization for prompt-free model
1049
1067
  self.model[-1].lrpc = nn.ModuleList(
1050
- LRPCHead(cls, pf[-1], loc[-1], enabled=i != 2)
1051
- for i, (cls, pf, loc) in enumerate(zip(vocab, head.cv3, head.cv2))
1068
+ LRPCHead(cls, pf[-1], loc[-1], enabled=i != 2) for i, (cls, pf, loc) in enumerate(zip(vocab, cv3, cv2))
1052
1069
  )
1053
1070
  for loc_head, cls_head in zip(head.cv2, head.cv3):
1054
1071
  assert isinstance(loc_head, nn.Sequential)
@@ -1077,8 +1094,9 @@ class YOLOEModel(DetectionModel):
1077
1094
  device = next(self.model.parameters()).device
1078
1095
  head.fuse(self.pe.to(device)) # fuse prompt embeddings to classify head
1079
1096
 
1097
+ cv3 = getattr(head, "one2one_cv3", head.cv3)
1080
1098
  vocab = nn.ModuleList()
1081
- for cls_head in head.cv3:
1099
+ for cls_head in cv3:
1082
1100
  assert isinstance(cls_head, nn.Sequential)
1083
1101
  vocab.append(cls_head[-1])
1084
1102
  return vocab
@@ -1155,9 +1173,8 @@ class YOLOEModel(DetectionModel):
1155
1173
  cls_pe = self.get_cls_pe(m.get_tpe(tpe), vpe).to(device=x[0].device, dtype=x[0].dtype)
1156
1174
  if cls_pe.shape[0] != b or m.export:
1157
1175
  cls_pe = cls_pe.expand(b, -1, -1)
1158
- x = m(x, cls_pe)
1159
- else:
1160
- x = m(x) # run
1176
+ x.append(cls_pe) # adding cls embedding
1177
+ x = m(x) # run
1161
1178
 
1162
1179
  y.append(x if m.i in self.save else None) # save output
1163
1180
  if visualize:
@@ -1179,10 +1196,17 @@ class YOLOEModel(DetectionModel):
1179
1196
  from ultralytics.utils.loss import TVPDetectLoss
1180
1197
 
1181
1198
  visual_prompt = batch.get("visuals", None) is not None # TODO
1182
- self.criterion = TVPDetectLoss(self) if visual_prompt else self.init_criterion()
1183
-
1199
+ self.criterion = (
1200
+ (E2ELoss(self, TVPDetectLoss) if getattr(self, "end2end", False) else TVPDetectLoss(self))
1201
+ if visual_prompt
1202
+ else self.init_criterion()
1203
+ )
1184
1204
  if preds is None:
1185
- preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
1205
+ preds = self.forward(
1206
+ batch["img"],
1207
+ tpe=None if "visuals" in batch else batch.get("txt_feats", None),
1208
+ vpe=batch.get("visuals", None),
1209
+ )
1186
1210
  return self.criterion(preds, batch)
1187
1211
 
1188
1212
 
@@ -1224,7 +1248,11 @@ class YOLOESegModel(YOLOEModel, SegmentationModel):
1224
1248
  from ultralytics.utils.loss import TVPSegmentLoss
1225
1249
 
1226
1250
  visual_prompt = batch.get("visuals", None) is not None # TODO
1227
- self.criterion = TVPSegmentLoss(self) if visual_prompt else self.init_criterion()
1251
+ self.criterion = (
1252
+ (E2ELoss(self, TVPSegmentLoss) if getattr(self, "end2end", False) else TVPSegmentLoss(self))
1253
+ if visual_prompt
1254
+ else self.init_criterion()
1255
+ )
1228
1256
 
1229
1257
  if preds is None:
1230
1258
  preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
@@ -1499,7 +1527,8 @@ def parse_model(d, ch, verbose=True):
1499
1527
  # Args
1500
1528
  legacy = True # backward compatibility for v3/v5/v8/v9 models
1501
1529
  max_channels = float("inf")
1502
- nc, act, scales = (d.get(x) for x in ("nc", "activation", "scales"))
1530
+ nc, act, scales, end2end = (d.get(x) for x in ("nc", "activation", "scales", "end2end"))
1531
+ reg_max = d.get("reg_max", 16)
1503
1532
  depth, width, kpt_shape = (d.get(x, 1.0) for x in ("depth_multiple", "width_multiple", "kpt_shape"))
1504
1533
  scale = d.get("scale")
1505
1534
  if scales:
@@ -1624,13 +1653,29 @@ def parse_model(d, ch, verbose=True):
1624
1653
  elif m is Concat:
1625
1654
  c2 = sum(ch[x] for x in f)
1626
1655
  elif m in frozenset(
1627
- {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}
1656
+ {
1657
+ Detect,
1658
+ WorldDetect,
1659
+ YOLOEDetect,
1660
+ Segment,
1661
+ Segment26,
1662
+ YOLOESegment,
1663
+ YOLOESegment26,
1664
+ Pose,
1665
+ Pose26,
1666
+ OBB,
1667
+ OBB26,
1668
+ }
1628
1669
  ):
1629
- args.append([ch[x] for x in f])
1630
- if m is Segment or m is YOLOESegment:
1670
+ args.extend([reg_max, end2end, [ch[x] for x in f]])
1671
+ if m is Segment or m is YOLOESegment or m is Segment26 or m is YOLOESegment26:
1631
1672
  args[2] = make_divisible(min(args[2], max_channels) * width, 8)
1632
- if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}:
1673
+ if m in {Detect, YOLOEDetect, Segment, Segment26, YOLOESegment, YOLOESegment26, Pose, Pose26, OBB, OBB26}:
1633
1674
  m.legacy = legacy
1675
+ elif m is v10Detect:
1676
+ args.append([ch[x] for x in f])
1677
+ elif m is ImagePoolingAttn:
1678
+ args.insert(1, [ch[x] for x in f]) # channels as second arg
1634
1679
  elif m is RTDETRDecoder: # special case, channels arg must be passed in index 1
1635
1680
  args.insert(1, [ch[x] for x in f])
1636
1681
  elif m is CBLinear:
@@ -1717,9 +1762,9 @@ def guess_model_task(model):
1717
1762
  return "detect"
1718
1763
  if "segment" in m:
1719
1764
  return "segment"
1720
- if m == "pose":
1765
+ if "pose" in m:
1721
1766
  return "pose"
1722
- if m == "obb":
1767
+ if "obb" in m:
1723
1768
  return "obb"
1724
1769
 
1725
1770
  # Guess from model cfg
@@ -275,7 +275,7 @@ class MobileCLIPTS(TextModel):
275
275
  >>> features = text_encoder.encode_text(tokens)
276
276
  """
277
277
 
278
- def __init__(self, device: torch.device):
278
+ def __init__(self, device: torch.device, weight: str = "mobileclip_blt.ts"):
279
279
  """Initialize the MobileCLIP TorchScript text encoder.
280
280
 
281
281
  This class implements the TextModel interface using Apple's MobileCLIP model in TorchScript format for efficient
@@ -283,11 +283,12 @@ class MobileCLIPTS(TextModel):
283
283
 
284
284
  Args:
285
285
  device (torch.device): Device to load the model on.
286
+ weight (str): Path to the TorchScript model weights.
286
287
  """
287
288
  super().__init__()
288
289
  from ultralytics.utils.downloads import attempt_download_asset
289
290
 
290
- self.encoder = torch.jit.load(attempt_download_asset("mobileclip_blt.ts"), map_location=device)
291
+ self.encoder = torch.jit.load(attempt_download_asset(weight), map_location=device)
291
292
  self.tokenizer = clip.clip.tokenize
292
293
  self.device = device
293
294
 
@@ -352,5 +353,7 @@ def build_text_model(variant: str, device: torch.device = None) -> TextModel:
352
353
  return CLIP(size, device)
353
354
  elif base == "mobileclip":
354
355
  return MobileCLIPTS(device)
356
+ elif base == "mobileclip2":
357
+ return MobileCLIPTS(device, weight="mobileclip2_b.ts")
355
358
  else:
356
359
  raise ValueError(f"Unrecognized base model: '{base}'. Supported base models: 'clip', 'mobileclip'.")
@@ -0,0 +1,5 @@
1
+ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
2
+
3
+ from .muon import Muon, MuSGD
4
+
5
+ __all__ = ["MuSGD", "Muon"]
@@ -0,0 +1,338 @@
1
+ # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
2
+
3
+ from __future__ import annotations
4
+
5
+ import torch
6
+ from torch import optim
7
+
8
+
9
+ def zeropower_via_newtonschulz5(G: torch.Tensor, eps: float = 1e-7) -> torch.Tensor:
10
+ """Compute the zeroth power / orthogonalization of matrix G using Newton-Schulz iteration.
11
+
12
+ This function implements a quintic Newton-Schulz iteration to compute an approximate orthogonalization of the input
13
+ matrix G. The iteration coefficients are optimized to maximize convergence slope at zero, producing a result similar
14
+ to UV^T from SVD, where USV^T = G, but with relaxed convergence guarantees that empirically work well for
15
+ optimization purposes.
16
+
17
+ Args:
18
+ G (torch.Tensor): Input 2D tensor/matrix to orthogonalize.
19
+ eps (float, optional): Small epsilon value added to norm for numerical stability. Default: 1e-7.
20
+
21
+ Returns:
22
+ (torch.Tensor): Orthogonalized matrix with same shape as input G.
23
+
24
+ Examples:
25
+ >>> G = torch.randn(128, 64)
26
+ >>> G_ortho = zeropower_via_newtonschulz5(G)
27
+ >>> print(G_ortho.shape)
28
+ torch.Size([128, 64])
29
+
30
+ Notes:
31
+ - Uses bfloat16 precision for computation.
32
+ - Performs exactly 5 Newton-Schulz iteration steps with fixed coefficients.
33
+ - Automatically transposes for efficiency when rows > columns.
34
+ - Output approximates US'V^T where S' has diagonal entries ~ Uniform(0.5, 1.5).
35
+ - Does not produce exact UV^T but works well empirically for neural network optimization.
36
+ """
37
+ assert len(G.shape) == 2
38
+ X = G.bfloat16()
39
+ X /= X.norm() + eps # ensure top singular value <= 1
40
+ if G.size(0) > G.size(1):
41
+ X = X.T
42
+ for a, b, c in [ # num_steps fixed at 5
43
+ # original params
44
+ (3.4445, -4.7750, 2.0315),
45
+ (3.4445, -4.7750, 2.0315),
46
+ (3.4445, -4.7750, 2.0315),
47
+ (3.4445, -4.7750, 2.0315),
48
+ (3.4445, -4.7750, 2.0315),
49
+ ]:
50
+ # for _ in range(steps):
51
+ A = X @ X.T
52
+ B = b * A + c * A @ A
53
+ X = a * X + B @ X
54
+ if G.size(0) > G.size(1):
55
+ X = X.T
56
+ return X
57
+
58
+
59
+ def muon_update(grad: torch.Tensor, momentum: torch.Tensor, beta: float = 0.95, nesterov: bool = True) -> torch.Tensor:
60
+ """Compute Muon optimizer update with momentum and orthogonalization.
61
+
62
+ This function applies momentum to the gradient, optionally uses Nesterov acceleration, and then orthogonalizes the
63
+ update using Newton-Schulz iterations. For convolutional filters (4D tensors), it reshapes before orthogonalization
64
+ and scales the final update based on parameter dimensions.
65
+
66
+ Args:
67
+ grad (torch.Tensor): Gradient tensor to update. Can be 2D or 4D (for conv filters).
68
+ momentum (torch.Tensor): Momentum buffer tensor, modified in-place via lerp.
69
+ beta (float, optional): Momentum coefficient for exponential moving average. Default: 0.95.
70
+ nesterov (bool, optional): Whether to use Nesterov momentum acceleration. Default: True.
71
+
72
+ Returns:
73
+ (torch.Tensor): Orthogonalized update tensor with same shape as input grad. For 4D inputs, returns reshaped
74
+ result matching original dimensions.
75
+
76
+ Examples:
77
+ >>> grad = torch.randn(64, 128)
78
+ >>> momentum = torch.zeros_like(grad)
79
+ >>> update = muon_update(grad, momentum, beta=0.95, nesterov=True)
80
+ >>> print(update.shape)
81
+ torch.Size([64, 128])
82
+
83
+ Notes:
84
+ - Momentum buffer is updated in-place: momentum = beta * momentum + (1-beta) * grad.
85
+ - With Nesterov: update = beta * momentum + (1-beta) * grad.
86
+ - Without Nesterov: update = momentum.
87
+ - 4D tensors (conv filters) are reshaped to 2D as (channels, height*width*depth) for orthogonalization.
88
+ - Final update is scaled by sqrt(max(dim[-2], dim[-1])) to account for parameter dimensions.
89
+ """
90
+ momentum.lerp_(grad, 1 - beta)
91
+ update = grad.lerp(momentum, beta) if nesterov else momentum
92
+ if update.ndim == 4: # for the case of conv filters
93
+ update = update.view(len(update), -1)
94
+ update = zeropower_via_newtonschulz5(update)
95
+ update *= max(1, grad.size(-2) / grad.size(-1)) ** 0.5
96
+ return update
97
+
98
+
99
+ class MuSGD(optim.Optimizer):
100
+ """Hybrid optimizer combining Muon and SGD updates for neural network training.
101
+
102
+ This optimizer implements a combination of Muon (a momentum-based optimizer with orthogonalization via Newton-Schulz
103
+ iterations) and standard SGD with momentum. It allows different parameter groups to use either the hybrid Muon+SGD
104
+ approach or pure SGD.
105
+
106
+ Args:
107
+ param_groups (list): List of parameter groups with their optimization settings.
108
+ muon (float, optional): Weight factor for Muon updates in hybrid mode. Default: 0.5.
109
+ sgd (float, optional): Weight factor for SGD updates in hybrid mode. Default: 0.5.
110
+
111
+ Attributes:
112
+ muon (float): Scaling factor applied to Muon learning rate.
113
+ sgd (float): Scaling factor applied to SGD learning rate in hybrid mode.
114
+
115
+ Examples:
116
+ >>> param_groups = [
117
+ ... {
118
+ ... "params": model.conv_params,
119
+ ... "lr": 0.02,
120
+ ... "use_muon": True,
121
+ ... "momentum": 0.95,
122
+ ... "nesterov": True,
123
+ ... "weight_decay": 0.01,
124
+ ... },
125
+ ... {
126
+ ... "params": model.other_params,
127
+ ... "lr": 0.01,
128
+ ... "use_muon": False,
129
+ ... "momentum": 0.9,
130
+ ... "nesterov": False,
131
+ ... "weight_decay": 0,
132
+ ... },
133
+ ... ]
134
+ >>> optimizer = MuSGD(param_groups, muon=0.5, sgd=0.5)
135
+ >>> loss = model(data)
136
+ >>> loss.backward()
137
+ >>> optimizer.step()
138
+
139
+ Notes:
140
+ - Parameter groups with 'use_muon': True will receive both Muon and SGD updates.
141
+ - Parameter groups with 'use_muon': False will receive only SGD updates.
142
+ - The Muon update uses orthogonalization which works best for 2D+ parameter tensors.
143
+ """
144
+
145
+ def __init__(
146
+ self,
147
+ params,
148
+ lr: float = 1e-3,
149
+ momentum: float = 0.0,
150
+ weight_decay: float = 0.0,
151
+ nesterov: bool = False,
152
+ use_muon: bool = False,
153
+ muon: float = 0.5,
154
+ sgd: float = 0.5,
155
+ ):
156
+ """Initialize MuSGD optimizer with hybrid Muon and SGD capabilities.
157
+
158
+ Args:
159
+ params: Iterable of parameters to optimize or dicts defining parameter groups.
160
+ lr (float): Learning rate.
161
+ momentum (float): Momentum factor for SGD.
162
+ weight_decay (float): Weight decay (L2 penalty).
163
+ nesterov (bool): Whether to use Nesterov momentum.
164
+ use_muon (bool): Whether to enable Muon updates.
165
+ muon (float): Scaling factor for Muon component.
166
+ sgd (float): Scaling factor for SGD component.
167
+ """
168
+ defaults = dict(
169
+ lr=lr,
170
+ momentum=momentum,
171
+ weight_decay=weight_decay,
172
+ nesterov=nesterov,
173
+ use_muon=use_muon,
174
+ )
175
+ super().__init__(params, defaults)
176
+ self.muon = muon
177
+ self.sgd = sgd
178
+
179
+ @torch.no_grad()
180
+ def step(self, closure=None):
181
+ """Perform a single optimization step.
182
+
183
+ Applies either hybrid Muon+SGD updates or pure SGD updates depending on the
184
+ 'use_muon' flag in each parameter group. For Muon-enabled groups, parameters
185
+ receive both an orthogonalized Muon update and a standard SGD momentum update.
186
+
187
+ Args:
188
+ closure (Callable, optional): A closure that reevaluates the model
189
+ and returns the loss. Default: None.
190
+
191
+ Returns:
192
+ (torch.Tensor | None): The loss value if closure is provided, otherwise None.
193
+
194
+ Notes:
195
+ - Parameters with None gradients are assigned zero gradients for synchronization.
196
+ - Muon updates use Newton-Schulz orthogonalization and work best on 2D+ tensors.
197
+ - Weight decay is applied only to the SGD component in hybrid mode.
198
+ """
199
+ loss = None
200
+ if closure is not None:
201
+ with torch.enable_grad():
202
+ loss = closure()
203
+
204
+ for group in self.param_groups:
205
+ # Muon
206
+ if group["use_muon"]:
207
+ # generate weight updates in distributed fashion
208
+ for p in group["params"]:
209
+ lr = group["lr"]
210
+ if p.grad is None:
211
+ continue
212
+ grad = p.grad
213
+ state = self.state[p]
214
+ if len(state) == 0:
215
+ state["momentum_buffer"] = torch.zeros_like(p)
216
+ state["momentum_buffer_SGD"] = torch.zeros_like(p)
217
+
218
+ update = muon_update(
219
+ grad, state["momentum_buffer"], beta=group["momentum"], nesterov=group["nesterov"]
220
+ )
221
+ p.add_(update.reshape(p.shape), alpha=-(lr * self.muon))
222
+
223
+ # SGD update
224
+ if group["weight_decay"] != 0:
225
+ grad = grad.add(p, alpha=group["weight_decay"])
226
+ state["momentum_buffer_SGD"].mul_(group["momentum"]).add_(grad)
227
+ sgd_update = (
228
+ grad.add(state["momentum_buffer_SGD"], alpha=group["momentum"])
229
+ if group["nesterov"]
230
+ else state["momentum_buffer_SGD"]
231
+ )
232
+ p.add_(sgd_update, alpha=-(lr * self.sgd))
233
+ else: # SGD
234
+ for p in group["params"]:
235
+ lr = group["lr"]
236
+ if p.grad is None:
237
+ continue
238
+ grad = p.grad
239
+ if group["weight_decay"] != 0:
240
+ grad = grad.add(p, alpha=group["weight_decay"])
241
+ state = self.state[p]
242
+ if len(state) == 0:
243
+ state["momentum_buffer"] = torch.zeros_like(p)
244
+ state["momentum_buffer"].mul_(group["momentum"]).add_(grad)
245
+ update = (
246
+ grad.add(state["momentum_buffer"], alpha=group["momentum"])
247
+ if group["nesterov"]
248
+ else state["momentum_buffer"]
249
+ )
250
+ p.add_(update, alpha=-lr)
251
+ return loss
252
+
253
+
254
+ class Muon(optim.Optimizer):
255
+ """Muon optimizer for usage in non-distributed settings.
256
+
257
+ This optimizer implements the Muon algorithm, which combines momentum-based updates with orthogonalization via
258
+ Newton-Schulz iterations. It applies weight decay and learning rate scaling to parameter updates.
259
+
260
+ Args:
261
+ params (iterable): Iterable of parameters to optimize or dicts defining parameter groups.
262
+ lr (float, optional): Learning rate. Default: 0.02.
263
+ weight_decay (float, optional): Weight decay (L2 penalty) coefficient. Default: 0.
264
+ momentum (float, optional): Momentum coefficient for exponential moving average. Default: 0.95.
265
+
266
+ Attributes:
267
+ param_groups (list): List of parameter groups with their optimization settings.
268
+ state (dict): Dictionary containing optimizer state for each parameter.
269
+
270
+ Examples:
271
+ >>> model = YourModel()
272
+ >>> optimizer = Muon(model.parameters(), lr=0.02, weight_decay=0.01, momentum=0.95)
273
+ >>> loss = model(data)
274
+ >>> loss.backward()
275
+ >>> optimizer.step()
276
+
277
+ Notes:
278
+ - Designed for non-distributed training environments.
279
+ - Uses Muon updates with orthogonalization for all parameters.
280
+ - Weight decay is applied multiplicatively before parameter update.
281
+ - Parameters with None gradients are assigned zero gradients for synchronization.
282
+ """
283
+
284
+ def __init__(self, params, lr: float = 0.02, weight_decay: float = 0, momentum: float = 0.95):
285
+ """Initialize Muon optimizer with orthogonalization-based updates.
286
+
287
+ Args:
288
+ params: Iterable of parameters to optimize or dicts defining parameter groups.
289
+ lr (float): Learning rate.
290
+ weight_decay (float): Weight decay factor applied multiplicatively.
291
+ momentum (float): Momentum factor for gradient accumulation.
292
+ """
293
+ defaults = dict(lr=lr, weight_decay=weight_decay, momentum=momentum)
294
+ super().__init__(params, defaults)
295
+
296
+ @torch.no_grad()
297
+ def step(self, closure=None):
298
+ """Perform a single optimization step.
299
+
300
+ Applies Muon updates to all parameters, incorporating momentum and orthogonalization.
301
+ Weight decay is applied multiplicatively before the parameter update.
302
+
303
+ Args:
304
+ closure (Callable[[], torch.Tensor] | None, optional): A closure that reevaluates the model
305
+ and returns the loss. Default: None.
306
+
307
+ Returns:
308
+ (torch.Tensor | None): The loss value if closure is provided, otherwise None.
309
+
310
+ Examples:
311
+ >>> optimizer = Muon(model.parameters())
312
+ >>> loss = model(inputs)
313
+ >>> loss.backward()
314
+ >>> optimizer.step()
315
+
316
+ Notes:
317
+ - Parameters with None gradients are assigned zero gradients for synchronization.
318
+ - Weight decay is applied as: p *= (1 - lr * weight_decay).
319
+ - Muon update uses Newton-Schulz orthogonalization and works best on 2D+ tensors.
320
+ """
321
+ loss = None
322
+ if closure is not None:
323
+ with torch.enable_grad():
324
+ loss = closure()
325
+
326
+ for group in self.param_groups:
327
+ for p in group["params"]:
328
+ if p.grad is None:
329
+ # continue
330
+ p.grad = torch.zeros_like(p) # Force synchronization
331
+ state = self.state[p]
332
+ if len(state) == 0:
333
+ state["momentum_buffer"] = torch.zeros_like(p)
334
+ update = muon_update(p.grad, state["momentum_buffer"], beta=group["momentum"])
335
+ p.mul_(1 - group["lr"] * group["weight_decay"])
336
+ p.add_(update.reshape(p.shape), alpha=-group["lr"])
337
+
338
+ return loss
@@ -141,6 +141,7 @@ def benchmark(
141
141
  assert not isinstance(model, YOLOWorld), "YOLOWorldv2 MNN exports not supported yet"
142
142
  if format == "ncnn":
143
143
  assert not isinstance(model, YOLOWorld), "YOLOWorldv2 NCNN exports not supported yet"
144
+ assert not ARM64, "NCNN not supported on ARM64" # https://github.com/Tencent/ncnn/issues/6509
144
145
  if format == "imx":
145
146
  assert not is_end2end
146
147
  assert not isinstance(model, YOLOWorld), "YOLOWorldv2 IMX exports not supported"