ultralytics 8.3.98__py3-none-any.whl → 8.3.100__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tests/test_python.py +56 -0
- ultralytics/__init__.py +3 -2
- ultralytics/cfg/models/11/yoloe-11-seg.yaml +48 -0
- ultralytics/cfg/models/11/yoloe-11.yaml +48 -0
- ultralytics/cfg/models/v8/yoloe-v8-seg.yaml +45 -0
- ultralytics/cfg/models/v8/yoloe-v8.yaml +45 -0
- ultralytics/data/augment.py +101 -5
- ultralytics/data/dataset.py +165 -12
- ultralytics/engine/exporter.py +5 -4
- ultralytics/engine/trainer.py +16 -7
- ultralytics/models/__init__.py +2 -2
- ultralytics/models/yolo/__init__.py +3 -3
- ultralytics/models/yolo/detect/val.py +6 -1
- ultralytics/models/yolo/model.py +183 -3
- ultralytics/models/yolo/segment/val.py +43 -16
- ultralytics/models/yolo/yoloe/__init__.py +21 -0
- ultralytics/models/yolo/yoloe/predict.py +170 -0
- ultralytics/models/yolo/yoloe/train.py +355 -0
- ultralytics/models/yolo/yoloe/train_seg.py +141 -0
- ultralytics/models/yolo/yoloe/val.py +187 -0
- ultralytics/nn/autobackend.py +17 -7
- ultralytics/nn/modules/__init__.py +18 -1
- ultralytics/nn/modules/block.py +17 -1
- ultralytics/nn/modules/head.py +359 -22
- ultralytics/nn/tasks.py +276 -10
- ultralytics/nn/text_model.py +193 -0
- ultralytics/utils/benchmarks.py +1 -0
- ultralytics/utils/callbacks/comet.py +3 -6
- ultralytics/utils/downloads.py +6 -2
- ultralytics/utils/loss.py +67 -6
- ultralytics/utils/plotting.py +1 -1
- ultralytics/utils/tal.py +1 -1
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/METADATA +10 -10
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/RECORD +38 -28
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/WHEEL +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/licenses/LICENSE +0 -0
- {ultralytics-8.3.98.dist-info → ultralytics-8.3.100.dist-info}/top_level.txt +0 -0
ultralytics/nn/tasks.py
CHANGED
@@ -8,7 +8,9 @@ from copy import deepcopy
 from pathlib import Path
 
 import torch
+import torch.nn as nn
 
+from ultralytics.nn.autobackend import check_class_names
 from ultralytics.nn.modules import (
     AIFI,
     C1,
@@ -51,6 +53,7 @@ from ultralytics.nn.modules import (
     HGStem,
     ImagePoolingAttn,
     Index,
+    LRPCHead,
     Pose,
     RepC3,
     RepConv,
@@ -62,6 +65,8 @@ from ultralytics.nn.modules import (
     Segment,
     TorchVision,
     WorldDetect,
+    YOLOEDetect,
+    YOLOESegment,
     v10Detect,
 )
 from ultralytics.utils import DEFAULT_CFG_DICT, DEFAULT_CFG_KEYS, LOGGER, colorstr, emojis, yaml_load
@@ -83,6 +88,7 @@ from ultralytics.utils.torch_utils import (
     intersect_dicts,
     model_info,
     scale_img,
+    smart_inference_mode,
     time_sync,
 )
 
@@ -255,7 +261,9 @@ class BaseModel(torch.nn.Module):
         """
         self = super()._apply(fn)
         m = self.model[-1]  # Detect()
-        if isinstance(
+        if isinstance(
+            m, Detect
+        ):  # includes all Detect subclasses like Segment, Pose, OBB, WorldDetect, YOLOEDetect, YOLOESegment
             m.stride = fn(m.stride)
             m.anchors = fn(m.anchors)
             m.strides = fn(m.strides)
@@ -329,7 +337,7 @@ class DetectionModel(BaseModel):
 
         # Build strides
         m = self.model[-1]  # Detect()
-        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB,
+        if isinstance(m, Detect):  # includes all Detect subclasses like Segment, Pose, OBB, YOLOEDetect, YOLOESegment
             s = 256  # 2x min stride
             m.inplace = self.inplace
 
@@ -337,7 +345,7 @@ class DetectionModel(BaseModel):
                 """Perform a forward pass through the model, handling different Detect subclass types accordingly."""
                 if self.end2end:
                     return self.forward(x)["one2many"]
-                return self.forward(x)[0] if isinstance(m, (Segment, Pose, OBB)) else self.forward(x)
+                return self.forward(x)[0] if isinstance(m, (Segment, YOLOESegment, Pose, OBB)) else self.forward(x)
 
             m.stride = torch.tensor([s / x.shape[-2] for x in _forward(torch.zeros(1, ch, s, s))])  # forward
             self.stride = m.stride
@@ -778,6 +786,260 @@ class WorldModel(DetectionModel):
         return self.criterion(preds, batch)
 
 
+class YOLOEModel(DetectionModel):
+    """YOLOE detection model."""
+
+    def __init__(self, cfg="yoloe-v8s.yaml", ch=3, nc=None, verbose=True):
+        """
+        Initialize YOLOE model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
+        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
+
+    @smart_inference_mode()
+    def get_text_pe(self, text, batch=80, cache_clip_model=False, without_reprta=False):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            text (List[str]): List of class names.
+            batch (int): Batch size for processing text tokens.
+            cache_clip_model (bool): Whether to cache the CLIP model.
+            without_reprta (bool): Whether to return text embeddings cooperated with reprta module.
+
+        Returns:
+            (torch.Tensor): Text positional embeddings.
+        """
+        from ultralytics.nn.text_model import build_text_model
+
+        device = next(self.model.parameters()).device
+        if not getattr(self, "clip_model", None) and cache_clip_model:
+            # For backwards compatibility of models lacking clip_model attribute
+            self.clip_model = build_text_model("mobileclip:blt", device=device)
+
+        model = self.clip_model if cache_clip_model else build_text_model("mobileclip:blt", device=device)
+        text_token = model.tokenize(text)
+        txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
+        txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
+        txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
+        if without_reprta:
+            return txt_feats
+
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+        return head.get_tpe(txt_feats)  # run axuiliary text head
+
+    @smart_inference_mode()
+    def get_visual_pe(self, img, visual):
+        """
+        Get visual embeddings.
+
+        Args:
+            img (torch.Tensor): Input image tensor.
+            visual (torch.Tensor): Visual features.
+
+        Returns:
+            (torch.Tensor): Visual positional embeddings.
+        """
+        return self(img, vpe=visual, return_vpe=True)
+
+    def set_vocab(self, vocab, names):
+        """
+        Set vocabulary for the prompt-free model.
+
+        Args:
+            vocab (nn.ModuleList): List of vocabulary items.
+            names (List[str]): List of class names.
+        """
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+
+        # Cache anchors for head
+        device = next(self.parameters()).device
+        self(torch.empty(1, 3, self.args["imgsz"], self.args["imgsz"]).to(device))  # warmup
+
+        # re-parameterization for prompt-free model
+        self.model[-1].lrpc = nn.ModuleList(
+            LRPCHead(cls, pf[-1], loc[-1], enabled=i != 2)
+            for i, (cls, pf, loc) in enumerate(zip(vocab, head.cv3, head.cv2))
+        )
+        for loc_head, cls_head in zip(head.cv2, head.cv3):
+            assert isinstance(loc_head, nn.Sequential)
+            assert isinstance(cls_head, nn.Sequential)
+            del loc_head[-1]
+            del cls_head[-1]
+        self.model[-1].nc = len(names)
+        self.names = check_class_names(names)
+
+    def get_vocab(self, names):
+        """
+        Get fused vocabulary layer from the model.
+
+        Args:
+            names (list): List of class names.
+
+        Returns:
+            (nn.ModuleList): List of vocabulary modules.
+        """
+        assert not self.training
+        head = self.model[-1]
+        assert isinstance(head, YOLOEDetect)
+        assert not head.is_fused
+
+        tpe = self.get_text_pe(names)
+        self.set_classes(names, tpe)
+        device = next(self.model.parameters()).device
+        head.fuse(self.pe.to(device))  # fuse prompt embeddings to classify head
+
+        vocab = nn.ModuleList()
+        for cls_head in head.cv3:
+            assert isinstance(cls_head, nn.Sequential)
+            vocab.append(cls_head[-1])
+        return vocab
+
+    def set_classes(self, names, embeddings):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            names (List[str]): List of class names.
+            embeddings (torch.Tensor): Embeddings tensor.
+        """
+        assert embeddings.ndim == 3
+        self.pe = embeddings
+        self.model[-1].nc = len(names)
+        self.names = check_class_names(names)
+
+    def get_cls_pe(self, tpe, vpe):
+        """
+        Get class positional embeddings.
+
+        Args:
+            tpe (torch.Tensor, optional): Text positional embeddings.
+            vpe (torch.Tensor, optional): Visual positional embeddings.
+
+        Returns:
+            (torch.Tensor): Class positional embeddings.
+        """
+        all_pe = []
+        if tpe is not None:
+            assert tpe.ndim == 3
+            all_pe.append(tpe)
+        if vpe is not None:
+            assert vpe.ndim == 3
+            all_pe.append(vpe)
+        if not all_pe:
+            all_pe.append(getattr(self, "pe", torch.zeros(1, 80, 512)))
+        return torch.cat(all_pe, dim=1)
+
+    def predict(
+        self, x, profile=False, visualize=False, tpe=None, augment=False, embed=None, vpe=None, return_vpe=False
+    ):
+        """
+        Perform a forward pass through the model.
+
+        Args:
+            x (torch.Tensor): The input tensor.
+            profile (bool): If True, profile the computation time for each layer.
+            visualize (bool): If True, save feature maps for visualization.
+            tpe (torch.Tensor, optional): Text positional embeddings.
+            augment (bool): If True, perform data augmentation during inference.
+            embed (list, optional): A list of feature vectors/embeddings to return.
+            vpe (torch.Tensor, optional): Visual positional embeddings.
+            return_vpe (bool): If True, return visual positional embeddings.
+
+        Returns:
+            (torch.Tensor): Model's output tensor.
+        """
+        y, dt, embeddings = [], [], []  # outputs
+        b = x.shape[0]
+        for m in self.model:  # except the head part
+            if m.f != -1:  # if not from previous layer
+                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
+            if profile:
+                self._profile_one_layer(m, x, dt)
+            if isinstance(m, YOLOEDetect):
+                vpe = m.get_vpe(x, vpe) if vpe is not None else None
+                if return_vpe:
+                    assert vpe is not None
+                    assert not self.training
+                    return vpe
+                cls_pe = self.get_cls_pe(m.get_tpe(tpe), vpe).to(device=x[0].device, dtype=x[0].dtype)
+                if len(cls_pe) != b:
+                    cls_pe = cls_pe.repeat(b, 1, 1)
+                x = m(x, cls_pe)
+            else:
+                x = m(x)  # run
+
+            y.append(x if m.i in self.save else None)  # save output
+            if visualize:
+                feature_visualization(x, m.type, m.i, save_dir=visualize)
+            if embed and m.i in embed:
+                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
+                if m.i == max(embed):
+                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
+        return x
+
+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            from ultralytics.utils.loss import TVPDetectLoss
+
+            visual_prompt = batch.get("visuals", None) is not None  # TODO
+            self.criterion = TVPDetectLoss(self) if visual_prompt else self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
+        return self.criterion(preds, batch)
+
+
+class YOLOESegModel(YOLOEModel, SegmentationModel):
+    """YOLOE segmentation model."""
+
+    def __init__(self, cfg="yoloe-v8s-seg.yaml", ch=3, nc=None, verbose=True):
+        """
+        Initialize YOLOE segmentation model with given config and parameters.
+
+        Args:
+            cfg (str | dict): Model configuration file path or dictionary.
+            ch (int): Number of input channels.
+            nc (int, optional): Number of classes.
+            verbose (bool): Whether to display model information.
+        """
+        super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)
+
+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor], optional): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            from ultralytics.utils.loss import TVPSegmentLoss
+
+            visual_prompt = batch.get("visuals", None) is not None  # TODO
+            self.criterion = TVPSegmentLoss(self) if visual_prompt else self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], tpe=batch.get("txt_feats", None), vpe=batch.get("visuals", None))
+        return self.criterion(preds, batch)
+
+
 class Ensemble(torch.nn.ModuleList):
     """Ensemble of models."""
 
@@ -1185,6 +1447,8 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
                 legacy = False
                 if scale in "lx":  # for L/X sizes
                     args.extend((True, 1.2))
+            if m is C2fCIB:
+                legacy = False
         elif m is AIFI:
             args = [ch[f], *args]
         elif m in frozenset({HGStem, HGBlock}):
@@ -1199,11 +1463,13 @@ def parse_model(d, ch, verbose=True):  # model_dict, input_channels(3)
             args = [ch[f]]
         elif m is Concat:
            c2 = sum(ch[x] for x in f)
-        elif m in frozenset(
+        elif m in frozenset(
+            {Detect, WorldDetect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB, ImagePoolingAttn, v10Detect}
+        ):
             args.append([ch[x] for x in f])
-            if m is Segment:
+            if m is Segment or m is YOLOESegment:
                 args[2] = make_divisible(min(args[2], max_channels) * width, 8)
-            if m in {Detect, Segment, Pose, OBB}:
+            if m in {Detect, YOLOEDetect, Segment, YOLOESegment, Pose, OBB}:
                 m.legacy = legacy
         elif m is RTDETRDecoder:  # special case, channels arg must be passed in index 1
             args.insert(1, [ch[x] for x in f])
@@ -1269,7 +1535,7 @@ def guess_model_scale(model_path):
         (str): The size character of the model's scale (n, s, m, l, or x).
     """
     try:
-        return re.search(r"yolo[v]?\d+([nslmx])", Path(model_path).stem).group(
+        return re.search(r"yolo(e-)?[v]?\d+([nslmx])", Path(model_path).stem).group(2)  # noqa
     except AttributeError:
         return ""
 
@@ -1292,7 +1558,7 @@ def guess_model_task(model):
            return "classify"
        if "detect" in m:
            return "detect"
-       if
+       if "segment" in m:
            return "segment"
        if m == "pose":
            return "pose"
@@ -1312,7 +1578,7 @@ def guess_model_task(model):
            with contextlib.suppress(Exception):
                return cfg2task(eval(x))
        for m in model.modules():
-           if isinstance(m, Segment):
+           if isinstance(m, (Segment, YOLOESegment)):
                return "segment"
            elif isinstance(m, Classify):
                return "classify"
@@ -1320,7 +1586,7 @@ def guess_model_task(model):
                return "pose"
            elif isinstance(m, OBB):
                return "obb"
-           elif isinstance(m, (Detect, WorldDetect, v10Detect)):
+           elif isinstance(m, (Detect, WorldDetect, YOLOEDetect, v10Detect)):
                return "detect"
 
    # Guess from model filename
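Taken together, the tasks.py additions let a YOLOE model be primed once with text (or visual) prompt embeddings and then run like a fixed-class detector, with no text encoder in the loop afterwards. A minimal sketch of that flow, assuming the high-level `YOLOE` wrapper exported by this release and the `yoloe-11s-seg.pt` asset listed in the downloads diff further down (the image path and class names are illustrative only):

    from ultralytics import YOLOE  # wrapper class added alongside these changes

    model = YOLOE("yoloe-11s-seg.pt")  # builds a YOLOESegModel under the hood
    names = ["person", "traffic light"]
    # get_text_pe() runs the CLIP/MobileCLIP text encoder; set_classes() caches the
    # resulting embeddings on the model so later predictions are prompt-free.
    model.set_classes(names, model.get_text_pe(names))
    results = model.predict("bus.jpg")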
ultralytics/nn/text_model.py
ADDED
@@ -0,0 +1,193 @@
+# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license
+
+from abc import abstractmethod
+from pathlib import Path
+
+import torch
+import torch.nn as nn
+
+from ultralytics.utils import LOGGER, checks
+from ultralytics.utils.torch_utils import smart_inference_mode
+
+try:
+    import clip
+except ImportError:
+    checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
+    import clip
+
+try:
+    import warnings
+
+    # Suppress 'timm.models.layers is deprecated, please import via timm.layers' warning from mobileclip usage
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=FutureWarning)
+        import mobileclip
+except ImportError:
+    # MobileCLIP repo has an incorrect version of torchvision as dependency
+    # Manually install other dependencies first and install mobileclip with "--no-deps" flag
+    checks.check_requirements(["open-clip-torch>=2.20.0", "timm>=0.9.5"])
+    checks.check_requirements("git+https://github.com/apple/ml-mobileclip.git", cmds="--no-deps")
+    import mobileclip
+
+
+class TextModel(nn.Module):
+    """
+    Abstract base class for text encoding models.
+
+    This class defines the interface for text encoding models used in vision-language tasks. Subclasses must implement
+    the tokenize and encode_text methods.
+
+    Methods:
+        tokenize: Convert input texts to tokens.
+        encode_text: Encode tokenized texts into feature vectors.
+    """
+
+    def __init__(self):
+        """Initialize the TextModel base class."""
+        super().__init__()
+
+    @abstractmethod
+    def tokenize(texts):
+        """Convert input texts to tokens for model processing."""
+        pass
+
+    @abstractmethod
+    def encode_text(texts, dtype):
+        """Encode tokenized texts into normalized feature vectors."""
+        pass
+
+
+class CLIP(TextModel):
+    """
+    OpenAI CLIP text encoder implementation.
+
+    This class implements the TextModel interface using OpenAI's CLIP model for text encoding.
+
+    Attributes:
+        model (clip.model.CLIP): The loaded CLIP model.
+        device (torch.device): Device where the model is loaded.
+
+    Methods:
+        tokenize: Convert input texts to CLIP tokens.
+        encode_text: Encode tokenized texts into normalized feature vectors.
+    """
+
+    def __init__(self, size, device):
+        """
+        Initialize the CLIP text encoder.
+
+        Args:
+            size (str): Model size identifier (e.g., 'ViT-B/32').
+            device (torch.device): Device to load the model on.
+        """
+        super().__init__()
+        self.model = clip.load(size, device=device)[0]
+        self.to(device)
+        self.device = device
+        self.eval()
+
+    def tokenize(self, texts):
+        """Convert input texts to CLIP tokens."""
+        return clip.tokenize(texts).to(self.device)
+
+    @smart_inference_mode()
+    def encode_text(self, texts, dtype=torch.float32):
+        """
+        Encode tokenized texts into normalized feature vectors.
+
+        Args:
+            texts (torch.Tensor): Tokenized text inputs.
+            dtype (torch.dtype): Data type for output features.
+
+        Returns:
+            (torch.Tensor): Normalized text feature vectors.
+        """
+        txt_feats = self.model.encode_text(texts).to(dtype)
+        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
+        return txt_feats
+
+
+class MobileCLIP(TextModel):
+    """
+    Apple MobileCLIP text encoder implementation.
+
+    This class implements the TextModel interface using Apple's MobileCLIP model for efficient text encoding.
+
+    Attributes:
+        model (mobileclip.model.MobileCLIP): The loaded MobileCLIP model.
+        tokenizer (callable): Tokenizer function for processing text inputs.
+        device (torch.device): Device where the model is loaded.
+        config_size_map (dict): Mapping from size identifiers to model configuration names.
+
+    Methods:
+        tokenize: Convert input texts to MobileCLIP tokens.
+        encode_text: Encode tokenized texts into normalized feature vectors.
+    """
+
+    config_size_map = {"s0": "s0", "s1": "s1", "s2": "s2", "b": "b", "blt": "b"}
+
+    def __init__(self, size, device):
+        """
+        Initialize the MobileCLIP text encoder.
+
+        Args:
+            size (str): Model size identifier (e.g., 's0', 's1', 's2', 'b', 'blt').
+            device (torch.device): Device to load the model on.
+        """
+        super().__init__()
+        config = self.config_size_map[size]
+        file = f"mobileclip_{size}.pt"
+        if not Path(file).is_file():
+            from ultralytics import download
+
+            download(f"https://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/{file}")
+        self.model = mobileclip.create_model_and_transforms(f"mobileclip_{config}", pretrained=file, device=device)[0]
+        self.tokenizer = mobileclip.get_tokenizer(f"mobileclip_{config}")
+        self.to(device)
+        self.device = device
+        self.eval()
+
+    def tokenize(self, texts):
+        """Convert input texts to MobileCLIP tokens."""
+        return self.tokenizer(texts).to(self.device)
+
+    @smart_inference_mode()
+    def encode_text(self, texts, dtype=torch.float32):
+        """
+        Encode tokenized texts into normalized feature vectors.
+
+        Args:
+            texts (torch.Tensor): Tokenized text inputs.
+            dtype (torch.dtype): Data type for output features.
+
+        Returns:
+            (torch.Tensor): Normalized text feature vectors.
+        """
+        text_features = self.model.encode_text(texts).to(dtype)
+        text_features /= text_features.norm(p=2, dim=-1, keepdim=True)
+        return text_features
+
+
+def build_text_model(variant, device=None):
+    """
+    Build a text encoding model based on the specified variant.
+
+    Args:
+        variant (str): Model variant in format "base:size" (e.g., "clip:ViT-B/32" or "mobileclip:s0").
+        device (torch.device, optional): Device to load the model on.
+
+    Returns:
+        (TextModel): Instantiated text encoding model.
+
+    Raises:
+        AssertionError: If the specified variant is not supported.
+    """
+    LOGGER.info(f"Build text model {variant}")
+    base, size = variant.split(":")
+    if base == "clip":
+        return CLIP(size, device)
+    elif base == "mobileclip":
+        return MobileCLIP(size, device)
+    else:
+        print("Variant not found")
+        assert False
ultralytics/utils/benchmarks.py
CHANGED
@@ -126,6 +126,7 @@ def benchmark(
             assert not isinstance(model, YOLOWorld), "YOLOWorldv2 TensorFlow exports not supported by onnx2tf yet"
         if i == 11:  # Paddle
             assert not isinstance(model, YOLOWorld), "YOLOWorldv2 Paddle exports not supported yet"
+            assert not model.task == "obb", "Paddle OBB bug https://github.com/PaddlePaddle/Paddle/issues/72024"
             assert not is_end2end, "End-to-end models not supported by PaddlePaddle yet"
             assert LINUX or MACOS, "Windows Paddle exports not supported yet"
         if i == 12:  # MNN
ultralytics/utils/callbacks/comet.py
CHANGED
@@ -194,12 +194,9 @@ def _format_prediction_annotations(image_path, metadata, class_label_map=None, c
         LOGGER.debug(f"COMET WARNING: Image: {image_path} has no bounding boxes predictions")
         return None
 
-    label_index_offset = 0
-    if class_map is not None:
     # offset to align indices of class labels (starting from zero)
     # with prediction's category ID indices (can start from one)
-
-
+    label_index_offset = sorted(class_map)[0] if class_map is not None else 0
     try:
         # import pycotools utilities to decompress annotations for various tasks, e.g. segmentation
         from pycocotools.mask import decode  # noqa
@@ -221,8 +218,8 @@ def _format_prediction_annotations(image_path, metadata, class_label_map=None, c
         segments = prediction.get("segmentation", None)
         if segments is not None:
             segments = _extract_segmentation_annotation(segments, decode)
-
-
+            if segments is not None:
+                annotation_data["points"] = segments
 
         data.append(annotation_data)
 
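The net effect of the first hunk is that the offset is now derived directly from the smallest key of class_map rather than tracked in a separately assigned variable. A quick illustration with a made-up mapping (the dictionary below is hypothetical, not from the package):

    class_map = {1: "person", 2: "bicycle", 3: "car"}  # category IDs starting from one
    label_index_offset = sorted(class_map)[0] if class_map is not None else 0  # -> 1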
ultralytics/utils/downloads.py
CHANGED
@@ -15,7 +15,7 @@ from ultralytics.utils import LOGGER, TQDM, checks, clean_url, emojis, is_online
 
 # Define Ultralytics GitHub assets maintained at https://github.com/ultralytics/assets
 GITHUB_ASSETS_REPO = "ultralytics/assets"
-GITHUB_ASSETS_NAMES = (
+GITHUB_ASSETS_NAMES = frozenset(
     [f"yolov8{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb", "-oiv7")]
     + [f"yolo11{k}{suffix}.pt" for k in "nsmlx" for suffix in ("", "-cls", "-seg", "-pose", "-obb")]
     + [f"yolo12{k}{suffix}.pt" for k in "nsmlx" for suffix in ("",)]  # detect models only currently
@@ -23,16 +23,20 @@ GITHUB_ASSETS_NAMES = (
     + [f"yolov3{k}u.pt" for k in ("", "-spp", "-tiny")]
     + [f"yolov8{k}-world.pt" for k in "smlx"]
     + [f"yolov8{k}-worldv2.pt" for k in "smlx"]
+    + [f"yoloe-v8{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
+    + [f"yoloe-11{k}{suffix}.pt" for k in "sml" for suffix in ("-seg", "-seg-pf")]
     + [f"yolov9{k}.pt" for k in "tsmce"]
     + [f"yolov10{k}.pt" for k in "nsmblx"]
     + [f"yolo_nas_{k}.pt" for k in "sml"]
     + [f"sam_{k}.pt" for k in "bl"]
+    + [f"sam2_{k}.pt" for k in "blst"]
+    + [f"sam2.1_{k}.pt" for k in "blst"]
     + [f"FastSAM-{k}.pt" for k in "sx"]
     + [f"rtdetr-{k}.pt" for k in "lx"]
     + ["mobile_sam.pt"]
     + ["calibration_image_sample_data_20x128x128x3_float32.npy.zip"]
 )
-GITHUB_ASSETS_STEMS =
+GITHUB_ASSETS_STEMS = frozenset(k.rsplit(".", 1)[0] for k in GITHUB_ASSETS_NAMES)
 
 
 def is_url(url, check=False):