dgenerate-ultralytics-headless 8.3.135__py3-none-any.whl → 8.3.138__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dgenerate_ultralytics_headless-8.3.135.dist-info → dgenerate_ultralytics_headless-8.3.138.dist-info}/METADATA +1 -2
- {dgenerate_ultralytics_headless-8.3.135.dist-info → dgenerate_ultralytics_headless-8.3.138.dist-info}/RECORD +40 -40
- tests/test_cuda.py +2 -7
- tests/test_exports.py +1 -6
- tests/test_solutions.py +181 -8
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +4 -4
- ultralytics/data/base.py +1 -1
- ultralytics/data/build.py +4 -3
- ultralytics/data/loaders.py +2 -2
- ultralytics/engine/exporter.py +6 -7
- ultralytics/engine/model.py +2 -2
- ultralytics/engine/predictor.py +3 -10
- ultralytics/engine/trainer.py +1 -1
- ultralytics/engine/validator.py +1 -1
- ultralytics/hub/auth.py +2 -2
- ultralytics/hub/utils.py +8 -3
- ultralytics/models/yolo/classify/predict.py +11 -0
- ultralytics/models/yolo/obb/val.py +1 -1
- ultralytics/models/yolo/world/train.py +66 -20
- ultralytics/models/yolo/world/train_world.py +1 -0
- ultralytics/models/yolo/yoloe/train.py +10 -39
- ultralytics/models/yolo/yoloe/val.py +3 -3
- ultralytics/nn/tasks.py +41 -24
- ultralytics/nn/text_model.py +1 -0
- ultralytics/solutions/similarity_search.py +3 -6
- ultralytics/solutions/streamlit_inference.py +1 -1
- ultralytics/utils/__init__.py +1 -1
- ultralytics/utils/callbacks/hub.py +5 -4
- ultralytics/utils/checks.py +16 -13
- ultralytics/utils/downloads.py +7 -5
- ultralytics/utils/export.py +1 -1
- ultralytics/utils/metrics.py +51 -22
- ultralytics/utils/plotting.py +19 -13
- ultralytics/utils/torch_utils.py +3 -0
- ultralytics/utils/triton.py +1 -1
- {dgenerate_ultralytics_headless-8.3.135.dist-info → dgenerate_ultralytics_headless-8.3.138.dist-info}/WHEEL +0 -0
- {dgenerate_ultralytics_headless-8.3.135.dist-info → dgenerate_ultralytics_headless-8.3.138.dist-info}/entry_points.txt +0 -0
- {dgenerate_ultralytics_headless-8.3.135.dist-info → dgenerate_ultralytics_headless-8.3.138.dist-info}/licenses/LICENSE +0 -0
- {dgenerate_ultralytics_headless-8.3.135.dist-info → dgenerate_ultralytics_headless-8.3.138.dist-info}/top_level.txt +0 -0
ultralytics/engine/model.py
CHANGED
@@ -288,7 +288,7 @@ class Model(torch.nn.Module):
         weights = checks.check_file(weights, download_dir=SETTINGS["weights_dir"])  # download and return local file
         weights = checks.check_model_file_from_stem(weights)  # add suffix, i.e. yolo11n -> yolo11n.pt

-        if
+        if str(weights).rpartition(".")[-1] == "pt":
             self.model, self.ckpt = attempt_load_one_weight(weights)
             self.task = self.model.args["task"]
             self.overrides = self.model.args = self._reset_ckpt_args(self.model.args)
@@ -319,7 +319,7 @@ class Model(torch.nn.Module):
            >>> model = Model("yolo11n.onnx")
            >>> model._check_is_pytorch_model()  # Raises TypeError
        """
-        pt_str = isinstance(self.model, (str, Path)) and
+        pt_str = isinstance(self.model, (str, Path)) and str(self.model).rpartition(".")[-1] == "pt"
         pt_module = isinstance(self.model, torch.nn.Module)
         if not (pt_module or pt_str):
             raise TypeError(
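Both hunks above replace path-suffix parsing with a plain string comparison. A minimal standalone sketch of the same check (the value is illustrative, not the surrounding Ultralytics code):

    weights = "yolo11n.pt"
    is_pt = str(weights).rpartition(".")[-1] == "pt"  # True; "yolo11n" (no dot) yields "yolo11n" and fails the check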
ultralytics/engine/predictor.py
CHANGED
@@ -43,7 +43,7 @@ import torch

 from ultralytics.cfg import get_cfg, get_save_dir
 from ultralytics.data import load_inference_source
-from ultralytics.data.augment import LetterBox, classify_transforms
+from ultralytics.data.augment import LetterBox
 from ultralytics.nn.autobackend import AutoBackend
 from ultralytics.utils import DEFAULT_CFG, LOGGER, MACOS, WINDOWS, callbacks, colorstr, ops
 from ultralytics.utils.checks import check_imgsz, check_imshow
@@ -247,15 +247,6 @@ class BasePredictor:
             Source for inference.
         """
         self.imgsz = check_imgsz(self.args.imgsz, stride=self.model.stride, min_dim=2)  # check image size
-        self.transforms = (
-            getattr(
-                self.model.model,
-                "transforms",
-                classify_transforms(self.imgsz[0]),
-            )
-            if self.args.task == "classify"
-            else None
-        )
         self.dataset = load_inference_source(
             source=source,
             batch=self.args.batch,
@@ -395,6 +386,8 @@ class BasePredictor:

         self.device = self.model.device  # update device
         self.args.half = self.model.fp16  # update half
+        if hasattr(self.model, "imgsz"):
+            self.args.imgsz = self.model.imgsz  # reuse imgsz from export metadata
         self.model.eval()

     def write_results(self, i, p, im, s):
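The new setup_model lines let exported weights drive the inference size. A hedged usage sketch with the public YOLO API (file names are illustrative):

    from ultralytics import YOLO

    model = YOLO("yolo11n.onnx")        # exported model stores imgsz in its metadata
    results = model.predict("bus.jpg")  # predictor adopts the exported imgsz instead of the default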
ultralytics/engine/trainer.py
CHANGED
@@ -578,7 +578,7 @@ class BaseTrainer:
        try:
            if self.args.task == "classify":
                data = check_cls_dataset(self.args.data)
-            elif self.args.data.
+            elif self.args.data.rsplit(".", 1)[-1] in {"yaml", "yml"} or self.args.task in {
                "detect",
                "segment",
                "pose",
ultralytics/engine/validator.py
CHANGED
@@ -175,7 +175,7 @@ class BaseValidator:
            self.args.batch = model.metadata.get("batch", 1)  # export.py models default to batch-size 1
            LOGGER.info(f"Setting batch={self.args.batch} input of shape ({self.args.batch}, 3, {imgsz}, {imgsz})")

-        if str(self.args.data).
+        if str(self.args.data).rsplit(".", 1)[-1] in {"yaml", "yml"}:
            self.data = check_det_dataset(self.args.data)
        elif self.args.task == "classify":
            self.data = check_cls_dataset(self.args.data, split=self.args.split)
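Both the trainer and validator hunks switch to an rsplit-based extension check. A standalone illustration (assumed value):

    data = "coco8.yaml"
    ext = data.rsplit(".", 1)[-1]        # "yaml"; maxsplit=1 only splits off the final extension
    is_yaml_file = ext in {"yaml", "yml"}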
ultralytics/hub/auth.py
CHANGED
@@ -37,7 +37,7 @@ class Auth:
            verbose (bool): Enable verbose logging.
        """
        # Split the input API key in case it contains a combined key_model and keep only the API key part
-        api_key = api_key.split("_")[0]
+        api_key = api_key.split("_", 1)[0]

        # Set API key attribute as value passed or SETTINGS API key if none passed
        self.api_key = api_key or SETTINGS.get("api_key", "")
@@ -77,7 +77,7 @@ class Auth:
        for attempts in range(max_attempts):
            LOGGER.info(f"{PREFIX}Login. Attempt {attempts + 1} of {max_attempts}")
            input_key = getpass.getpass(f"Enter API key from {API_KEY_URL} ")
-            self.api_key = input_key.split("_")[0]  # remove model id if present
+            self.api_key = input_key.split("_", 1)[0]  # remove model id if present
            if self.authenticate():
                return True
        raise ConnectionError(emojis(f"{PREFIX}Failed to authenticate ❌"))
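The maxsplit=1 change keeps only the key portion of a combined key, even if the remainder contains more underscores. An assumed standalone example:

    combined = "apikey123_model_abc"
    api_key = combined.split("_", 1)[0]  # "apikey123"; everything after the first "_" is discarded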
ultralytics/hub/utils.py
CHANGED
@@ -1,7 +1,6 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

 import os
-import platform
 import random
 import threading
 import time
@@ -18,6 +17,7 @@ from ultralytics.utils import (
     IS_PIP_PACKAGE,
     LOGGER,
     ONLINE,
+    PYTHON_VERSION,
     RANK,
     SETTINGS,
     TESTS_RUNNING,
@@ -27,6 +27,7 @@ from ultralytics.utils import (
     get_git_origin_url,
 )
 from ultralytics.utils.downloads import GITHUB_ASSETS_NAMES
+from ultralytics.utils.torch_utils import get_cpu_info

 HUB_API_ROOT = os.environ.get("ULTRALYTICS_HUB_API", "https://api.ultralytics.com")
 HUB_WEB_ROOT = os.environ.get("ULTRALYTICS_HUB_WEB", "https://hub.ultralytics.com")
@@ -191,7 +192,9 @@ class Events:
        self.metadata = {
            "cli": Path(ARGV[0]).name == "yolo",
            "install": "git" if IS_GIT_DIR else "pip" if IS_PIP_PACKAGE else "other",
-            "python": "."
+            "python": PYTHON_VERSION.rsplit(".", 1)[0],  # i.e. 3.13
+            "CPU": get_cpu_info(),
+            # "GPU": get_gpu_info(index=0) if cuda else None,
            "version": __version__,
            "env": ENVIRONMENT,
            "session_id": round(random.random() * 1e15),
@@ -205,12 +208,13 @@ class Events:
            and (IS_PIP_PACKAGE or get_git_origin_url() == "https://github.com/ultralytics/ultralytics.git")
        )

-    def __call__(self, cfg):
+    def __call__(self, cfg, device=None):
        """
        Attempt to add a new event to the events list and send events if the rate limit is reached.

        Args:
            cfg (IterableSimpleNamespace): The configuration object containing mode and task information.
+            device (torch.device | str): The device type (e.g., 'cpu', 'cuda').
        """
        if not self.enabled:
            # Events disabled, do nothing
@@ -222,6 +226,7 @@ class Events:
                **self.metadata,
                "task": cfg.task,
                "model": cfg.model if cfg.model in GITHUB_ASSETS_NAMES else "custom",
+                "device": str(device),
            }
            if cfg.mode == "export":
                params["format"] = cfg.format
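A hedged sketch of the new telemetry fields: PYTHON_VERSION and get_cpu_info() are the imports added in the hunks above; the dict is a simplified stand-in for Events.metadata, not the full implementation.

    from ultralytics.utils import PYTHON_VERSION
    from ultralytics.utils.torch_utils import get_cpu_info

    metadata = {
        "python": PYTHON_VERSION.rsplit(".", 1)[0],  # e.g. "3.13" from "3.13.2"
        "CPU": get_cpu_info(),                       # CPU model string
    }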
ultralytics/models/yolo/classify/predict.py
CHANGED
@@ -4,6 +4,7 @@ import cv2
 import torch
 from PIL import Image

+from ultralytics.data.augment import classify_transforms
 from ultralytics.engine.predictor import BasePredictor
 from ultralytics.engine.results import Results
 from ultralytics.utils import DEFAULT_CFG, ops
@@ -51,6 +52,16 @@ class ClassificationPredictor(BasePredictor):
        self.args.task = "classify"
        self._legacy_transform_name = "ultralytics.yolo.data.augment.ToTensor"

+    def setup_source(self, source):
+        """Sets up source and inference mode and classify transforms."""
+        super().setup_source(source)
+        updated = (
+            self.model.model.transforms.transforms[0].size != max(self.imgsz)
+            if hasattr(self.model.model, "transforms")
+            else True
+        )
+        self.transforms = self.model.model.transforms if not updated else classify_transforms(self.imgsz)
+
    def preprocess(self, img):
        """Convert input images to model-compatible tensor format with appropriate normalization."""
        if not isinstance(img, torch.Tensor):
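A hedged sketch of the rebuild condition in setup_source: transforms stored on the model are reused only when their size matches the requested imgsz, otherwise a fresh classification pipeline is built. classify_transforms is the helper imported above; the size is illustrative.

    from ultralytics.data.augment import classify_transforms

    transforms = classify_transforms(224)  # classification preprocessing pipeline sized to 224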
ultralytics/models/yolo/obb/val.py
CHANGED
@@ -252,7 +252,7 @@ class OBBValidator(DetectionValidator):
        merged_results = defaultdict(list)
        LOGGER.info(f"Saving merged predictions with DOTA format to {pred_merged_txt}...")
        for d in data:
-            image_id = d["image_id"].split("__")[0]
+            image_id = d["image_id"].split("__", 1)[0]
            pattern = re.compile(r"\d+___\d+")
            x, y = (int(c) for c in re.findall(pattern, d["image_id"])[0].split("___"))
            bbox, score, cls = d["rbox"], d["score"], d["category_id"] - 1
ultralytics/models/yolo/world/train.py
CHANGED
@@ -1,11 +1,14 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

 import itertools
+from pathlib import Path
+
+import torch

 from ultralytics.data import build_yolo_dataset
-from ultralytics.models import
+from ultralytics.models.yolo.detect import DetectionTrainer
 from ultralytics.nn.tasks import WorldModel
-from ultralytics.utils import DEFAULT_CFG,
+from ultralytics.utils import DEFAULT_CFG, LOGGER, RANK
 from ultralytics.utils.torch_utils import de_parallel


@@ -13,15 +16,11 @@ def on_pretrain_routine_end(trainer):
    """Callback to set up model classes and text encoder at the end of the pretrain routine."""
    if RANK in {-1, 0}:
        # Set class names for evaluation
-        names = [name.split("/")[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
+        names = [name.split("/", 1)[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
        de_parallel(trainer.ema.ema).set_classes(names, cache_clip_model=False)
-        device = next(trainer.model.parameters()).device
-        trainer.text_model, _ = trainer.clip.load("ViT-B/32", device=device)
-        for p in trainer.text_model.parameters():
-            p.requires_grad_(False)


-class WorldTrainer(
+class WorldTrainer(DetectionTrainer):
    """
    A class to fine-tune a world model on a close-set dataset.

@@ -54,14 +53,7 @@ class WorldTrainer(yolo.detect.DetectionTrainer):
        if overrides is None:
            overrides = {}
        super().__init__(cfg, overrides, _callbacks)
-
-        # Import and assign clip
-        try:
-            import clip
-        except ImportError:
-            checks.check_requirements("git+https://github.com/ultralytics/CLIP.git")
-            import clip
-        self.clip = clip
+        self.text_embeddings = None

    def get_model(self, cfg=None, weights=None, verbose=True):
        """
@@ -102,18 +94,72 @@ class WorldTrainer(yolo.detect.DetectionTrainer):
            (Dataset): YOLO dataset configured for training or validation.
        """
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
-
+        dataset = build_yolo_dataset(
            self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
        )
+        if mode == "train":
+            self.set_text_embeddings([dataset], batch)  # cache text embeddings to accelerate training
+        return dataset
+
+    def set_text_embeddings(self, datasets, batch):
+        """
+        Set text embeddings for datasets to accelerate training by caching category names.
+
+        This method collects unique category names from all datasets, then generates and caches text embeddings
+        for these categories to improve training efficiency.
+
+        Args:
+            datasets (List[Dataset]): List of datasets from which to extract category names.
+            batch (int | None): Batch size used for processing.
+
+        Notes:
+            This method collects category names from datasets that have the 'category_names' attribute,
+            then uses the first dataset's image path to determine where to cache the generated text embeddings.
+        """
+        text_embeddings = {}
+        for dataset in datasets:
+            if not hasattr(dataset, "category_names"):
+                continue
+            text_embeddings.update(
+                self.generate_text_embeddings(
+                    list(dataset.category_names), batch, cache_dir=Path(dataset.img_path).parent
+                )
+            )
+        self.text_embeddings = text_embeddings
+
+    def generate_text_embeddings(self, texts, batch, cache_dir):
+        """
+        Generate text embeddings for a list of text samples.
+
+        Args:
+            texts (List[str]): List of text samples to encode.
+            batch (int): Batch size for processing.
+            cache_dir (Path): Directory to save/load cached embeddings.
+
+        Returns:
+            (dict): Dictionary mapping text samples to their embeddings.
+        """
+        model = "clip:ViT-B/32"
+        cache_path = cache_dir / f"text_embeddings_{model.replace(':', '_').replace('/', '_')}.pt"
+        if cache_path.exists():
+            LOGGER.info(f"Reading existed cache from '{cache_path}'")
+            txt_map = torch.load(cache_path)
+            if sorted(txt_map.keys()) == sorted(texts):
+                return txt_map
+        LOGGER.info(f"Caching text embeddings to '{cache_path}'")
+        assert self.model is not None
+        txt_feats = self.model.get_text_pe(texts, batch, cache_clip_model=False)
+        txt_map = dict(zip(texts, txt_feats.squeeze(0)))
+        torch.save(txt_map, cache_path)
+        return txt_map

    def preprocess_batch(self, batch):
        """Preprocess a batch of images and text for YOLOWorld training."""
-        batch =
+        batch = DetectionTrainer.preprocess_batch(self, batch)

        # Add text features
        texts = list(itertools.chain(*batch["texts"]))
-
-        txt_feats = self.text_model.encode_text(text_token).to(dtype=batch["img"].dtype)  # torch.float32
+        txt_feats = torch.stack([self.text_embeddings[text] for text in texts]).to(self.device)
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
        return batch
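A hedged usage sketch of the caching added above: on the first training run the class-name embeddings are written beside the dataset (a text_embeddings_clip_ViT-B_32.pt file under the image folder's parent), and later runs reuse them when the class list is unchanged. Standard YOLO-World training call; the dataset, epochs, and image size are illustrative.

    from ultralytics import YOLO

    model = YOLO("yolov8s-world.pt")
    model.train(data="coco8.yaml", epochs=1, imgsz=640)  # embeddings cached during dataset build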
ultralytics/models/yolo/world/train_world.py
CHANGED
@@ -100,6 +100,7 @@ class WorldTrainerFromScratch(WorldTrainer):
            else build_grounding(self.args, im_path["img_path"], im_path["json_file"], batch, stride=gs)
            for im_path in img_path
        ]
+        self.set_text_embeddings(datasets, batch)  # cache text embeddings to accelerate training
        return YOLOConcatDataset(datasets) if len(datasets) > 1 else datasets[0]

    def get_dataset(self):
ultralytics/models/yolo/yoloe/train.py
CHANGED
@@ -2,7 +2,6 @@

 import itertools
 from copy import copy, deepcopy
-from pathlib import Path

 import torch

@@ -157,40 +156,7 @@ class YOLOETrainerFromScratch(YOLOETrainer, WorldTrainerFromScratch):
        Returns:
            (YOLOConcatDataset | Dataset): The constructed dataset for training or validation.
        """
-
-        if mode == "train":
-            self.set_text_embeddings(
-                datasets.datasets if hasattr(datasets, "datasets") else [datasets], batch
-            )  # cache text embeddings to accelerate training
-        return datasets
-
-    def set_text_embeddings(self, datasets, batch):
-        """
-        Set text embeddings for datasets to accelerate training by caching category names.
-
-        This method collects unique category names from all datasets, then generates and caches text embeddings
-        for these categories to improve training efficiency.
-
-        Args:
-            datasets (List[Dataset]): List of datasets from which to extract category names.
-            batch (int | None): Batch size used for processing.
-
-        Notes:
-            This method collects category names from datasets that have the 'category_names' attribute,
-            then uses the first dataset's image path to determine where to cache the generated text embeddings.
-        """
-        # TODO: open up an interface to determine whether to do cache
-        category_names = set()
-        for dataset in datasets:
-            if not hasattr(dataset, "category_names"):
-                continue
-            category_names |= dataset.category_names
-
-        # TODO: enable to update the path or use a more general way to get the path
-        img_path = datasets[0].img_path
-        self.text_embeddings = self.generate_text_embeddings(
-            category_names, batch, cache_path=Path(img_path).parent / "text_embeddings.pt"
-        )
+        return WorldTrainerFromScratch.build_dataset(self, img_path, mode, batch)

    def preprocess_batch(self, batch):
        """Process batch for training, moving text features to the appropriate device."""
@@ -202,23 +168,28 @@ class YOLOETrainerFromScratch(YOLOETrainer, WorldTrainerFromScratch):
        batch["txt_feats"] = txt_feats
        return batch

-    def generate_text_embeddings(self, texts, batch,
+    def generate_text_embeddings(self, texts, batch, cache_dir):
        """
        Generate text embeddings for a list of text samples.

        Args:
            texts (List[str]): List of text samples to encode.
            batch (int): Batch size for processing.
-
+            cache_dir (Path): Directory to save/load cached embeddings.

        Returns:
            (dict): Dictionary mapping text samples to their embeddings.
        """
+        model = "mobileclip:blt"
+        cache_path = cache_dir / f"text_embeddings_{model.replace(':', '_').replace('/', '_')}.pt"
        if cache_path.exists():
            LOGGER.info(f"Reading existed cache from '{cache_path}'")
-
+            txt_map = torch.load(cache_path)
+            if sorted(txt_map.keys()) == sorted(texts):
+                return txt_map
+        LOGGER.info(f"Caching text embeddings to '{cache_path}'")
        assert self.model is not None
-        txt_feats = self.model.get_text_pe(texts, batch, without_reprta=True)
+        txt_feats = self.model.get_text_pe(texts, batch, without_reprta=True, cache_clip_model=False)
        txt_map = dict(zip(texts, txt_feats.squeeze(0)))
        torch.save(txt_map, cache_path)
        return txt_map
ultralytics/models/yolo/yoloe/val.py
CHANGED
@@ -47,7 +47,7 @@ class YOLOEDetectValidator(DetectionValidator):
            (torch.Tensor): Visual prompt embeddings with shape (1, num_classes, embed_dim).
        """
        assert isinstance(model, YOLOEModel)
-        names = [name.split("/")[0] for name in list(dataloader.dataset.data["names"].values())]
+        names = [name.split("/", 1)[0] for name in list(dataloader.dataset.data["names"].values())]
        visual_pe = torch.zeros(len(names), model.model[-1].embed, device=self.device)
        cls_visual_num = torch.zeros(len(names))

@@ -140,7 +140,7 @@ class YOLOEDetectValidator(DetectionValidator):
        if trainer is not None:
            self.device = trainer.device
            model = trainer.ema.ema
-            names = [name.split("/")[0] for name in list(self.dataloader.dataset.data["names"].values())]
+            names = [name.split("/", 1)[0] for name in list(self.dataloader.dataset.data["names"].values())]

            if load_vp:
                LOGGER.info("Validate using the visual prompt.")
@@ -164,7 +164,7 @@ class YOLOEDetectValidator(DetectionValidator):
            model = attempt_load_weights(model, device=self.device, inplace=True)
            model.eval().to(self.device)
            data = check_det_dataset(refer_data or self.args.data)
-            names = [name.split("/")[0] for name in list(data["names"].values())]
+            names = [name.split("/", 1)[0] for name in list(data["names"].values())]

            if load_vp:
                LOGGER.info("Validate using the visual prompt.")
ultralytics/nn/tasks.py
CHANGED
@@ -146,6 +146,8 @@ class BaseModel(torch.nn.Module):
            (torch.Tensor): The last output of the model.
        """
        y, dt, embeddings = [], [], []  # outputs
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
        for m in self.model:
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -155,9 +157,9 @@ class BaseModel(torch.nn.Module):
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
        return x

@@ -677,6 +679,8 @@ class RTDETRDetectionModel(DetectionModel):
            (torch.Tensor): Model's output tensor.
        """
        y, dt, embeddings = [], [], []  # outputs
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
        for m in self.model[:-1]:  # except the head part
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -686,9 +690,9 @@ class RTDETRDetectionModel(DetectionModel):
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
        head = self.model[-1]
        x = head([y[j] for j in head.f], batch)  # head inference
@@ -721,24 +725,33 @@ class WorldModel(DetectionModel):
            batch (int): Batch size for processing text tokens.
            cache_clip_model (bool): Whether to cache the CLIP model.
        """
-
-
-
-
-
-
-
-
-
-
-
-
-
+        self.txt_feats = self.get_text_pe(text, batch=batch, cache_clip_model=cache_clip_model)
+        self.model[-1].nc = len(text)
+
+    @smart_inference_mode()
+    def get_text_pe(self, text, batch=80, cache_clip_model=True):
+        """
+        Set classes in advance so that model could do offline-inference without clip model.
+
+        Args:
+            text (List[str]): List of class names.
+            batch (int): Batch size for processing text tokens.
+            cache_clip_model (bool): Whether to cache the CLIP model.
+
+        Returns:
+            (torch.Tensor): Text positional embeddings.
+        """
+        from ultralytics.nn.text_model import build_text_model
+
+        device = next(self.model.parameters()).device
+        if not getattr(self, "clip_model", None) and cache_clip_model:
+            # For backwards compatibility of models lacking clip_model attribute
+            self.clip_model = build_text_model("clip:ViT-B/32", device=device)
+        model = self.clip_model if cache_clip_model else build_text_model("clip:ViT-B/32", device=device)
+        text_token = model.tokenize(text)
        txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
        txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
-
-        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
-        self.model[-1].nc = len(text)
+        return txt_feats.reshape(-1, len(text), txt_feats.shape[-1])

    def predict(self, x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None):
        """
@@ -760,6 +773,8 @@ class WorldModel(DetectionModel):
        txt_feats = txt_feats.expand(x.shape[0], -1, -1)
        ori_txt_feats = txt_feats.clone()
        y, dt, embeddings = [], [], []  # outputs
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
        for m in self.model:  # except the head part
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -777,9 +792,9 @@ class WorldModel(DetectionModel):
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
        return x

@@ -976,6 +991,8 @@ class YOLOEModel(DetectionModel):
        """
        y, dt, embeddings = [], [], []  # outputs
        b = x.shape[0]
+        embed = frozenset(embed) if embed is not None else {-1}
+        max_idx = max(embed)
        for m in self.model:  # except the head part
            if m.f != -1:  # if not from previous layer
                x = y[m.f] if isinstance(m.f, int) else [x if j == -1 else y[j] for j in m.f]  # from earlier layers
@@ -997,9 +1014,9 @@ class YOLOEModel(DetectionModel):
            y.append(x if m.i in self.save else None)  # save output
            if visualize:
                feature_visualization(x, m.type, m.i, save_dir=visualize)
-            if
+            if m.i in embed:
                embeddings.append(torch.nn.functional.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1))  # flatten
-                if m.i ==
+                if m.i == max_idx:
                    return torch.unbind(torch.cat(embeddings, 1), dim=0)
        return x

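The repeated frozenset/max_idx hunks all serve the embedding-extraction path, where a set of layer indices is passed via `embed` and pooled features are returned instead of detections. A hedged sketch using the public API (model and source are illustrative; layer selection defaults are handled internally):

    from ultralytics import YOLO

    model = YOLO("yolo11n.pt")
    embeddings = model.embed("bus.jpg")  # pooled feature tensors, one per image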
ultralytics/nn/text_model.py
CHANGED
ultralytics/solutions/similarity_search.py
CHANGED
@@ -30,12 +30,9 @@ class VisualAISearch(BaseSolution):
        """Initializes the VisualAISearch class with the FAISS index file and CLIP model."""
        super().__init__(**kwargs)
        check_requirements(["git+https://github.com/ultralytics/CLIP.git", "faiss-cpu"])
-        import clip
-        import faiss
-
-        self.faiss = faiss
-        self.clip = clip

+        self.faiss = __import__("faiss")
+        self.clip = __import__("clip")
        self.faiss_index = "faiss.index"
        self.data_path_npy = "paths.npy"
        self.model_name = "ViT-B/32"
@@ -51,7 +48,7 @@ class VisualAISearch(BaseSolution):
            safe_download(url=f"{ASSETS_URL}/images.zip", unzip=True, retry=3)
            self.data_dir = Path("images")

-        self.model, self.preprocess = clip.load(self.model_name, device=self.device)
+        self.model, self.preprocess = self.clip.load(self.model_name, device=self.device)

        self.index = None
        self.image_paths = []
ultralytics/solutions/streamlit_inference.py
CHANGED
@@ -130,7 +130,7 @@ class Inference:
        # Add dropdown menu for model selection
        available_models = [x.replace("yolo", "YOLO") for x in GITHUB_ASSETS_STEMS if x.startswith("yolo11")]
        if self.model_path:  # If user provided the custom model, insert model without suffix as *.pt is added later
-            available_models.insert(0, self.model_path.split(".pt")[0])
+            available_models.insert(0, self.model_path.split(".pt", 1)[0])
        selected_model = self.st.sidebar.selectbox("Model", available_models)

        with self.st.spinner("Model is downloading..."):
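A hedged note on the __import__ pattern in the VisualAISearch hunk: once check_requirements() has verified the optional packages, __import__ binds a top-level module without a module-level import statement.

    faiss = __import__("faiss")  # equivalent to `import faiss` for a top-level module name
    clip = __import__("clip")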
ultralytics/utils/__init__.py
CHANGED
@@ -1387,7 +1387,7 @@ def deprecation_warn(arg, new_arg=None):
 def clean_url(url):
     """Strip auth from URL, i.e. https://url.com/file.txt?auth -> https://url.com/file.txt."""
     url = Path(url).as_posix().replace(":/", "://")  # Pathlib turns :// -> :/, as_posix() for Windows
-    return unquote(url).split("?")[0]  # '%2F' to '/', split https://url.com/file.txt?auth
+    return unquote(url).split("?", 1)[0]  # '%2F' to '/', split https://url.com/file.txt?auth


 def url2file(url):
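A standalone illustration (assumed URL) of the clean_url behavior with maxsplit:

    url = "https://url.com/file.txt?auth=token"
    clean = url.split("?", 1)[0]  # "https://url.com/file.txt"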
ultralytics/utils/callbacks/hub.py
CHANGED
@@ -73,22 +73,23 @@ def on_train_end(trainer):

 def on_train_start(trainer):
     """Run events on train start."""
-    events(trainer.args)
+    events(trainer.args, trainer.device)


 def on_val_start(validator):
     """Run events on validation start."""
-    events(validator.args)
+    if not validator.training:
+        events(validator.args, validator.device)


 def on_predict_start(predictor):
     """Run events on predict start."""
-    events(predictor.args)
+    events(predictor.args, predictor.device)


 def on_export_start(exporter):
     """Run events on export start."""
-    events(exporter.args)
+    events(exporter.args, exporter.device)


 callbacks = (
|