ultralytics 8.1.37__py3-none-any.whl → 8.1.39__py3-none-any.whl
This diff compares the publicly released contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
- ultralytics/__init__.py +1 -1
- ultralytics/cfg/__init__.py +1 -2
- ultralytics/cfg/datasets/lvis.yaml +1239 -0
- ultralytics/cfg/default.yaml +2 -2
- ultralytics/data/__init__.py +18 -2
- ultralytics/data/augment.py +123 -2
- ultralytics/data/base.py +2 -0
- ultralytics/data/build.py +25 -3
- ultralytics/data/converter.py +22 -4
- ultralytics/data/dataset.py +143 -27
- ultralytics/data/utils.py +25 -1
- ultralytics/engine/exporter.py +1 -3
- ultralytics/engine/model.py +4 -1
- ultralytics/engine/trainer.py +48 -44
- ultralytics/models/fastsam/prompt.py +1 -1
- ultralytics/models/yolo/__init__.py +2 -2
- ultralytics/models/yolo/detect/val.py +36 -17
- ultralytics/models/yolo/model.py +1 -0
- ultralytics/models/yolo/world/__init__.py +5 -0
- ultralytics/models/yolo/world/train.py +91 -0
- ultralytics/models/yolo/world/train_world.py +108 -0
- ultralytics/nn/autobackend.py +1 -1
- ultralytics/nn/modules/block.py +4 -2
- ultralytics/nn/modules/head.py +9 -0
- ultralytics/nn/tasks.py +29 -13
- ultralytics/solutions/heatmap.py +84 -46
- ultralytics/solutions/object_counter.py +79 -64
- ultralytics/trackers/utils/gmc.py +1 -1
- ultralytics/utils/callbacks/raytune.py +1 -1
- ultralytics/utils/loss.py +1 -1
- ultralytics/utils/plotting.py +35 -21
- ultralytics/utils/torch_utils.py +14 -0
- ultralytics/utils/tuner.py +2 -2
- {ultralytics-8.1.37.dist-info → ultralytics-8.1.39.dist-info}/METADATA +1 -1
- {ultralytics-8.1.37.dist-info → ultralytics-8.1.39.dist-info}/RECORD +39 -35
- {ultralytics-8.1.37.dist-info → ultralytics-8.1.39.dist-info}/LICENSE +0 -0
- {ultralytics-8.1.37.dist-info → ultralytics-8.1.39.dist-info}/WHEEL +0 -0
- {ultralytics-8.1.37.dist-info → ultralytics-8.1.39.dist-info}/entry_points.txt +0 -0
- {ultralytics-8.1.37.dist-info → ultralytics-8.1.39.dist-info}/top_level.txt +0 -0
ultralytics/engine/trainer.py
CHANGED
```diff
@@ -42,7 +42,7 @@ from ultralytics.utils.files import get_latest_run
 from ultralytics.utils.torch_utils import (
     EarlyStopping,
     ModelEMA,
-
+    convert_optimizer_state_dict_to_fp16,
     init_seeds,
     one_cycle,
     select_device,
@@ -126,22 +126,7 @@ class BaseTrainer:

         # Model and Dataset
         self.model = check_model_file_from_stem(self.args.model)  # add suffix, i.e. yolov8n -> yolov8n.pt
-
-            if self.args.task == "classify":
-                self.data = check_cls_dataset(self.args.data)
-            elif self.args.data.split(".")[-1] in ("yaml", "yml") or self.args.task in (
-                "detect",
-                "segment",
-                "pose",
-                "obb",
-            ):
-                self.data = check_det_dataset(self.args.data)
-                if "yaml_file" in self.data:
-                    self.args.data = self.data["yaml_file"]  # for validating 'yolo train data=url.zip' usage
-        except Exception as e:
-            raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
-
-        self.trainset, self.testset = self.get_dataset(self.data)
+        self.trainset, self.testset = self.get_dataset()
        self.ema = None

         # Optimization utils init
```
```diff
@@ -477,40 +462,59 @@ class BaseTrainer:

     def save_model(self):
         """Save model training checkpoints with additional metadata."""
+        import io
         import pandas as pd  # scope for faster startup

-  [20 removed lines of the previous save_model body, not captured in this diff view]
+        # Serialize ckpt to a byte buffer once (faster than repeated torch.save() calls)
+        buffer = io.BytesIO()
+        torch.save(
+            {
+                "epoch": self.epoch,
+                "best_fitness": self.best_fitness,
+                "model": None,  # resume and final checkpoints derive from EMA
+                "ema": deepcopy(self.ema.ema).half(),
+                "updates": self.ema.updates,
+                "optimizer": convert_optimizer_state_dict_to_fp16(deepcopy(self.optimizer.state_dict())),
+                "train_args": vars(self.args),  # save as dict
+                "train_metrics": {**self.metrics, **{"fitness": self.fitness}},
+                "train_results": {k.strip(): v for k, v in pd.read_csv(self.csv).to_dict(orient="list").items()},
+                "date": datetime.now().isoformat(),
+                "version": __version__,
+                "license": "AGPL-3.0 (https://ultralytics.com/license)",
+                "docs": "https://docs.ultralytics.com",
+            },
+            buffer,
+        )
+        serialized_ckpt = buffer.getvalue()  # get the serialized content to save
+
+        # Save checkpoints
+        self.last.write_bytes(serialized_ckpt)  # save last.pt
         if self.best_fitness == self.fitness:
-
+            self.best.write_bytes(serialized_ckpt)  # save best.pt
         if (self.save_period > 0) and (self.epoch > 0) and (self.epoch % self.save_period == 0):
-
+            (self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt)  # save epoch, i.e. 'epoch3.pt'

-
-    def get_dataset(data):
+    def get_dataset(self):
         """
         Get train, val path from data dict if it exists.

         Returns None if data format is not recognized.
         """
+        try:
+            if self.args.task == "classify":
+                data = check_cls_dataset(self.args.data)
+            elif self.args.data.split(".")[-1] in ("yaml", "yml") or self.args.task in (
+                "detect",
+                "segment",
+                "pose",
+                "obb",
+            ):
+                data = check_det_dataset(self.args.data)
+                if "yaml_file" in data:
+                    self.args.data = data["yaml_file"]  # for validating 'yolo train data=url.zip' usage
+        except Exception as e:
+            raise RuntimeError(emojis(f"Dataset '{clean_url(self.args.data)}' error ❌ {e}")) from e
+        self.data = data
         return data["train"], data.get("val") or data.get("test")

     def setup_model(self):
```
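The new save_model() serializes the checkpoint dict once into an in-memory buffer and then writes the same bytes to last.pt, best.pt, and any periodic epoch file, instead of calling torch.save() per destination; the optimizer state is also converted to FP16 before saving. A minimal sketch of the serialize-once pattern (file names and dict contents here are illustrative, not the trainer's real checkpoint):

```python
import io
from pathlib import Path

import torch

ckpt = {"epoch": 3, "weights": {"w": torch.zeros(2, 2)}}  # toy checkpoint dict

buffer = io.BytesIO()
torch.save(ckpt, buffer)        # serialize once into memory
serialized = buffer.getvalue()  # raw bytes of the checkpoint

for dst in (Path("last.pt"), Path("best.pt")):
    dst.write_bytes(serialized)  # write identical bytes, no re-serialization per file
```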
```diff
@@ -522,7 +526,7 @@ class BaseTrainer:
         ckpt = None
         if str(model).endswith(".pt"):
             weights, ckpt = attempt_load_one_weight(model)
-            cfg =
+            cfg = weights.yaml
         else:
             cfg = model
         self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1)  # calls Model(cfg, weights)
@@ -661,8 +665,8 @@ class BaseTrainer:
         if ckpt is None:
             return
         best_fitness = 0.0
-        start_epoch = ckpt
-        if ckpt
+        start_epoch = ckpt.get("epoch", -1) + 1
+        if ckpt.get("optimizer", None) is not None:
             self.optimizer.load_state_dict(ckpt["optimizer"])  # optimizer
             best_fitness = ckpt["best_fitness"]
         if self.ema and ckpt.get("ema"):
```
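resume_training() now reads optional checkpoint fields with dict.get(), so a checkpoint missing "epoch" or "optimizer" no longer raises a KeyError. The same defensive pattern in isolation (the checkpoint dict below is hypothetical):

```python
ckpt = {"best_fitness": 0.5}  # hypothetical checkpoint with no "epoch" or "optimizer" entries

start_epoch = ckpt.get("epoch", -1) + 1      # falls back to 0 when "epoch" is absent
if ckpt.get("optimizer", None) is not None:  # restore optimizer state only when present
    print("restoring optimizer state")
print(start_epoch)  # 0
```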
ultralytics/models/fastsam/prompt.py
CHANGED

```diff
@@ -35,7 +35,7 @@ class FastSAMPrompt:
         except ImportError:
             from ultralytics.utils.checks import check_requirements

-            check_requirements("git+https://github.com/
+            check_requirements("git+https://github.com/ultralytics/CLIP.git")
             import clip
         self.clip = clip

```
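FastSAMPrompt (and WorldModel below) now install CLIP from the ultralytics/CLIP fork when the package is missing. The install-on-ImportError pattern, as used in the changed code (a sketch; check_requirements performs the pip install at runtime):

```python
try:
    import clip
except ImportError:
    from ultralytics.utils.checks import check_requirements

    check_requirements("git+https://github.com/ultralytics/CLIP.git")  # pip-installs the CLIP fork
    import clip
```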
ultralytics/models/yolo/__init__.py
CHANGED

```diff
@@ -1,7 +1,7 @@
 # Ultralytics YOLO 🚀, AGPL-3.0 license

-from ultralytics.models.yolo import classify, detect, obb, pose, segment
+from ultralytics.models.yolo import classify, detect, obb, pose, segment, world

 from .model import YOLO, YOLOWorld

-__all__ = "classify", "segment", "detect", "pose", "obb", "YOLO", "YOLOWorld"
+__all__ = "classify", "segment", "detect", "pose", "obb", "world", "YOLO", "YOLOWorld"
```
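With `world` added to `__all__`, the YOLO-World training code becomes reachable from the yolo namespace. A hedged usage sketch, assuming the new world/__init__.py (whose +5 lines are not shown in this diff) re-exports WorldTrainer:

```python
from ultralytics.models.yolo import world

# Fine-tune a pretrained YOLO-World checkpoint on a small closed-set dataset.
args = dict(model="yolov8s-world.pt", data="coco8.yaml", epochs=1)
trainer = world.WorldTrainer(overrides=args)
trainer.train()
```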
ultralytics/models/yolo/detect/val.py
CHANGED

```diff
@@ -33,6 +33,7 @@ class DetectionValidator(BaseValidator):
         super().__init__(dataloader, save_dir, pbar, args, _callbacks)
         self.nt_per_class = None
         self.is_coco = False
+        self.is_lvis = False
         self.class_map = None
         self.args.task = "detect"
         self.metrics = DetMetrics(save_dir=self.save_dir, on_plot=self.on_plot)
@@ -66,8 +67,9 @@ class DetectionValidator(BaseValidator):
         """Initialize evaluation metrics for YOLO."""
         val = self.data.get(self.args.split, "")  # validation path
         self.is_coco = isinstance(val, str) and "coco" in val and val.endswith(f"{os.sep}val2017.txt")  # is COCO
-        self.
-        self.
+        self.is_lvis = isinstance(val, str) and "lvis" in val and not self.is_coco  # is LVIS
+        self.class_map = converter.coco80_to_coco91_class() if self.is_coco else list(range(len(model.names)))
+        self.args.save_json |= (self.is_coco or self.is_lvis) and not self.training  # run on final val if training COCO
         self.names = model.names
         self.nc = len(model.names)
         self.metrics.names = self.names
```
```diff
@@ -266,7 +268,8 @@ class DetectionValidator(BaseValidator):
             self.jdict.append(
                 {
                     "image_id": image_id,
-                    "category_id": self.class_map[int(p[5])]
+                    "category_id": self.class_map[int(p[5])]
+                    + (1 if self.is_lvis else 0),  # index starts from 1 if it's lvis
                     "bbox": [round(x, 3) for x in b],
                     "score": round(p[4], 5),
                 }
```
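LVIS category ids are 1-based, so the JSON records written for save_json shift the mapped class index by one when the validation set is LVIS (COCO keeps its 80-to-91 remap). A toy illustration of the record construction (all values made up):

```python
is_lvis = True
class_map = list(range(1203))   # identity map used for LVIS; COCO uses coco80_to_coco91_class()

cls_idx = 17                    # predicted class index
box = [10.0, 20.0, 30.0, 40.0]  # xywh, top-left origin
score = 0.87654

record = {
    "image_id": 42,
    "category_id": class_map[cls_idx] + (1 if is_lvis else 0),  # 1-based ids for LVIS
    "bbox": [round(x, 3) for x in box],
    "score": round(score, 5),
}
print(record["category_id"])  # 18
```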
```diff
@@ -274,26 +277,42 @@ class DetectionValidator(BaseValidator):

     def eval_json(self, stats):
         """Evaluates YOLO output in JSON format and returns performance statistics."""
-        if self.args.save_json and self.is_coco and len(self.jdict):
-            anno_json = self.data["path"] / "annotations/instances_val2017.json"  # annotations
+        if self.args.save_json and (self.is_coco or self.is_lvis) and len(self.jdict):
             pred_json = self.save_dir / "predictions.json"  # predictions
-
+            anno_json = (
+                self.data["path"]
+                / "annotations"
+                / ("instances_val2017.json" if self.is_coco else f"lvis_v1_{self.args.split}.json")
+            )  # annotations
+            pkg = "pycocotools" if self.is_coco else "lvis"
+            LOGGER.info(f"\nEvaluating {pkg} mAP using {pred_json} and {anno_json}...")
             try:  # https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocoEvalDemo.ipynb
-
-                from pycocotools.coco import COCO  # noqa
-                from pycocotools.cocoeval import COCOeval  # noqa
-
-                for x in anno_json, pred_json:
+                for x in pred_json, anno_json:
                     assert x.is_file(), f"{x} file not found"
-
-                pred = anno.loadRes(str(pred_json))  # init predictions api (must pass string, not Path)
-                eval = COCOeval(anno, pred, "bbox")
+                check_requirements("pycocotools>=2.0.6" if self.is_coco else "lvis>=0.5.3")
                 if self.is_coco:
-
+                    from pycocotools.coco import COCO  # noqa
+                    from pycocotools.cocoeval import COCOeval  # noqa
+
+                    anno = COCO(str(anno_json))  # init annotations api
+                    pred = anno.loadRes(str(pred_json))  # init predictions api (must pass string, not Path)
+                    eval = COCOeval(anno, pred, "bbox")
+                else:
+                    from lvis import LVIS, LVISEval
+
+                    anno = LVIS(str(anno_json))  # init annotations api
+                    pred = anno._load_json(str(pred_json))  # init predictions api (must pass string, not Path)
+                    eval = LVISEval(anno, pred, "bbox")
+                eval.params.imgIds = [int(Path(x).stem) for x in self.dataloader.dataset.im_files]  # images to eval
                 eval.evaluate()
                 eval.accumulate()
                 eval.summarize()
-
+                if self.is_lvis:
+                    eval.print_results()  # explicitly call print_results
+                # update mAP50-95 and mAP50
+                stats[self.metrics.keys[-1]], stats[self.metrics.keys[-2]] = (
+                    eval.stats[:2] if self.is_coco else [eval.results["AP50"], eval.results["AP"]]
+                )
             except Exception as e:
-                LOGGER.warning(f"
+                LOGGER.warning(f"{pkg} unable to run: {e}")
         return stats
```
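In practice this means validating on the new lvis.yaml with save_json enabled runs the `lvis` evaluator on predictions.json, while COCO datasets keep using pycocotools. A hedged end-to-end sketch with the public API (model and split choices are illustrative):

```python
from ultralytics import YOLO

model = YOLO("yolov8n.pt")
# With save_json=True the validator writes predictions.json and, for COCO/LVIS datasets,
# scores it with the matching evaluator (pycocotools or lvis) during the final validation.
metrics = model.val(data="lvis.yaml", split="val", save_json=True)
print(metrics.box.map)  # mAP50-95 as reported by the validator
```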
ultralytics/models/yolo/world/train.py
ADDED
````python
# Ultralytics YOLO 🚀, AGPL-3.0 license

from ultralytics.models import yolo
from ultralytics.nn.tasks import WorldModel
from ultralytics.utils import DEFAULT_CFG, RANK
from ultralytics.data import build_yolo_dataset
from ultralytics.utils.torch_utils import de_parallel
from ultralytics.utils.checks import check_requirements
import itertools

try:
    import clip
except ImportError:
    check_requirements("git+https://github.com/ultralytics/CLIP.git")
    import clip


def on_pretrain_routine_end(trainer):
    """Callback."""
    if RANK in (-1, 0):
        # NOTE: for evaluation
        names = [name.split("/")[0] for name in list(trainer.test_loader.dataset.data["names"].values())]
        de_parallel(trainer.ema.ema).set_classes(names, cache_clip_model=False)
    device = next(trainer.model.parameters()).device
    text_model, _ = clip.load("ViT-B/32", device=device)
    for p in text_model.parameters():
        p.requires_grad_(False)
    trainer.text_model = text_model


class WorldTrainer(yolo.detect.DetectionTrainer):
    """
    A class to fine-tune a world model on a close-set dataset.

    Example:
        ```python
        from ultralytics.models.yolo.world import WorldModel

        args = dict(model='yolov8s-world.pt', data='coco8.yaml', epochs=3)
        trainer = WorldTrainer(overrides=args)
        trainer.train()
        ```
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialize a WorldTrainer object with given arguments."""
        if overrides is None:
            overrides = {}
        super().__init__(cfg, overrides, _callbacks)

    def get_model(self, cfg=None, weights=None, verbose=True):
        """Return WorldModel initialized with specified config and weights."""
        # NOTE: This `nc` here is the max number of different text samples in one image, rather than the actual `nc`.
        # NOTE: Following the official config, nc hard-coded to 80 for now.
        model = WorldModel(
            cfg["yaml_file"] if isinstance(cfg, dict) else cfg,
            ch=3,
            nc=min(self.data["nc"], 80),
            verbose=verbose and RANK == -1,
        )
        if weights:
            model.load(weights)
        self.add_callback("on_pretrain_routine_end", on_pretrain_routine_end)

        return model

    def build_dataset(self, img_path, mode="train", batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        return build_yolo_dataset(
            self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs, multi_modal=mode == "train"
        )

    def preprocess_batch(self, batch):
        """Preprocesses a batch of images for YOLOWorld training, adjusting formatting and dimensions as needed."""
        batch = super().preprocess_batch(batch)

        # NOTE: add text features
        texts = list(itertools.chain(*batch["texts"]))
        text_token = clip.tokenize(texts).to(batch["img"].device)
        txt_feats = self.text_model.encode_text(text_token).to(dtype=batch["img"].dtype)  # torch.float32
        txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
        batch["txt_feats"] = txt_feats.reshape(len(batch["texts"]), -1, txt_feats.shape[-1])
        return batch
````
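For most users the new trainer is reached through the high-level YOLOWorld API rather than instantiated directly; a hedged sketch (weights and dataset names are examples):

```python
from ultralytics import YOLOWorld

model = YOLOWorld("yolov8s-world.pt")     # pretrained YOLO-World weights
model.train(data="coco8.yaml", epochs=3)  # fine-tunes on a closed-set dataset via WorldTrainer
```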
ultralytics/models/yolo/world/train_world.py
ADDED

````python
from ultralytics.data import build_yolo_dataset, build_grounding, YOLOConcatDataset
from ultralytics.data.utils import check_det_dataset
from ultralytics.models.yolo.world import WorldTrainer
from ultralytics.utils.torch_utils import de_parallel
from ultralytics.utils import DEFAULT_CFG


class WorldTrainerFromScratch(WorldTrainer):
    """
    A class extending the WorldTrainer class for training a world model from scratch on open-set dataset.

    Example:
        ```python
        from ultralytics.models.yolo.world.train_world import WorldTrainerFromScratch
        from ultralytics import YOLOWorld

        data = dict(
            train=dict(
                yolo_data=["Objects365.yaml"],
                grounding_data=[
                    dict(
                        img_path="../datasets/flickr30k/images",
                        json_file="../datasets/flickr30k/final_flickr_separateGT_train.json",
                    ),
                    dict(
                        img_path="../datasets/GQA/images",
                        json_file="../datasets/GQA/final_mixed_train_no_coco.json",
                    ),
                ],
            ),
            val=dict(yolo_data=["lvis.yaml"]),
        )

        model = YOLOWorld("yolov8s-worldv2.yaml")
        model.train(data=data, trainer=WorldTrainerFromScratch)
        ```
    """

    def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
        """Initialize a WorldTrainer object with given arguments."""
        if overrides is None:
            overrides = {}
        super().__init__(cfg, overrides, _callbacks)

    def build_dataset(self, img_path, mode="train", batch=None):
        """
        Build YOLO Dataset.

        Args:
            img_path (List[str] | str): Path to the folder containing images.
            mode (str): `train` mode or `val` mode, users are able to customize different augmentations for each mode.
            batch (int, optional): Size of batches, this is for `rect`. Defaults to None.
        """
        gs = max(int(de_parallel(self.model).stride.max() if self.model else 0), 32)
        if mode == "train":
            dataset = [
                build_yolo_dataset(self.args, im_path, batch, self.data, stride=gs, multi_modal=True)
                if isinstance(im_path, str)
                else build_grounding(self.args, im_path["img_path"], im_path["json_file"], batch, stride=gs)
                for im_path in img_path
            ]
            return YOLOConcatDataset(dataset) if len(dataset) > 1 else dataset[0]
        else:
            return build_yolo_dataset(self.args, img_path, batch, self.data, mode=mode, rect=mode == "val", stride=gs)

    def get_dataset(self):
        """
        Get train, val path from data dict if it exists.

        Returns None if data format is not recognized.
        """
        final_data = dict()
        data_yaml = self.args.data
        assert data_yaml.get("train", False)  # object365.yaml
        assert data_yaml.get("val", False)  # lvis.yaml
        data = {k: [check_det_dataset(d) for d in v.get("yolo_data", [])] for k, v in data_yaml.items()}
        assert len(data["val"]) == 1, f"Only support validating on 1 dataset for now, but got {len(data['val'])}."
        val_split = "minival" if "lvis" in data["val"][0]["val"] else "val"
        for d in data["val"]:
            if d.get("minival") is None:  # for lvis dataset
                continue
            d["minival"] = str(d["path"] / d["minival"])
        for s in ["train", "val"]:
            final_data[s] = [d["train" if s == "train" else val_split] for d in data[s]]
            # save grounding data if there's one
            grounding_data = data_yaml[s].get("grounding_data")
            if grounding_data is None:
                continue
            grounding_data = [grounding_data] if not isinstance(grounding_data, list) else grounding_data
            for g in grounding_data:
                assert isinstance(g, dict), f"Grounding data should be provided in dict format, but got {type(g)}"
            final_data[s] += grounding_data
        # NOTE: to make training work properly, set `nc` and `names`
        final_data["nc"] = data["val"][0]["nc"]
        final_data["names"] = data["val"][0]["names"]
        self.data = final_data
        return final_data["train"], final_data["val"][0]

    def plot_training_labels(self):
        """DO NOT plot labels."""
        pass

    def final_eval(self):
        """Performs final evaluation and validation for object detection YOLO-World model."""
        val = self.args.data["val"]["yolo_data"][0]
        self.validator.args.data = val
        self.validator.args.split = "minival" if isinstance(val, str) and "lvis" in val else "val"
        return super().final_eval()
````
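WorldTrainerFromScratch builds one dataset per source (YOLO detection data plus grounding data) and concatenates them for training. A minimal sketch of that concatenation idea, using torch's generic ConcatDataset as a stand-in for YOLOConcatDataset:

```python
from torch.utils.data import ConcatDataset, Dataset


class ToySet(Dataset):
    """Tiny stand-in for a per-source detection or grounding dataset."""

    def __init__(self, n):
        self.n = n

    def __len__(self):
        return self.n

    def __getitem__(self, i):
        return i


sources = [ToySet(3), ToySet(5)]  # e.g. one YOLO dataset plus one grounding dataset
train_set = ConcatDataset(sources) if len(sources) > 1 else sources[0]
print(len(train_set))  # 8 samples drawn from both sources
```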
ultralytics/nn/autobackend.py
CHANGED
```diff
@@ -543,7 +543,7 @@ class AutoBackend(nn.Module):
                 if integer:
                     scale, zero_point = output["quantization"]
                     x = (x.astype(np.float32) - zero_point) * scale  # re-scale
-                    if x.ndim
+                    if x.ndim == 3:  # if task is not classification, excluding masks (ndim=4) as well
                         # Denormalize xywh by image size. See https://github.com/ultralytics/ultralytics/pull/1695
                         # xywh are normalized in TFLite/EdgeTPU to mitigate quantization error of integer models
                         x[:, [0, 2]] *= w
```
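The added ndim check lets 2-D classification outputs pass through while 3-D detection outputs are still dequantized and rescaled. The dequantization follows the standard TFLite affine rule shown in the hunk; a toy numpy illustration (scale, zero point, and values are made up):

```python
import numpy as np

# Fake int8 detection output shaped (batch, channels, anchors); rows 0-3 hold normalized xywh.
x = np.array([[[12], [25], [40], [60], [117], [3]]], dtype=np.int8)
scale, zero_point = 0.005, 4  # illustrative values from output["quantization"]
w = h = 640                   # inference image size

x = (x.astype(np.float32) - zero_point) * scale  # re-scale integer output to float
if x.ndim == 3:                                  # detection output; classification (ndim=2) is left as-is
    x[:, [0, 2]] *= w                            # denormalize x-center and width
    x[:, [1, 3]] *= h                            # denormalize y-center and height
```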
ultralytics/nn/modules/block.py
CHANGED
```diff
@@ -519,7 +519,8 @@ class ContrastiveHead(nn.Module):
     def __init__(self):
         """Initializes ContrastiveHead with specified region-text similarity parameters."""
         super().__init__()
-
+        # NOTE: use -10.0 to keep the init cls loss consistency with other losses
+        self.bias = nn.Parameter(torch.tensor([-10.0]))
         self.logit_scale = nn.Parameter(torch.ones([]) * torch.tensor(1 / 0.07).log())

     def forward(self, x, w):
@@ -542,7 +543,8 @@ class BNContrastiveHead(nn.Module):
         """Initialize ContrastiveHead with region-text similarity parameters."""
         super().__init__()
         self.norm = nn.BatchNorm2d(embed_dims)
-
+        # NOTE: use -10.0 to keep the init cls loss consistency with other losses
+        self.bias = nn.Parameter(torch.tensor([-10.0]))
         # use -1.0 is more stable
         self.logit_scale = nn.Parameter(-1.0 * torch.ones([]))

```
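Starting the contrastive-head bias at -10.0 keeps the initial region-text scores near zero after the sigmoid, which is what the NOTE about keeping the initial cls loss consistent with the other losses refers to. A quick check of the effect:

```python
import torch

bias = torch.tensor([-10.0])
print(torch.sigmoid(bias))  # tensor([4.5398e-05]) -> essentially zero initial class scores
```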
ultralytics/nn/modules/head.py
CHANGED
```diff
@@ -250,6 +250,15 @@ class WorldDetect(Detect):
         y = torch.cat((dbox, cls.sigmoid()), 1)
         return y if self.export else (y, x)

+    def bias_init(self):
+        """Initialize Detect() biases, WARNING: requires stride availability."""
+        m = self  # self.model[-1]  # Detect() module
+        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
+        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
+        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
+            a[-1].bias.data[:] = 1.0  # box
+            # b[-1].bias.data[:] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
+

 class RTDETRDecoder(nn.Module):
     """
```
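The new WorldDetect.bias_init() only touches the box branch, setting the last conv bias of each cv2 level to 1.0 (the class-bias formula stays commented out). A tiny sketch of the in-place assignment on a toy sequential branch (the layer layout is illustrative):

```python
import torch.nn as nn

branch = nn.Sequential(nn.Conv2d(8, 8, 3, padding=1), nn.Conv2d(8, 4, 1))  # toy per-level box branch
branch[-1].bias.data[:] = 1.0  # same in-place pattern as a[-1].bias.data[:] = 1.0 above
print(branch[-1].bias)         # Parameter of shape (4,), all ones
```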
ultralytics/nn/tasks.py
CHANGED
```diff
@@ -564,28 +564,28 @@ class WorldModel(DetectionModel):
         self.clip_model = None  # CLIP model placeholder
         super().__init__(cfg=cfg, ch=ch, nc=nc, verbose=verbose)

-    def set_classes(self, text):
-        """
+    def set_classes(self, text, batch=80, cache_clip_model=True):
+        """Set classes in advance so that model could do offline-inference without clip model."""
         try:
             import clip
         except ImportError:
-            check_requirements("git+https://github.com/
+            check_requirements("git+https://github.com/ultralytics/CLIP.git")
             import clip

-        if
+        if (
+            not getattr(self, "clip_model", None) and cache_clip_model
+        ):  # for backwards compatibility of models lacking clip_model attribute
             self.clip_model = clip.load("ViT-B/32")[0]
-
+        model = self.clip_model if cache_clip_model else clip.load("ViT-B/32")[0]
+        device = next(model.parameters()).device
         text_token = clip.tokenize(text).to(device)
-        txt_feats =
+        txt_feats = [model.encode_text(token).detach() for token in text_token.split(batch)]
+        txt_feats = txt_feats[0] if len(txt_feats) == 1 else torch.cat(txt_feats, dim=0)
         txt_feats = txt_feats / txt_feats.norm(p=2, dim=-1, keepdim=True)
-        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
+        self.txt_feats = txt_feats.reshape(-1, len(text), txt_feats.shape[-1])
         self.model[-1].nc = len(text)

-    def
-        """Initialize the loss criterion for the model."""
-        raise NotImplementedError
-
-    def predict(self, x, profile=False, visualize=False, augment=False, embed=None):
+    def predict(self, x, profile=False, visualize=False, txt_feats=None, augment=False, embed=None):
         """
         Perform a forward pass through the model.

```
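set_classes() now encodes prompts in batches and can skip caching the CLIP text encoder, but the user-facing flow of fixing an open vocabulary before prediction is unchanged; a hedged usage sketch (weights and image are examples):

```python
from ultralytics import YOLOWorld

model = YOLOWorld("yolov8s-world.pt")
model.set_classes(["person", "bus", "backpack"])  # encodes the prompts with CLIP and stores text features
results = model.predict("https://ultralytics.com/images/bus.jpg")
```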
```diff
@@ -593,13 +593,14 @@ class WorldModel(DetectionModel):
             x (torch.Tensor): The input tensor.
             profile (bool, optional): If True, profile the computation time for each layer. Defaults to False.
             visualize (bool, optional): If True, save feature maps for visualization. Defaults to False.
+            txt_feats (torch.Tensor): The text features, use it if it's given. Defaults to None.
             augment (bool, optional): If True, perform data augmentation during inference. Defaults to False.
             embed (list, optional): A list of feature vectors/embeddings to return.

         Returns:
             (torch.Tensor): Model's output tensor.
         """
-        txt_feats = self.txt_feats.to(device=x.device, dtype=x.dtype)
+        txt_feats = (self.txt_feats if txt_feats is None else txt_feats).to(device=x.device, dtype=x.dtype)
         if len(txt_feats) != len(x):
             txt_feats = txt_feats.repeat(len(x), 1, 1)
         ori_txt_feats = txt_feats.clone()
@@ -627,6 +628,21 @@ class WorldModel(DetectionModel):
                 return torch.unbind(torch.cat(embeddings, 1), dim=0)
         return x

+    def loss(self, batch, preds=None):
+        """
+        Compute loss.
+
+        Args:
+            batch (dict): Batch to compute loss on.
+            preds (torch.Tensor | List[torch.Tensor]): Predictions.
+        """
+        if not hasattr(self, "criterion"):
+            self.criterion = self.init_criterion()
+
+        if preds is None:
+            preds = self.forward(batch["img"], txt_feats=batch["txt_feats"])
+        return self.criterion(preds, batch)
+

 class Ensemble(nn.ModuleList):
     """Ensemble of models."""
```