PyPI - dgenerate-ultralytics-headless - Versions diffs - 8.3.137__py3-none-any.whl → 8.3.224__py3-none-any.whl - Mend

dgenerate-ultralytics-headless 8.3.137-py3-none-any.whl → 8.3.224-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (215) hide show

{dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/METADATA +41 -34
dgenerate_ultralytics_headless-8.3.224.dist-info/RECORD +285 -0
{dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/WHEEL +1 -1
tests/__init__.py +7 -6
tests/conftest.py +15 -39
tests/test_cli.py +17 -17
tests/test_cuda.py +17 -8
tests/test_engine.py +36 -10
tests/test_exports.py +98 -37
tests/test_integrations.py +12 -15
tests/test_python.py +126 -82
tests/test_solutions.py +319 -135
ultralytics/__init__.py +27 -9
ultralytics/cfg/__init__.py +83 -87
ultralytics/cfg/datasets/Argoverse.yaml +4 -4
ultralytics/cfg/datasets/DOTAv1.5.yaml +2 -2
ultralytics/cfg/datasets/DOTAv1.yaml +2 -2
ultralytics/cfg/datasets/GlobalWheat2020.yaml +2 -2
ultralytics/cfg/datasets/HomeObjects-3K.yaml +4 -5
ultralytics/cfg/datasets/ImageNet.yaml +3 -3
ultralytics/cfg/datasets/Objects365.yaml +24 -20
ultralytics/cfg/datasets/SKU-110K.yaml +9 -9
ultralytics/cfg/datasets/VOC.yaml +10 -13
ultralytics/cfg/datasets/VisDrone.yaml +43 -33
ultralytics/cfg/datasets/african-wildlife.yaml +5 -5
ultralytics/cfg/datasets/brain-tumor.yaml +4 -5
ultralytics/cfg/datasets/carparts-seg.yaml +5 -5
ultralytics/cfg/datasets/coco-pose.yaml +26 -4
ultralytics/cfg/datasets/coco.yaml +4 -4
ultralytics/cfg/datasets/coco128-seg.yaml +2 -2
ultralytics/cfg/datasets/coco128.yaml +2 -2
ultralytics/cfg/datasets/coco8-grayscale.yaml +103 -0
ultralytics/cfg/datasets/coco8-multispectral.yaml +2 -2
ultralytics/cfg/datasets/coco8-pose.yaml +23 -2
ultralytics/cfg/datasets/coco8-seg.yaml +2 -2
ultralytics/cfg/datasets/coco8.yaml +2 -2
ultralytics/cfg/datasets/construction-ppe.yaml +32 -0
ultralytics/cfg/datasets/crack-seg.yaml +5 -5
ultralytics/cfg/datasets/dog-pose.yaml +32 -4
ultralytics/cfg/datasets/dota8-multispectral.yaml +2 -2
ultralytics/cfg/datasets/dota8.yaml +2 -2
ultralytics/cfg/datasets/hand-keypoints.yaml +29 -4
ultralytics/cfg/datasets/lvis.yaml +9 -9
ultralytics/cfg/datasets/medical-pills.yaml +4 -5
ultralytics/cfg/datasets/open-images-v7.yaml +7 -10
ultralytics/cfg/datasets/package-seg.yaml +5 -5
ultralytics/cfg/datasets/signature.yaml +4 -4
ultralytics/cfg/datasets/tiger-pose.yaml +20 -4
ultralytics/cfg/datasets/xView.yaml +5 -5
ultralytics/cfg/default.yaml +96 -93
ultralytics/cfg/trackers/botsort.yaml +16 -17
ultralytics/cfg/trackers/bytetrack.yaml +9 -11
ultralytics/data/__init__.py +4 -4
ultralytics/data/annotator.py +12 -12
ultralytics/data/augment.py +531 -564
ultralytics/data/base.py +76 -81
ultralytics/data/build.py +206 -42
ultralytics/data/converter.py +179 -78
ultralytics/data/dataset.py +121 -121
ultralytics/data/loaders.py +114 -91
ultralytics/data/split.py +28 -15
ultralytics/data/split_dota.py +67 -48
ultralytics/data/utils.py +110 -89
ultralytics/engine/exporter.py +422 -460
ultralytics/engine/model.py +224 -252
ultralytics/engine/predictor.py +94 -89
ultralytics/engine/results.py +345 -595
ultralytics/engine/trainer.py +231 -134
ultralytics/engine/tuner.py +279 -73
ultralytics/engine/validator.py +53 -46
ultralytics/hub/__init__.py +26 -28
ultralytics/hub/auth.py +30 -16
ultralytics/hub/google/__init__.py +34 -36
ultralytics/hub/session.py +53 -77
ultralytics/hub/utils.py +23 -109
ultralytics/models/__init__.py +1 -1
ultralytics/models/fastsam/__init__.py +1 -1
ultralytics/models/fastsam/model.py +36 -18
ultralytics/models/fastsam/predict.py +33 -44
ultralytics/models/fastsam/utils.py +4 -5
ultralytics/models/fastsam/val.py +12 -14
ultralytics/models/nas/__init__.py +1 -1
ultralytics/models/nas/model.py +16 -20
ultralytics/models/nas/predict.py +12 -14
ultralytics/models/nas/val.py +4 -5
ultralytics/models/rtdetr/__init__.py +1 -1
ultralytics/models/rtdetr/model.py +9 -9
ultralytics/models/rtdetr/predict.py +22 -17
ultralytics/models/rtdetr/train.py +20 -16
ultralytics/models/rtdetr/val.py +79 -59
ultralytics/models/sam/__init__.py +8 -2
ultralytics/models/sam/amg.py +53 -38
ultralytics/models/sam/build.py +29 -31
ultralytics/models/sam/model.py +33 -38
ultralytics/models/sam/modules/blocks.py +159 -182
ultralytics/models/sam/modules/decoders.py +38 -47
ultralytics/models/sam/modules/encoders.py +114 -133
ultralytics/models/sam/modules/memory_attention.py +38 -31
ultralytics/models/sam/modules/sam.py +114 -93
ultralytics/models/sam/modules/tiny_encoder.py +268 -291
ultralytics/models/sam/modules/transformer.py +59 -66
ultralytics/models/sam/modules/utils.py +55 -72
ultralytics/models/sam/predict.py +745 -341
ultralytics/models/utils/loss.py +118 -107
ultralytics/models/utils/ops.py +118 -71
ultralytics/models/yolo/__init__.py +1 -1
ultralytics/models/yolo/classify/predict.py +28 -26
ultralytics/models/yolo/classify/train.py +50 -81
ultralytics/models/yolo/classify/val.py +68 -61
ultralytics/models/yolo/detect/predict.py +12 -15
ultralytics/models/yolo/detect/train.py +56 -46
ultralytics/models/yolo/detect/val.py +279 -223
ultralytics/models/yolo/model.py +167 -86
ultralytics/models/yolo/obb/predict.py +7 -11
ultralytics/models/yolo/obb/train.py +23 -25
ultralytics/models/yolo/obb/val.py +107 -99
ultralytics/models/yolo/pose/__init__.py +1 -1
ultralytics/models/yolo/pose/predict.py +12 -14
ultralytics/models/yolo/pose/train.py +31 -69
ultralytics/models/yolo/pose/val.py +119 -254
ultralytics/models/yolo/segment/predict.py +21 -25
ultralytics/models/yolo/segment/train.py +12 -66
ultralytics/models/yolo/segment/val.py +126 -305
ultralytics/models/yolo/world/train.py +53 -45
ultralytics/models/yolo/world/train_world.py +51 -32
ultralytics/models/yolo/yoloe/__init__.py +7 -7
ultralytics/models/yolo/yoloe/predict.py +30 -37
ultralytics/models/yolo/yoloe/train.py +89 -71
ultralytics/models/yolo/yoloe/train_seg.py +15 -17
ultralytics/models/yolo/yoloe/val.py +56 -41
ultralytics/nn/__init__.py +9 -11
ultralytics/nn/autobackend.py +179 -107
ultralytics/nn/modules/__init__.py +67 -67
ultralytics/nn/modules/activation.py +8 -7
ultralytics/nn/modules/block.py +302 -323
ultralytics/nn/modules/conv.py +61 -104
ultralytics/nn/modules/head.py +488 -186
ultralytics/nn/modules/transformer.py +183 -123
ultralytics/nn/modules/utils.py +15 -20
ultralytics/nn/tasks.py +327 -203
ultralytics/nn/text_model.py +81 -65
ultralytics/py.typed +1 -0
ultralytics/solutions/__init__.py +12 -12
ultralytics/solutions/ai_gym.py +19 -27
ultralytics/solutions/analytics.py +36 -26
ultralytics/solutions/config.py +29 -28
ultralytics/solutions/distance_calculation.py +23 -24
ultralytics/solutions/heatmap.py +17 -19
ultralytics/solutions/instance_segmentation.py +21 -19
ultralytics/solutions/object_blurrer.py +16 -17
ultralytics/solutions/object_counter.py +48 -53
ultralytics/solutions/object_cropper.py +22 -16
ultralytics/solutions/parking_management.py +61 -58
ultralytics/solutions/queue_management.py +19 -19
ultralytics/solutions/region_counter.py +63 -50
ultralytics/solutions/security_alarm.py +22 -25
ultralytics/solutions/similarity_search.py +107 -60
ultralytics/solutions/solutions.py +343 -262
ultralytics/solutions/speed_estimation.py +35 -31
ultralytics/solutions/streamlit_inference.py +104 -40
ultralytics/solutions/templates/similarity-search.html +31 -24
ultralytics/solutions/trackzone.py +24 -24
ultralytics/solutions/vision_eye.py +11 -12
ultralytics/trackers/__init__.py +1 -1
ultralytics/trackers/basetrack.py +18 -27
ultralytics/trackers/bot_sort.py +48 -39
ultralytics/trackers/byte_tracker.py +94 -94
ultralytics/trackers/track.py +7 -16
ultralytics/trackers/utils/gmc.py +37 -69
ultralytics/trackers/utils/kalman_filter.py +68 -76
ultralytics/trackers/utils/matching.py +13 -17
ultralytics/utils/__init__.py +251 -275
ultralytics/utils/autobatch.py +19 -7
ultralytics/utils/autodevice.py +68 -38
ultralytics/utils/benchmarks.py +169 -130
ultralytics/utils/callbacks/base.py +12 -13
ultralytics/utils/callbacks/clearml.py +14 -15
ultralytics/utils/callbacks/comet.py +139 -66
ultralytics/utils/callbacks/dvc.py +19 -27
ultralytics/utils/callbacks/hub.py +8 -6
ultralytics/utils/callbacks/mlflow.py +6 -10
ultralytics/utils/callbacks/neptune.py +11 -19
ultralytics/utils/callbacks/platform.py +73 -0
ultralytics/utils/callbacks/raytune.py +3 -4
ultralytics/utils/callbacks/tensorboard.py +9 -12
ultralytics/utils/callbacks/wb.py +33 -30
ultralytics/utils/checks.py +163 -114
ultralytics/utils/cpu.py +89 -0
ultralytics/utils/dist.py +24 -20
ultralytics/utils/downloads.py +176 -146
ultralytics/utils/errors.py +11 -13
ultralytics/utils/events.py +113 -0
ultralytics/utils/export/__init__.py +7 -0
ultralytics/utils/{export.py → export/engine.py} +81 -63
ultralytics/utils/export/imx.py +294 -0
ultralytics/utils/export/tensorflow.py +217 -0
ultralytics/utils/files.py +33 -36
ultralytics/utils/git.py +137 -0
ultralytics/utils/instance.py +105 -120
ultralytics/utils/logger.py +404 -0
ultralytics/utils/loss.py +99 -61
ultralytics/utils/metrics.py +649 -478
ultralytics/utils/nms.py +337 -0
ultralytics/utils/ops.py +263 -451
ultralytics/utils/patches.py +70 -31
ultralytics/utils/plotting.py +253 -223
ultralytics/utils/tal.py +48 -61
ultralytics/utils/torch_utils.py +244 -251
ultralytics/utils/tqdm.py +438 -0
ultralytics/utils/triton.py +22 -23
ultralytics/utils/tuner.py +11 -10
dgenerate_ultralytics_headless-8.3.137.dist-info/RECORD +0 -272
{dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/entry_points.txt +0 -0
{dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/licenses/LICENSE +0 -0
{dgenerate_ultralytics_headless-8.3.137.dist-info → dgenerate_ultralytics_headless-8.3.224.dist-info}/top_level.txt +0 -0

ultralytics/engine/trainer.py CHANGED Viewed

@@ -6,6 +6,8 @@ Usage:
     $ yolo mode=train model=yolo11n.pt data=coco8.yaml imgsz=640 epochs=100 batch=16
 """
+from __future__ import annotations
 import gc
 import math
 import os
@@ -24,9 +26,10 @@ from torch import nn, optim
 from ultralytics import __version__
 from ultralytics.cfg import get_cfg, get_save_dir
 from ultralytics.data.utils import check_cls_dataset, check_det_dataset
-from ultralytics.nn.tasks import attempt_load_one_weight, attempt_load_weights
+from ultralytics.nn.tasks import load_checkpoint
 from ultralytics.utils import (
     DEFAULT_CFG,
+    GIT,
     LOCAL_RANK,
     LOGGER,
     RANK,
@@ -41,10 +44,12 @@ from ultralytics.utils.autobatch import check_train_batch_size
 from ultralytics.utils.checks import check_amp, check_file, check_imgsz, check_model_file_from_stem, print_args
 from ultralytics.utils.dist import ddp_cleanup, generate_ddp_command
 from ultralytics.utils.files import get_latest_run
+from ultralytics.utils.plotting import plot_results
 from ultralytics.utils.torch_utils import (
     TORCH_2_4,
     EarlyStopping,
     ModelEMA,
+    attempt_compile,
     autocast,
     convert_optimizer_state_dict_to_fp16,
     init_seeds,
@@ -53,12 +58,15 @@ from ultralytics.utils.torch_utils import (
     strip_optimizer,
     torch_distributed_zero_first,
     unset_deterministic,
+    unwrap_model,
 )
 class BaseTrainer:
-    """
-    A base class for creating trainers.
+    """A base class for creating trainers.
+    This class provides the foundation for training YOLO models, handling the training loop, validation, checkpointing,
+    and various training utilities. It supports both single-GPU and multi-GPU distributed training.
     Attributes:
         args (SimpleNamespace): Configuration for the trainer.
@@ -89,21 +97,34 @@ class BaseTrainer:
         csv (Path): Path to results CSV file.
         metrics (dict): Dictionary of metrics.
         plots (dict): Dictionary of plots.
+    Methods:
+        train: Execute the training process.
+        validate: Run validation on the test set.
+        save_model: Save model training checkpoints.
+        get_dataset: Get train and validation datasets.
+        setup_model: Load, create, or download model.
+        build_optimizer: Construct an optimizer for the model.
+    Examples:
+        Initialize a trainer and start training
+        >>> trainer = BaseTrainer(cfg="config.yaml")
+        >>> trainer.train()
     """
     def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None):
-        """
-        Initialize the BaseTrainer class.
+        """Initialize the BaseTrainer class.
         Args:
-            cfg (str, optional): Path to a configuration file. Defaults to DEFAULT_CFG.
-            overrides (dict, optional): Configuration overrides. Defaults to None.
-            _callbacks (list, optional): List of callback functions. Defaults to None.
+            cfg (str, optional): Path to a configuration file.
+            overrides (dict, optional): Configuration overrides.
+            _callbacks (list, optional): List of callback functions.
         """
+        self.hub_session = overrides.pop("session", None)  # HUB
         self.args = get_cfg(cfg, overrides)
         self.check_resume(overrides)
-        self.device = select_device(self.args.device, self.args.batch)
-        # update "-1" devices so post-training val does not repeat search
+        self.device = select_device(self.args.device)
+        # Update "-1" devices so post-training val does not repeat search
         self.args.device = os.getenv("CUDA_VISIBLE_DEVICES") if "cuda" in str(self.device) else str(self.device)
         self.validator = None
         self.metrics = None
@@ -149,15 +170,32 @@ class BaseTrainer:
         self.tloss = None
         self.loss_names = ["Loss"]
         self.csv = self.save_dir / "results.csv"
+        if self.csv.exists() and not self.args.resume:
+            self.csv.unlink()
         self.plot_idx = [0, 1, 2]
-        # HUB
-        self.hub_session = None
+        self.nan_recovery_attempts = 0
         # Callbacks
         self.callbacks = _callbacks or callbacks.get_default_callbacks()
-        if RANK in {-1, 0}:
+        if isinstance(self.args.device, str) and len(self.args.device):  # i.e. device='0' or device='0,1,2,3'
+            world_size = len(self.args.device.split(","))
+        elif isinstance(self.args.device, (tuple, list)):  # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
+            world_size = len(self.args.device)
+        elif self.args.device in {"cpu", "mps"}:  # i.e. device='cpu' or 'mps'
+            world_size = 0
+        elif torch.cuda.is_available():  # i.e. device=None or device='' or device=number
+            world_size = 1  # default to device 0
+        else:  # i.e. device=None or device=''
+            world_size = 0
+        self.ddp = world_size > 1 and "LOCAL_RANK" not in os.environ
+        self.world_size = world_size
+        # Run subprocess if DDP training, else train normally
+        if RANK in {-1, 0} and not self.ddp:
             callbacks.add_integration_callbacks(self)
+            # Start console logging immediately at trainer initialization
+            self.run_callbacks("on_pretrain_routine_start")
     def add_callback(self, event: str, callback):
         """Append the given callback to the event's callback list."""
@@ -174,31 +212,20 @@ class BaseTrainer:
     def train(self):
         """Allow device='', device=None on Multi-GPU systems to default to device=0."""
-        if isinstance(self.args.device, str) and len(self.args.device):  # i.e. device='0' or device='0,1,2,3'
-            world_size = len(self.args.device.split(","))
-        elif isinstance(self.args.device, (tuple, list)):  # i.e. device=[0, 1, 2, 3] (multi-GPU from CLI is list)
-            world_size = len(self.args.device)
-        elif self.args.device in {"cpu", "mps"}:  # i.e. device='cpu' or 'mps'
-            world_size = 0
-        elif torch.cuda.is_available():  # i.e. device=None or device='' or device=number
-            world_size = 1  # default to device 0
-        else:  # i.e. device=None or device=''
-            world_size = 0
         # Run subprocess if DDP training, else train normally
-        if world_size > 1 and "LOCAL_RANK" not in os.environ:
+        if self.ddp:
             # Argument checks
             if self.args.rect:
                 LOGGER.warning("'rect=True' is incompatible with Multi-GPU training, setting 'rect=False'")
                 self.args.rect = False
             if self.args.batch < 1.0:
-                LOGGER.warning(
-                    "'batch<1' for AutoBatch is incompatible with Multi-GPU training, setting default 'batch=16'"
+                raise ValueError(
+                    "AutoBatch with batch<1 not supported for Multi-GPU training, "
+                    f"please specify a valid batch size multiple of GPU count {self.world_size}, i.e. batch={self.world_size * 8}."
                 )
-                self.args.batch = 16
             # Command
-            cmd, file = generate_ddp_command(world_size, self)
+            cmd, file = generate_ddp_command(self)
             try:
                 LOGGER.info(f"{colorstr('DDP:')} debug command {' '.join(cmd)}")
                 subprocess.run(cmd, check=True)
@@ -208,7 +235,7 @@ class BaseTrainer:
                 ddp_cleanup(self, str(file))
         else:
-            self._do_train(world_size)
+            self._do_train()
     def _setup_scheduler(self):
         """Initialize training learning rate scheduler."""
@@ -218,27 +245,27 @@ class BaseTrainer:
             self.lf = lambda x: max(1 - x / self.epochs, 0) * (1.0 - self.args.lrf) + self.args.lrf  # linear
         self.scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=self.lf)
-    def _setup_ddp(self, world_size):
+    def _setup_ddp(self):
         """Initialize and set the DistributedDataParallel parameters for training."""
         torch.cuda.set_device(RANK)
         self.device = torch.device("cuda", RANK)
-        # LOGGER.info(f'DDP info: RANK {RANK}, WORLD_SIZE {world_size}, DEVICE {self.device}')
         os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"  # set to enforce timeout
         dist.init_process_group(
             backend="nccl" if dist.is_nccl_available() else "gloo",
             timeout=timedelta(seconds=10800),  # 3 hours
             rank=RANK,
-            world_size=world_size,
+            world_size=self.world_size,
         )
-    def _setup_train(self, world_size):
+    def _setup_train(self):
         """Build dataloaders and optimizer on correct rank process."""
-        # Model
-        self.run_callbacks("on_pretrain_routine_start")
         ckpt = self.setup_model()
         self.model = self.model.to(self.device)
         self.set_model_attributes()
+        # Compile model
+        self.model = attempt_compile(self.model, device=self.device, mode=self.args.compile)
         # Freeze layers
         freeze_list = (
             self.args.freeze
@@ -268,13 +295,13 @@ class BaseTrainer:
             callbacks_backup = callbacks.default_callbacks.copy()  # backup callbacks as check_amp() resets them
             self.amp = torch.tensor(check_amp(self.model), device=self.device)
             callbacks.default_callbacks = callbacks_backup  # restore callbacks
-        if RANK > -1 and world_size > 1:  # DDP
+        if RANK > -1 and self.world_size > 1:  # DDP
             dist.broadcast(self.amp.int(), src=0)  # broadcast from rank 0 to all other ranks; gloo errors with boolean
         self.amp = bool(self.amp)  # as boolean
         self.scaler = (
             torch.amp.GradScaler("cuda", enabled=self.amp) if TORCH_2_4 else torch.cuda.amp.GradScaler(enabled=self.amp)
         )
-        if world_size > 1:
+        if self.world_size > 1:
             self.model = nn.parallel.DistributedDataParallel(self.model, device_ids=[RANK], find_unused_parameters=True)
         # Check imgsz
@@ -287,22 +314,22 @@ class BaseTrainer:
             self.args.batch = self.batch_size = self.auto_batch()
         # Dataloaders
-        batch_size = self.batch_size // max(world_size, 1)
+        batch_size = self.batch_size // max(self.world_size, 1)
         self.train_loader = self.get_dataloader(
             self.data["train"], batch_size=batch_size, rank=LOCAL_RANK, mode="train"
         )
+        # Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
+        self.test_loader = self.get_dataloader(
+            self.data.get("val") or self.data.get("test"),
+            batch_size=batch_size if self.args.task == "obb" else batch_size * 2,
+            rank=LOCAL_RANK,
+            mode="val",
+        )
+        self.validator = self.get_validator()
+        self.ema = ModelEMA(self.model)
         if RANK in {-1, 0}:
-            # Note: When training DOTA dataset, double batch size could get OOM on images with >2000 objects.
-            self.test_loader = self.get_dataloader(
-                self.data.get("val") or self.data.get("test"),
-                batch_size=batch_size if self.args.task == "obb" else batch_size * 2,
-                rank=-1,
-                mode="val",
-            )
-            self.validator = self.get_validator()
             metric_keys = self.validator.metrics.keys + self.label_loss_items(prefix="val")
             self.metrics = dict(zip(metric_keys, [0] * len(metric_keys)))
-            self.ema = ModelEMA(self.model)
             if self.args.plots:
                 self.plot_training_labels()
@@ -325,11 +352,11 @@ class BaseTrainer:
         self.scheduler.last_epoch = self.start_epoch - 1  # do not move
         self.run_callbacks("on_pretrain_routine_end")
-    def _do_train(self, world_size=1):
+    def _do_train(self):
         """Train the model with the specified world size."""
-        if world_size > 1:
-            self._setup_ddp(world_size)
-        self._setup_train(world_size)
+        if self.world_size > 1:
+            self._setup_ddp()
+        self._setup_train()
         nb = len(self.train_loader)  # number of batches
         nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1  # warmup iterations
@@ -340,7 +367,7 @@ class BaseTrainer:
         self.run_callbacks("on_train_start")
         LOGGER.info(
             f"Image sizes {self.args.imgsz} train, {self.args.imgsz} val\n"
-            f"Using {self.train_loader.num_workers * (world_size or 1)} dataloader workers\n"
+            f"Using {self.train_loader.num_workers * (self.world_size or 1)} dataloader workers\n"
             f"Logging results to {colorstr('bold', self.save_dir)}\n"
             f"Starting training for " + (f"{self.args.time} hours..." if self.args.time else f"{self.epochs} epochs...")
         )
@@ -387,18 +414,19 @@ class BaseTrainer:
                 # Forward
                 with autocast(self.amp):
                     batch = self.preprocess_batch(batch)
-                    loss, self.loss_items = self.model(batch)
+                    if self.args.compile:
+                        # Decouple inference and loss calculations for improved compile performance
+                        preds = self.model(batch["img"])
+                        loss, self.loss_items = unwrap_model(self.model).loss(batch, preds)
+                    else:
+                        loss, self.loss_items = self.model(batch)
                     self.loss = loss.sum()
                     if RANK != -1:
-                        self.loss *= world_size
-                    self.tloss = (
-                        (self.tloss * i + self.loss_items) / (i + 1) if self.tloss is not None else self.loss_items
-                    )
+                        self.loss *= self.world_size
+                    self.tloss = self.loss_items if self.tloss is None else (self.tloss * i + self.loss_items) / (i + 1)
                 # Backward
                 self.scaler.scale(self.loss).backward()
-                # Optimize - https://pytorch.org/docs/master/notes/amp_examples.html
                 if ni - last_opt_step >= self.accumulate:
                     self.optimizer_step()
                     last_opt_step = ni
@@ -433,14 +461,23 @@ class BaseTrainer:
                 self.run_callbacks("on_train_batch_end")
             self.lr = {f"lr/pg{ir}": x["lr"] for ir, x in enumerate(self.optimizer.param_groups)}  # for loggers
             self.run_callbacks("on_train_epoch_end")
             if RANK in {-1, 0}:
                 final_epoch = epoch + 1 >= self.epochs
                 self.ema.update_attr(self.model, include=["yaml", "nc", "args", "names", "stride", "class_weights"])
-                # Validation
-                if self.args.val or final_epoch or self.stopper.possible_stop or self.stop:
-                    self.metrics, self.fitness = self.validate()
+            # Validation
+            if self.args.val or final_epoch or self.stopper.possible_stop or self.stop:
+                self._clear_memory(threshold=0.5)  # prevent VRAM spike
+                self.metrics, self.fitness = self.validate()
+            # NaN recovery
+            if self._handle_nan_recovery(epoch):
+                continue
+            self.nan_recovery_attempts = 0
+            if RANK in {-1, 0}:
                 self.save_metrics(metrics={**self.label_loss_items(self.tloss), **self.metrics, **self.lr})
                 self.stop |= self.stopper(epoch + 1, self.fitness) or final_epoch
                 if self.args.time:
@@ -462,8 +499,7 @@ class BaseTrainer:
                 self.scheduler.last_epoch = self.epoch  # do not move
                 self.stop |= epoch >= self.epochs  # stop if exceeded epochs
             self.run_callbacks("on_fit_epoch_end")
-            if self._get_memory(fraction=True) > 0.5:
-                self._clear_memory()  # clear if memory utilization > 50%
+            self._clear_memory(0.5)  # clear if memory utilization > 50%
             # Early Stopping
             if RANK != -1:  # if DDP training
@@ -474,11 +510,11 @@ class BaseTrainer:
                 break  # must break all DDP ranks
             epoch += 1
+        seconds = time.time() - self.train_time_start
+        LOGGER.info(f"\n{epoch - self.start_epoch + 1} epochs completed in {seconds / 3600:.3f} hours.")
+        # Do final val with best.pt
+        self.final_eval()
         if RANK in {-1, 0}:
-            # Do final val with best.pt
-            seconds = time.time() - self.train_time_start
-            LOGGER.info(f"\n{epoch - self.start_epoch + 1} epochs completed in {seconds / 3600:.3f} hours.")
-            self.final_eval()
             if self.args.plots:
                 self.plot_metrics()
             self.run_callbacks("on_train_end")
@@ -509,8 +545,12 @@ class BaseTrainer:
                 total = torch.cuda.get_device_properties(self.device).total_memory
         return ((memory / total) if total > 0 else 0) if fraction else (memory / 2**30)
-    def _clear_memory(self):
+    def _clear_memory(self, threshold: float | None = None):
         """Clear accelerator memory by calling garbage collector and emptying cache."""
+        if threshold:
+            assert 0 <= threshold <= 1, "Threshold must be between 0 and 1."
+            if self._get_memory(fraction=True) <= threshold:
+                return
         gc.collect()
         if self.device.type == "mps":
             torch.mps.empty_cache()
@@ -520,10 +560,13 @@ class BaseTrainer:
             torch.cuda.empty_cache()
     def read_results_csv(self):
-        """Read results.csv into a dictionary using pandas."""
-        import pandas as pd  # scope for faster 'import ultralytics'
+        """Read results.csv into a dictionary using polars."""
+        import polars as pl  # scope for faster 'import ultralytics'
-        return pd.read_csv(self.csv).to_dict(orient="list")
+        try:
+            return pl.read_csv(self.csv, infer_schema_length=None).to_dict(as_series=False)
+        except Exception:
+            return {}
     def _model_train(self):
         """Set model in training mode."""
@@ -544,14 +587,21 @@ class BaseTrainer:
                 "epoch": self.epoch,
                 "best_fitness": self.best_fitness,
                 "model": None,  # resume and final checkpoints derive from EMA
-                "ema": deepcopy(self.ema.ema).half(),
+                "ema": deepcopy(unwrap_model(self.ema.ema)).half(),
                 "updates": self.ema.updates,
                 "optimizer": convert_optimizer_state_dict_to_fp16(deepcopy(self.optimizer.state_dict())),
+                "scaler": self.scaler.state_dict(),
                 "train_args": vars(self.args),  # save as dict
                 "train_metrics": {**self.metrics, **{"fitness": self.fitness}},
                 "train_results": self.read_results_csv(),
                 "date": datetime.now().isoformat(),
                 "version": __version__,
+                "git": {
+                    "root": str(GIT.root),
+                    "branch": GIT.branch,
+                    "commit": GIT.commit,
+                    "origin": GIT.origin,
+                },
                 "license": "AGPL-3.0 (https://ultralytics.com/license)",
                 "docs": "https://docs.ultralytics.com",
             },
@@ -560,17 +610,15 @@ class BaseTrainer:
         serialized_ckpt = buffer.getvalue()  # get the serialized content to save
         # Save checkpoints
+        self.wdir.mkdir(parents=True, exist_ok=True)  # ensure weights directory exists
         self.last.write_bytes(serialized_ckpt)  # save last.pt
         if self.best_fitness == self.fitness:
             self.best.write_bytes(serialized_ckpt)  # save best.pt
         if (self.save_period > 0) and (self.epoch % self.save_period == 0):
             (self.wdir / f"epoch{self.epoch}.pt").write_bytes(serialized_ckpt)  # save epoch, i.e. 'epoch3.pt'
-        # if self.args.close_mosaic and self.epoch == (self.epochs - self.args.close_mosaic - 1):
-        #    (self.wdir / "last_mosaic.pt").write_bytes(serialized_ckpt)  # save mosaic checkpoint
     def get_dataset(self):
-        """
-        Get train and validation datasets from data dictionary.
+        """Get train and validation datasets from data dictionary.
         Returns:
             (dict): A dictionary containing the training/validation/test dataset and category names.
@@ -578,7 +626,16 @@ class BaseTrainer:
         try:
             if self.args.task == "classify":
                 data = check_cls_dataset(self.args.data)
-            elif self.args.data.split(".")[-1] in {"yaml", "yml"} or self.args.task in {
+            elif self.args.data.rsplit(".", 1)[-1] == "ndjson":
+                # Convert NDJSON to YOLO format
+                import asyncio
+                from ultralytics.data.converter import convert_ndjson_to_yolo
+                yaml_path = asyncio.run(convert_ndjson_to_yolo(self.args.data))
+                self.args.data = str(yaml_path)
+                data = check_det_dataset(self.args.data)
+            elif self.args.data.rsplit(".", 1)[-1] in {"yaml", "yml"} or self.args.task in {
                 "detect",
                 "segment",
                 "pose",
@@ -596,8 +653,7 @@ class BaseTrainer:
         return data
     def setup_model(self):
-        """
-        Load, create, or download model for any task.
+        """Load, create, or download model for any task.
         Returns:
             (dict): Optional checkpoint to resume training from.
@@ -608,17 +664,17 @@ class BaseTrainer:
         cfg, weights = self.model, None
         ckpt = None
         if str(self.model).endswith(".pt"):
-            weights, ckpt = attempt_load_one_weight(self.model)
+            weights, ckpt = load_checkpoint(self.model)
             cfg = weights.yaml
         elif isinstance(self.args.pretrained, (str, Path)):
-            weights, _ = attempt_load_one_weight(self.args.pretrained)
+            weights, _ = load_checkpoint(self.args.pretrained)
         self.model = self.get_model(cfg=cfg, weights=weights, verbose=RANK == -1)  # calls Model(cfg, weights)
         return ckpt
     def optimizer_step(self):
         """Perform a single step of the training optimizer with gradient clipping and EMA update."""
         self.scaler.unscale_(self.optimizer)  # unscale gradients
-        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)  # clip gradients
+        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10.0)
         self.scaler.step(self.optimizer)
         self.scaler.update()
         self.optimizer.zero_grad()
@@ -626,17 +682,23 @@ class BaseTrainer:
             self.ema.update(self.model)
     def preprocess_batch(self, batch):
-        """Allows custom preprocessing model inputs and ground truths depending on task type."""
+        """Allow custom preprocessing model inputs and ground truths depending on task type."""
         return batch
     def validate(self):
-        """
-        Run validation on test set using self.validator.
+        """Run validation on val set using self.validator.
         Returns:
-            (tuple): A tuple containing metrics dictionary and fitness score.
+            metrics (dict): Dictionary of validation metrics.
+            fitness (float): Fitness score for the validation.
         """
+        if self.ema and self.world_size > 1:
+            # Sync EMA buffers from rank 0 to all ranks
+            for buffer in self.ema.ema.buffers():
+                dist.broadcast(buffer, src=0)
         metrics = self.validator(self)
+        if metrics is None:
+            return None, None
         fitness = metrics.pop("fitness", -self.loss.detach().cpu().numpy())  # use loss as fitness measure if not found
         if not self.best_fitness or self.best_fitness < fitness:
             self.best_fitness = fitness
@@ -647,11 +709,11 @@ class BaseTrainer:
         raise NotImplementedError("This task trainer doesn't support loading cfg files")
     def get_validator(self):
-        """Returns a NotImplementedError when the get_validator function is called."""
+        """Return a NotImplementedError when the get_validator function is called."""
         raise NotImplementedError("get_validator function not implemented in trainer")
     def get_dataloader(self, dataset_path, batch_size=16, rank=0, mode="train"):
-        """Returns dataloader derived from torch.data.Dataloader."""
+        """Return dataloader derived from torch.data.Dataloader."""
         raise NotImplementedError("get_dataloader function not implemented in trainer")
     def build_dataset(self, img_path, mode="train", batch=None):
@@ -659,10 +721,9 @@ class BaseTrainer:
         raise NotImplementedError("build_dataset function not implemented in trainer")
     def label_loss_items(self, loss_items=None, prefix="train"):
-        """
-        Returns a loss dict with labelled training loss items tensor.
+        """Return a loss dict with labeled training loss items tensor.
-        Note:
+        Notes:
             This is not needed for classification but necessary for segmentation & detection
         """
         return {"loss": loss_items} if loss_items is not None else ["loss"]
@@ -672,55 +733,57 @@ class BaseTrainer:
         self.model.names = self.data["names"]
     def build_targets(self, preds, targets):
-        """Builds target tensors for training YOLO model."""
+        """Build target tensors for training YOLO model."""
         pass
     def progress_string(self):
-        """Returns a string describing training progress."""
+        """Return a string describing training progress."""
         return ""
     # TODO: may need to put these following functions into callback
     def plot_training_samples(self, batch, ni):
-        """Plots training samples during YOLO training."""
+        """Plot training samples during YOLO training."""
         pass
     def plot_training_labels(self):
-        """Plots training labels for YOLO model."""
+        """Plot training labels for YOLO model."""
         pass
     def save_metrics(self, metrics):
         """Save training metrics to a CSV file."""
         keys, vals = list(metrics.keys()), list(metrics.values())
         n = len(metrics) + 2  # number of cols
-        s = "" if self.csv.exists() else (("%s," * n % tuple(["epoch", "time"] + keys)).rstrip(",") + "\n")  # header
         t = time.time() - self.train_time_start
+        self.csv.parent.mkdir(parents=True, exist_ok=True)  # ensure parent directory exists
+        s = "" if self.csv.exists() else (("%s," * n % tuple(["epoch", "time", *keys])).rstrip(",") + "\n")  # header
         with open(self.csv, "a", encoding="utf-8") as f:
-            f.write(s + ("%.6g," * n % tuple([self.epoch + 1, t] + vals)).rstrip(",") + "\n")
+            f.write(s + ("%.6g," * n % tuple([self.epoch + 1, t, *vals])).rstrip(",") + "\n")
     def plot_metrics(self):
-        """Plot and display metrics visually."""
-        pass
+        """Plot metrics from a CSV file."""
+        plot_results(file=self.csv, on_plot=self.on_plot)  # save results.png
     def on_plot(self, name, data=None):
-        """Registers plots (e.g. to be consumed in callbacks)."""
+        """Register plots (e.g. to be consumed in callbacks)."""
         path = Path(name)
         self.plots[path] = {"data": data, "timestamp": time.time()}
     def final_eval(self):
         """Perform final evaluation and validation for object detection YOLO model."""
-        ckpt = {}
-        for f in self.last, self.best:
-            if f.exists():
-                if f is self.last:
-                    ckpt = strip_optimizer(f)
-                elif f is self.best:
-                    k = "train_results"  # update best.pt train_metrics from last.pt
-                    strip_optimizer(f, updates={k: ckpt[k]} if k in ckpt else None)
-                    LOGGER.info(f"\nValidating {f}...")
-                    self.validator.args.plots = self.args.plots
-                    self.metrics = self.validator(model=f)
-                    self.metrics.pop("fitness", None)
-                    self.run_callbacks("on_fit_epoch_end")
+        model = self.best if self.best.exists() else None
+        with torch_distributed_zero_first(LOCAL_RANK):  # strip only on GPU 0; other GPUs should wait
+            if RANK in {-1, 0}:
+                ckpt = strip_optimizer(self.last) if self.last.exists() else {}
+                if model:
+                    # update best.pt train_metrics from last.pt
+                    strip_optimizer(self.best, updates={"train_results": ckpt.get("train_results")})
+        if model:
+            LOGGER.info(f"\nValidating {model}...")
+            self.validator.args.plots = self.args.plots
+            self.validator.args.compile = False  # disable final val compile as too slow
+            self.metrics = self.validator(model=model)
+            self.metrics.pop("fitness", None)
+            self.run_callbacks("on_fit_epoch_end")
     def check_resume(self, overrides):
         """Check if resume checkpoint exists and update arguments accordingly."""
@@ -731,7 +794,7 @@ class BaseTrainer:
                 last = Path(check_file(resume) if exists else get_latest_run())
                 # Check that resume data YAML exists, otherwise strip to force re-download of dataset
-                ckpt_args = attempt_load_weights(last).args
+                ckpt_args = load_checkpoint(last)[0].args
                 if not isinstance(ckpt_args["data"], dict) and not Path(ckpt_args["data"]).exists():
                     ckpt_args["data"] = self.args.data
@@ -754,18 +817,54 @@ class BaseTrainer:
                 ) from e
         self.resume = resume
+    def _load_checkpoint_state(self, ckpt):
+        """Load optimizer, scaler, EMA, and best_fitness from checkpoint."""
+        if ckpt.get("optimizer") is not None:
+            self.optimizer.load_state_dict(ckpt["optimizer"])
+        if ckpt.get("scaler") is not None:
+            self.scaler.load_state_dict(ckpt["scaler"])
+        if self.ema and ckpt.get("ema"):
+            self.ema = ModelEMA(self.model)  # validation with EMA creates inference tensors that can't be updated
+            self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict())
+            self.ema.updates = ckpt["updates"]
+        self.best_fitness = ckpt.get("best_fitness", 0.0)
+    def _handle_nan_recovery(self, epoch):
+        """Detect and recover from NaN/Inf loss and fitness collapse by loading last checkpoint."""
+        loss_nan = self.loss is not None and not self.loss.isfinite()
+        fitness_nan = self.fitness is not None and not np.isfinite(self.fitness)
+        fitness_collapse = self.best_fitness and self.best_fitness > 0 and self.fitness == 0
+        corrupted = RANK in {-1, 0} and loss_nan and (fitness_nan or fitness_collapse)
+        reason = "Loss NaN/Inf" if loss_nan else "Fitness NaN/Inf" if fitness_nan else "Fitness collapse"
+        if RANK != -1:  # DDP: broadcast to all ranks
+            broadcast_list = [corrupted if RANK == 0 else None]
+            dist.broadcast_object_list(broadcast_list, 0)
+            corrupted = broadcast_list[0]
+        if not corrupted:
+            return False
+        if epoch == self.start_epoch or not self.last.exists():
+            LOGGER.warning(f"{reason} detected but can not recover from last.pt...")
+            return False  # Cannot recover on first epoch, let training continue
+        self.nan_recovery_attempts += 1
+        if self.nan_recovery_attempts > 3:
+            raise RuntimeError(f"Training failed: NaN persisted for {self.nan_recovery_attempts} epochs")
+        LOGGER.warning(f"{reason} detected (attempt {self.nan_recovery_attempts}/3), recovering from last.pt...")
+        self._model_train()  # set model to train mode before loading checkpoint to avoid inference tensor errors
+        _, ckpt = load_checkpoint(self.last)
+        ema_state = ckpt["ema"].float().state_dict()
+        if not all(torch.isfinite(v).all() for v in ema_state.values() if isinstance(v, torch.Tensor)):
+            raise RuntimeError(f"Checkpoint {self.last} is corrupted with NaN/Inf weights")
+        unwrap_model(self.model).load_state_dict(ema_state)  # Load EMA weights into model
+        self._load_checkpoint_state(ckpt)  # Load optimizer/scaler/EMA/best_fitness
+        del ckpt, ema_state
+        self.scheduler.last_epoch = epoch - 1
+        return True
     def resume_training(self, ckpt):
         """Resume YOLO training from given epoch and best fitness."""
         if ckpt is None or not self.resume:
             return
-        best_fitness = 0.0
         start_epoch = ckpt.get("epoch", -1) + 1
-        if ckpt.get("optimizer", None) is not None:
-            self.optimizer.load_state_dict(ckpt["optimizer"])  # optimizer
-            best_fitness = ckpt["best_fitness"]
-        if self.ema and ckpt.get("ema"):
-            self.ema.ema.load_state_dict(ckpt["ema"].float().state_dict())  # EMA
-            self.ema.updates = ckpt["updates"]
         assert start_epoch > 0, (
             f"{self.args.model} training to {self.epochs} epochs is finished, nothing to resume.\n"
             f"Start a new training without resuming, i.e. 'yolo train model={self.args.model}'"
@@ -776,7 +875,7 @@ class BaseTrainer:
                 f"{self.model} has been trained for {ckpt['epoch']} epochs. Fine-tuning for {self.epochs} more epochs."
             )
             self.epochs += ckpt["epoch"]  # finetune additional epochs
-        self.best_fitness = best_fitness
+        self._load_checkpoint_state(ckpt)
         self.start_epoch = start_epoch
         if start_epoch > (self.epochs - self.args.close_mosaic):
             self._close_dataloader_mosaic()
@@ -790,18 +889,16 @@ class BaseTrainer:
             self.train_loader.dataset.close_mosaic(hyp=copy(self.args))
     def build_optimizer(self, model, name="auto", lr=0.001, momentum=0.9, decay=1e-5, iterations=1e5):
-        """
-        Construct an optimizer for the given model.
+        """Construct an optimizer for the given model.
         Args:
             model (torch.nn.Module): The model for which to build an optimizer.
-            name (str, optional): The name of the optimizer to use. If 'auto', the optimizer is selected
-                based on the number of iterations. Default: 'auto'.
-            lr (float, optional): The learning rate for the optimizer. Default: 0.001.
-            momentum (float, optional): The momentum factor for the optimizer. Default: 0.9.
-            decay (float, optional): The weight decay for the optimizer. Default: 1e-5.
-            iterations (float, optional): The number of iterations, which determines the optimizer if
-                name is 'auto'. Default: 1e5.
+            name (str, optional): The name of the optimizer to use. If 'auto', the optimizer is selected based on the
+                number of iterations.
+            lr (float, optional): The learning rate for the optimizer.
+            momentum (float, optional): The momentum factor for the optimizer.
+            decay (float, optional): The weight decay for the optimizer.
+            iterations (float, optional): The number of iterations, which determines the optimizer if name is 'auto'.
         Returns:
             (torch.optim.Optimizer): The constructed optimizer.

dgenerate-ultralytics-headless 8.3.137__py3-none-any.whl → 8.3.224__py3-none-any.whl

dgenerate-ultralytics-headless 8.3.137__py3-none-any.whl → 8.3.224__py3-none-any.whl